commit efc4d39

dm-csum: A checksumming device mapper target

author Alberto Bertogli
2010-05-24 23:19:51 UTC
committer Alberto Bertogli
2010-05-24 23:19:51 UTC
parent 04b002a1f257d7905053aeff103d03370d8450fe

dm-csum: A checksumming device mapper target

Work in progress. Works, but it's limited to 512b sectors and has a race on a
double power failure (difficult to hit, but important nonetheless).

Signed-off-by: Alberto Bertogli <albertito@blitiri.com.ar>

drivers/md/Kconfig +10 -0
drivers/md/Makefile +1 -0
drivers/md/dm-csum.c +1719 -0

diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 2158377a135..96b2cfc3761 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -314,6 +314,16 @@ config DM_DELAY
 
 	If unsure, say N.
 
+config DM_CSUM
+	tristate "Checksumming target (EXPERIMENTAL)"
+	depends on BLK_DEV_DM && EXPERIMENTAL
+	select CRC_CCITT
+	---help---
+	  A target that stores checksums on writes, and verifies
+	  them on reads.
+
+	  If unsure, say N.
+
 config DM_UEVENT
 	bool "DM uevents (EXPERIMENTAL)"
 	depends on BLK_DEV_DM && EXPERIMENTAL
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index e355e7f6a53..10c7a1886b3 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -44,6 +44,7 @@ obj-$(CONFIG_DM_SNAPSHOT)	+= dm-snapshot.o
 obj-$(CONFIG_DM_MIRROR)		+= dm-mirror.o dm-log.o dm-region-hash.o
 obj-$(CONFIG_DM_LOG_USERSPACE)	+= dm-log-userspace.o
 obj-$(CONFIG_DM_ZERO)		+= dm-zero.o
+obj-$(CONFIG_DM_CSUM)		+= dm-csum.o
 
 quiet_cmd_unroll = UNROLL  $@
       cmd_unroll = $(AWK) -f$(srctree)/$(src)/unroll.awk -vN=$(UNROLL) \
diff --git a/drivers/md/dm-csum.c b/drivers/md/dm-csum.c
new file mode 100644
index 00000000000..eb580b9f843
--- /dev/null
+++ b/drivers/md/dm-csum.c
@@ -0,0 +1,1719 @@
+/*
+ * A target that stores checksums on writes, and verifies them on reads.
+ * Alberto Bertogli <albertito@blitiri.com.ar>
+ *
+ * This device-mapper module provides data integrity verification by storing
+ * checksums on writes, and verifying them on reads.
+ *
+ *
+ * On-disk format
+ * --------------
+ *
+ * It stores an 8-byte "integrity metadata" ("imd", from now on) structure for
+ * each 512-byte data sector. imd structures are clustered in groups of 62
+ * plus a small header, so they fit in a sector (referred to as an "imd sector").
+ * Every imd sector has a "brother", another adjacent imd sector, for
+ * consistency purposes (explained below). That means we devote two sectors to
+ * imd storage for every 62 data sectors.
+ *
+ * The imd structure consists of:
+ *   - 16 bit CRC (CCITT) (big endian)
+ *   - 16 bit flags (big endian)
+ *   - 32 bit tag
+ *
+ * The CRC is, obviously, the CRC of the sector this structure refers to. The
+ * flags are unused at the moment. The tag is not used by this module, but
+ * made available to the upper layers through the integrity framework.
+ *
+ * The imd sector header contains a mark of the last update, so given two
+ * brothers we can determine which one is newer.
+ *
+ *
+ * We can either use the same device to store data sectors and imd sectors, or
+ * store each in a different device. If only one device is used, the sectors
+ * are interleaved: a pair of imd sectors (M1 and its brother M2) precedes
+ * each group of 62 data sectors.
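+ *
+ * For illustration (single-device layout, with the one reserved sector
+ * defined below), the resulting disk layout looks like:
+ *
+ *   | reserved | M1 M2 | data 0..61 | M1 M2 | data 62..123 | ...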
+ *
+ *
+ * Write procedure
+ * ---------------
+ *
+ * To guarantee consistency, two imd sectors (named M1 and M2) are kept for
+ * every 62 data sectors, and the following procedure is used to update them
+ * when a write to a given sector is required:
+ *
+ *  - Read both M1 and M2.
+ *  - Find out (using information stored in their headers) which one is newer.
+ *    Let's assume M1 is newer than M2.
+ *  - Update the M2 buffer to mark it as the newer one, and store the new
+ *    data's CRC in it.
+ *  - Submit the write to M2, and then the write to the data, using a barrier
+ *    to make sure the metadata is updated _before_ the data.
+ *
+ * Accordingly, the read operations are handled as follows:
+ *
+ *  - Read the data, along with both M1 and M2.
+ *  - Find out which one is newer. Let's assume M1 is newer than M2.
+ *  - Calculate the data's CRC and compare it to the one found in M1. If they
+ *    match, the read is successful. If not, compare it to the one found in
+ *    M2. If they match, the read is successful; otherwise, fail. If the
+ *    read involves multiple sectors, some of the correct CRCs may be in M1
+ *    and some in M2.
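+ *
+ * As a rough per-sector sketch of that check (the names are illustrative;
+ * the real code lives in verify_crc() below):
+ *
+ *   crc = crc_ccitt(0, sector_data, 512);
+ *   if (crc != newer->crc && crc != older->crc)
+ *           fail the read with -EIO;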
+ *
+ * This scheme assumes that single sector writes are atomic in the presence of
+ * a crash.
+ * XXX: is this a reasonable assumption?
+ *
+ * TODO: would it be better to have M1 and M2 apart, to improve the chances of
+ * recovery in case of a failure?
+ *
+ * A simple locking structure is used to prevent simultaneous changes to the
+ * imd sectors.
+ *
+ * A last_updated counter is stored in the imd sector header and used to
+ * find out if the given sector is newer than its brother. When writing out an
+ * imd sector, we will increase its counter by 2.
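+ *
+ * For example, with counters 8 and 9, the sector holding 8 is the older one
+ * and is the one that gets rewritten next (its counter then becomes 10). The
+ * u8 wrap-around is handled in older_imd(): with counters 0 and 255, the
+ * sector holding 0 is considered the newer one.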
+ *
+ *
+ * Code overview
+ * -------------
+ *
+ * The code uses the term "nice bio" to refer to a bio whose sectors are all
+ * covered by a single imd sector. Otherwise, the bio is "evil".
+ *
+ * The bulk of the code is the read and write handling, which is only designed
+ * to work with nice bios for simplicity. There's additional
+ * direction-independent code to split evil bios into nice ones.
+ *
+ * The rest is mostly concerned with device-mapper and module stuff.
+ *
+ * The code is divided in the following sections:
+ *
+ *  - Generic and miscellaneous code, including the csum_c structure used to
+ *    track a single csum device, and the functions used to manipulate sector
+ *    numbers.
+ *  - bio-integrity.
+ *  - imd generation and verification.
+ *  - Read handling (remember: only for nice bios).
+ *  - Write handling (idem).
+ *  - Work queue.
+ *  - Evil bios handling.
+ *  - Device mapper constructor, destructor and mapper functions.
+ *  - DM target registration and module stuff.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/bio.h>
+#include <linux/slab.h>
+#include <linux/crc-ccitt.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <asm/atomic.h>
+#include <linux/device-mapper.h>
+#include <linux/workqueue.h>
+
+#define DM_MSG_PREFIX "csum"
+
+#if 0
+  #define dprint(...) printk(KERN_DEBUG __VA_ARGS__)
+  //#define ddprint(...) printk(KERN_DEBUG __VA_ARGS__)
+  #define ddprint(...)
+  #define dump_bio(name, bio) \
+	 do { \
+		dprint("%s bio: %p\n", name, bio); \
+		dprint("%s sector: %lu\n", name, bio->bi_sector); \
+		dprint("%s secn: %d\n", name, bio_sectors(bio)); \
+		dprint("%s size: %u\n", name, bio->bi_size); \
+		dprint("%s vcnt: %d\n", name, bio->bi_vcnt); \
+		dprint("%s idx: %d\n", name, bio->bi_idx); \
+		dprint("%s bvec: %p\n", name, bio->bi_io_vec); \
+		dprint("%s flags: %lx\n", name, bio->bi_flags); \
+		dprint("\n"); \
+	} while (0)
+
+  #define too_many_bytes(bio) __too_many_bytes(__func__, __LINE__, bio, 0, #bio)
+  #define too_many_bytes_w(bio) __too_many_bytes(__func__, __LINE__, bio, 1, #bio)
+
+int __too_many_bytes(const char *name, int line, const struct bio *bio,
+		int warn, const char *arg)
+{
+	int segno;
+	struct bio_vec *bvec;
+	size_t bvt = 0;
+
+	bio_for_each_segment(bvec, bio, segno) {
+		bvt += bvec->bv_len;
+	}
+
+	if (bvt != bio->bi_size) {
+		dprint("too many bytes (%s)\n", arg);
+		dprint("  at %s:%d - bvt:%zu bio:%u\n", name,
+				line, bvt, bio->bi_size);
+		//dump_bio("bio", bio);
+		//if (warn)
+		//	WARN_ON(1);
+		return 1;
+	}
+
+	return 0;
+}
+#else
+  #define dprint(...)
+  #define ddprint(...)
+  #define dump_bio(name, bio)
+  #define too_many_bytes(bio) (0)
+  #define too_many_bytes_w(bio) (0)
+#endif
+
+
+/* Context information for device mapper */
+
+typedef sector_t (map_data_sector_fn) (struct dm_target *ti, sector_t data);
+typedef sector_t (get_imd_sector_fn) (struct dm_target *ti, sector_t data);
+
+struct csum_c {
+	/* data backing device */
+	struct dm_dev *data_dev;
+	sector_t data_start;
+
+	/* imd backing device (can be the same as data_dev) */
+	struct dm_dev *imd_dev;
+	sector_t imd_start;
+	sector_t imd_len;
+
+	map_data_sector_fn *map_data_sector;
+	get_imd_sector_fn *get_imd_sector;
+};
+
+
+/* TODO: use decent locking. At the moment, this semaphore is taken before a
+ * queued bio is processed (i.e. before its reads/writes are submitted), and
+ * gets released after that processing has completed. This is needed to avoid
+ * concurrent accesses to the imd sectors. In the future, fine-grained
+ * locking will be implemented. */
+static DECLARE_MUTEX(wq_lock);
+static int lock_count = 0;
+#define wq_lock_down() \
+	do { \
+		down(&wq_lock); \
+		lock_count++; \
+		if (lock_count != 1) \
+			dprint("ARGH! down, lock_count != 1 -- at %s:%d\n", \
+				__func__, __LINE__); \
+	} while (0)
+
+#define wq_lock_up() \
+	do { \
+		lock_count--; \
+		if (lock_count != 0) \
+			dprint("ARGH! up, lock_count != 0 -- at %s:%d\n", \
+				__func__, __LINE__); \
+		up(&wq_lock); \
+	} while (0)
+
+
+
+/*
+ * Utility functions for disk data manipulation
+ */
+
+/* How many sectors we reserve at the beginning of the data device for
+ * identification and device metadata */
+#define RESERVED_INITIAL_SECTORS_D 1
+
+/* If the metadata is on a different device, how many sectors we reserve at
+ * the beginning for identification and device metadata */
+#define RESERVED_INITIAL_SECTORS_M 1
+
+/* How many data sectors for each metadata sector. See the initial comment for
+ * a rationale on the value. */
+#define SECTORS_PER_IMD 62
+
+
+/* Return how many sectors are needed to store the imd information for the
+ * given amount of data sectors */
+static sector_t imd_sectors_needed(sector_t sectors)
+{
+	return dm_sector_div_up(sectors, SECTORS_PER_IMD) * 2;
+}
+
+/* Given a dm device sector, return the corresponding sector on the data
+ * device. We have one function to use when data and metadata are stored
+ * in different devices, and one to use when they're in the same device. Which
+ * one to use will be determined via function pointers in the context
+ * structure.  */
+static sector_t map_data_sector_diff(struct dm_target *ti, sector_t data)
+{
+	struct csum_c *cc = ti->private;
+
+	/* When stored in different devices, data is stored directly at the
+	 * given offset */
+	return cc->data_start + RESERVED_INITIAL_SECTORS_D +
+		(data - ti->begin);
+}
+
+static sector_t map_data_sector_same(struct dm_target *ti, sector_t data)
+{
+	struct csum_c *cc = ti->private;
+
+	/* When stored in the same device, interleaving makes things a little
+	 * more complicated. The offset will be the same as if there was no
+	 * interleaving, plus the number of imd sectors.
+	 * We call imd_sectors_needed with (data - ti->begin + 1) because it
+	 * receives a number of sectors, so 0 means no sectors and not an
+	 * offset. */
+	return cc->data_start + RESERVED_INITIAL_SECTORS_D +
+		(data - ti->begin) + imd_sectors_needed(data - ti->begin + 1);
+}
+
+/* Return the imd sector that holds the tuple for the given data sector. Its
+ * brother imd sector will be the result + 1, as they're always adjacent. */
+static sector_t get_imd_sector_diff(struct dm_target *ti, sector_t data)
+{
+	return RESERVED_INITIAL_SECTORS_M +
+		imd_sectors_needed(data - ti->begin + 1);
+}
+
+static sector_t get_imd_sector_same(struct dm_target *ti, sector_t data)
+{
+	sector_t isn = imd_sectors_needed(data - ti->begin + 1);
+
+	return RESERVED_INITIAL_SECTORS_D + SECTORS_PER_IMD * ((isn - 2) / 2)
+		+ (isn - 2);
+}
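+
+/* Worked example for the same-device layout (assuming ti->begin == 0 and
+ * cc->data_start == 0): relative data sector 70 belongs to the second imd
+ * group, so imd_sectors_needed(71) == 4; the data ends up mapped to sector
+ * 1 + 70 + 4 = 75, and its imd pair lives at sectors 65 and 66. */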
+
+
+/*
+ * Integrity metadata manipulation
+ */
+
+/* Each sector's integrity metadata. We only use crc at the moment. */
+struct imd_tuple {
+	__be16 crc;
+	__be16 flags;
+	__be32 tag;
+} __attribute__ ((packed));
+
+/* imd sector header, holds internal metadata */
+struct imd_sector_header {
+	/* 8 bits are enough for last_updated */
+	u8 last_updated;
+	u8 unused1;
+	__be16 crc;
+	__be32 unused3;
+} __attribute__ ((packed));
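+
+/* Size check: sizeof(struct imd_sector_header) + 62 * sizeof(struct
+ * imd_tuple) == 8 + 62 * 8 == 504, so a header plus 62 tuples fits in a
+ * 512-byte imd sector with 8 bytes to spare. */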
+
+/* Return the older of m1 and m2, or NULL if it was impossible to determine */
+static struct imd_sector_header *older_imd(struct imd_sector_header *m1,
+		struct imd_sector_header *m2)
+{
+	int l1, l2;
+
+	/* we get the values into something signed so we can subtract them */
+	l1 = m1->last_updated;
+	l2 = m2->last_updated;
+
+	if (abs(l1 - l2) > 1) {
+		//dprint("wrap-around: %d %d %u\n", l1, l2, abs(l1 - l2));
+		if (l1 == 0) {
+			return m2;
+		} else if (l2 == 0) {
+			return m1;
+		} else {
+			dprint("can't find older - l1:%d l2:%d\n", l1, l2);
+			return NULL;
+		}
+	} else {
+		if (l1 > l2) {
+			return m2;
+		} else if (l1 < l2) {
+			return m1;
+		} else {
+			dprint("can't find older - l1:%d l2:%d\n", l1, l2);
+			return NULL;
+		}
+	}
+}
+
+/* Return a bio that reads the given imd sectors (both M1 and M2), setting
+ * the bi_bdev to bdev, bi_end_io callback to cb, and bi_private to private.
+ * The returned bio will have a single page allocated, that must be freed. */
+static struct bio *prepare_imd_read(struct block_device *bdev, sector_t sector,
+		bio_end_io_t *cb, void *private)
+{
+	struct page *page = NULL;
+	struct bio *bio = NULL;
+
+	page = alloc_page(GFP_NOIO);
+	if (page == NULL)
+		goto error;
+
+	bio = bio_alloc(GFP_NOIO, 1);
+	if (bio == NULL)
+		goto error;
+
+	bio->bi_bdev = bdev;
+	bio->bi_sector = sector;
+	bio->bi_rw |= READ;
+	bio->bi_end_io = cb;
+	bio->bi_private = private;
+	if (bio_add_page(bio, page, 1024, 0) != 1024)
+		goto error;
+
+	too_many_bytes(bio);
+	return bio;
+
+error:
+	if (page)
+		__free_page(page);
+	if (bio) {
+		bio->bi_end_io = NULL;
+		bio_put(bio);
+	}
+
+	return NULL;
+}
+
+/* Calculate the CRCs for the sectors in the given bio. It assumes there is
+ * enough space in crc for all the sectors (i.e. crc can hold at least
+ * bio_sectors(bio) 16-bit integers). */
+static void crc_sectors_from_bio(const struct bio *bio, u16 *crc)
+{
+	int segno;
+	struct bio_vec *bvec;
+	unsigned char *src_data, *data;
+	unsigned long flags;
+	unsigned int sec_cnt;
+	size_t len;
+	u16 current_crc;
+
+	/* bytes needed to complete the current CRC */
+	unsigned int bytes_needed;
+
+	/* bytes left in the current bvec */
+	unsigned int left_in_bvec;
+
+	u16 *tmp_crc = crc;
+	unsigned long tmp_tot = 0;
+
+	sec_cnt = 0;
+
+	/* XXX: is there really no other way than using bvec_kmap_irq()? */
+	current_crc = 0;
+	bytes_needed = 512;
+	bio_for_each_segment(bvec, bio, segno) {
+		src_data = bvec_kmap_irq(bvec, &flags);
+		data = src_data;
+		left_in_bvec = bvec->bv_len;
+
+start:
+		len = min(left_in_bvec, bytes_needed);
+		current_crc = crc_ccitt(current_crc, data, len);
+
+		bytes_needed -= len;
+		left_in_bvec -= len;
+		tmp_tot += len;
+
+		if (unlikely(bytes_needed)) {
+			/* we need to go through the next bvec */
+			dprint("next bvec\n");
+			bvec_kunmap_irq(data, &flags);
+			continue;
+		}
+
+		*crc = current_crc;
+		crc++;
+		sec_cnt++;
+		current_crc = 0;
+		bytes_needed = 512;
+
+		if (left_in_bvec && sec_cnt < bio_sectors(bio)) {
+			/* this bvec still has some data left; if we still
+			 * have crcs to calculate, use it for the next one */
+			//dprint("remaining data: t:%u %u\n", tmp_cnt, left_in_bvec);
+			data += len;
+			goto start;
+		}
+
+		bvec_kunmap_irq(src_data, &flags);
+
+		if (sec_cnt >= bio_sectors(bio)) {
+			if (left_in_bvec) {
+				dprint("XXX early break\n");
+				goto TMP1;
+			}
+			break;
+		}
+
+		if (tmp_crc >= (crc + bio_sectors(bio))) {
+			dprint("XXX crc se fue de mambo\n");
+			goto TMP1;
+		}
+	}
+
+	if (sec_cnt != bio_sectors(bio)) {
+		dprint("XXX BUG\n");
+TMP1:
+		dprint("sec cnt: %u\n", sec_cnt);
+		dprint("segno: %u\n", segno);
+		dprint("tmp_tot: %lu\n", tmp_tot);
+		dprint("bytes_needed: %u\n", bytes_needed);
+		dprint("left_in_bvec: %u\n", left_in_bvec);
+		dprint("len: %zu\n", len);
+		dprint("bvec: %p\n", bvec);
+		dump_bio("bio", bio);
+	}
+}
+
+
+/*
+ * bio-integrity extensions
+ */
+
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+
+static void imd_generate(struct blk_integrity_exchg *bix)
+{
+	unsigned int i;
+	void *buf = bix->data_buf;
+	struct imd_tuple *imd = bix->prot_buf;
+
+	/* dprint("imd_gen(): s:%llu ss:%u ds:%u\n",
+			(unsigned long long) bix->sector, bix->sector_size,
+			bix->data_size); */
+
+	for (i = 0; i < bix->data_size; i += bix->sector_size) {
+		imd->crc = crc_ccitt(0, buf, bix->sector_size);
+		imd->tag = 0;
+		imd->flags = 0;
+
+		buf += bix->sector_size;
+		imd++;
+	}
+}
+
+static int imd_verify(struct blk_integrity_exchg *bix)
+{
+	unsigned int i;
+	void *buf = bix->data_buf;
+	struct imd_tuple *imd = bix->prot_buf;
+	u16 crc;
+	sector_t sector = bix->sector;
+
+	/* dprint("imd_vfy(): s:%llu ss:%u ds:%u\n", (unsigned long long) sector,
+			bix->sector_size, bix->data_size); */
+
+	for (i = 0; i < bix->data_size; i += bix->sector_size) {
+		crc = crc_ccitt(0, buf, bix->sector_size);
+		if (crc != imd->crc) {
+			printk(KERN_ERR "%s: checksum error on sector %llu"
+					" - disk:%04x imd:%04x\n",
+					bix->disk_name,
+					(unsigned long long) sector, crc,
+					imd->crc);
+			dprint("verify: d:%p p:%p imd:%p\n", bix->data_buf,
+					bix->prot_buf, imd);
+			return -EIO;
+		}
+
+		buf += bix->sector_size;
+		imd++;
+		sector++;
+	}
+
+	return 0;
+}
+
+static void imd_get_tag(void *prot, void *tag_buf, unsigned int sectors)
+{
+	unsigned int i;
+	struct imd_tuple *imd = prot;
+	u16 *tag = tag_buf;
+
+	for (i = 0; i < sectors; i++) {
+		*tag = imd->tag;
+		tag++;
+		imd++;
+	}
+}
+
+static void imd_set_tag(void *prot, void *tag_buf, unsigned int sectors)
+{
+	unsigned int i;
+	struct imd_tuple *imd = prot;
+	u16 *tag = tag_buf;
+
+	for (i = 0; i < sectors; i++) {
+		imd->tag = *tag;
+		tag++;
+		imd++;
+	}
+}
+
+static struct blk_integrity integrity_profile = {
+	.name           = "LINUX-DMCSUM-V0-CCITT",
+	.generate_fn    = imd_generate,
+	.verify_fn      = imd_verify,
+	.get_tag_fn     = imd_get_tag,
+	.set_tag_fn     = imd_set_tag,
+	.tuple_size     = sizeof(struct imd_tuple),
+	.tag_size       = sizeof(u32),
+};
+
+static int bi_register(struct dm_target *ti)
+{
+	struct mapped_device *md;
+	struct gendisk *disk;
+
+	md = dm_table_get_md(ti->table);
+	disk = dm_disk(md);
+
+	return blk_integrity_register(disk, &integrity_profile);
+}
+
+static void bi_unregister(struct dm_target *ti)
+{
+	struct mapped_device *md;
+	struct gendisk *disk;
+
+	md = dm_table_get_md(ti->table);
+	disk = dm_disk(md);
+
+	blk_integrity_unregister(disk);
+}
+
+/* Copy the given buffer into the given bip */
+static void copy_to_bip(struct bio_integrity_payload *bip,
+		const unsigned char *buf, unsigned int size)
+{
+	unsigned int i;
+	unsigned int advance;
+	unsigned long flags;
+	struct bio_vec *bvec;
+
+	bip_for_each_vec(bvec, bip, i) {
+		unsigned char *data = bvec_kmap_irq(bvec, &flags);
+
+		advance = min(bvec->bv_len, size);
+
+		memcpy(data, buf, advance);
+
+		buf += advance;
+		size -= advance;
+
+		bvec_kunmap_irq(data, &flags);
+
+		if (size == 0)
+			break;
+	}
+}
+
+/* Set bio's integrity information taking it from imd_bio */
+static void set_bi_from_imd(struct bio *bio, struct bio *imd_bio)
+{
+	unsigned long flags;
+	struct imd_tuple *t;
+	struct bio_integrity_payload *bip = bio->bi_integrity;
+	unsigned char *imd_buf;
+
+	imd_buf = bvec_kmap_irq(bio_iovec(imd_bio), &flags);
+
+	t = (struct imd_tuple *) (imd_buf + sizeof(struct imd_sector_header));
+	t += bio->bi_sector % SECTORS_PER_IMD;
+
+	copy_to_bip(bip, (unsigned char *) t,
+			bio_sectors(bio) * sizeof(struct imd_tuple));
+
+	bvec_kunmap_irq(imd_buf, &flags);
+}
+
+/* Updates bio's integrity information at the given position, taking it from
+ * the given imd tuple */
+static void update_bi_info(struct bio *bio, unsigned int pos,
+		struct imd_tuple *tuple)
+{
+	unsigned long flags;
+	unsigned char *bip_buf;
+	struct imd_tuple *t;
+
+	BUG_ON(bio_integrity(bio) == 0);
+
+	bip_buf = bvec_kmap_irq(bip_vec(bio->bi_integrity), &flags);
+
+	BUG_ON(bip_buf == NULL);
+
+	t = (struct imd_tuple *) bip_buf;
+	t += pos;
+	t->crc = tuple->crc;
+	t->tag = tuple->tag;
+	t->flags = tuple->flags;
+
+	bvec_kunmap_irq(bip_buf, &flags);
+}
+#else /* BLK_DEV_INTEGRITY */
+
+static int bi_register(struct dm_target *ti)
+{
+	return 0;
+}
+
+static void bi_unregister(struct dm_target *ti)
+{
+	return;
+}
+
+static void set_bi_from_imd(struct bio *bio, struct bio *imd_bio)
+{
+	return;
+}
+
+static void update_bi_info(struct bio *bio, unsigned int pos,
+		struct imd_tuple *tuple)
+{
+	return;
+}
+#endif /* BLK_DEV_INTEGRITY */
+
+
+/*
+ * imd generation and verification
+ */
+
+/* Update the imd information for the given data bio. The function deals with
+ * the imd bio directly, which holds one page with both imd sectors (M1 and
+ * M2), as returned from prepare_imd_read(), and assumes it's been read from
+ * disk (so it will only update what's needed).
+ *
+ * Modifies imd_bio so it only writes the sector needed.
+ *
+ * Returns:
+ *  - 0 on success
+ *  - -1 if there was a memory error
+ *  - -2 if there was a consistency error (cannot find older)
+ */
+static int update_imd_bio(const struct bio *data_bio, struct bio *imd_bio)
+{
+	int i;
+	u16 *crc;
+	unsigned long flags;
+	unsigned char *imd_buf;
+	struct imd_sector_header *m1, *m2, *older;
+	struct imd_tuple *t;
+
+	if (bio_sectors(imd_bio) != 2) {
+		dump_bio("imd_bio", imd_bio);
+		BUG();
+	}
+
+	crc = kmalloc(sizeof(u16) * bio_sectors(data_bio) + 1, GFP_NOIO);
+	if (crc == NULL)
+		return -1;
+
+	crc_sectors_from_bio(data_bio, crc);
+
+	imd_buf = bvec_kmap_irq(bio_iovec(imd_bio), &flags);
+
+	m1 = (struct imd_sector_header *) imd_buf;
+	m2 = (struct imd_sector_header *) (imd_buf + 512);
+
+	older = older_imd(m1, m2);
+	if (older == NULL) {
+		bvec_kunmap_irq(imd_buf, &flags);
+		kfree(crc);
+		return -2;
+	}
+
+	t = (struct imd_tuple *) (older + 1);
+	t = t + data_bio->bi_sector % SECTORS_PER_IMD;
+
+	for (i = 0; i < bio_sectors(data_bio); i++) {
+		t->crc = *(crc + i);
+		t++;
+	}
+
+	older->last_updated += 2;
+	older->crc = crc_ccitt(0, (unsigned char *) (older + 1),
+			512 - sizeof(struct imd_sector_header));
+
+	bvec_kunmap_irq(imd_buf, &flags);
+
+	kfree(crc);
+
+	imd_bio->bi_size = 512;
+	bio_iovec(imd_bio)->bv_len = 512;
+	if (older == m2) {
+		imd_bio->bi_sector++;
+		bio_iovec(imd_bio)->bv_offset = 512;
+	}
+
+	return 0;
+}
+
+/* Verify that the CRCs from data_bio match the ones stored in imd_bio (which
+ * contains both M1 and M2), and update data_bio integrity information (if
+ * there is any) */
+/* TODO: choose a better name */
+static int verify_crc(struct bio *data_bio, struct bio *imd_bio)
+{
+	int i, r;
+	u16 *crc;
+	unsigned long flags;
+	unsigned char *imd_buf;
+	struct imd_sector_header *m1, *m2, *older, *newer;
+	struct imd_tuple *nt, *ot;
+
+	crc = kmalloc(sizeof(u16) * bio_sectors(data_bio) + 1, GFP_NOIO);
+	if (crc == NULL)
+		return -ENOMEM;
+
+	crc_sectors_from_bio(data_bio, crc);
+
+	imd_buf = bvec_kmap_irq(bio_iovec(imd_bio), &flags);
+
+	m1 = (struct imd_sector_header *) imd_buf;
+	m2 = (struct imd_sector_header *) (imd_buf + 512);
+
+	older = older_imd(m1, m2);
+	if (older == NULL) {
+		printk(KERN_WARNING "dm-csum: couldn't find older\n");
+		r = -EIO;
+		goto exit;
+	}
+
+	newer = m1;
+	if (older == m1)
+		newer = m2;
+
+	nt = (struct imd_tuple *) (newer + 1);
+	nt += data_bio->bi_sector % SECTORS_PER_IMD;
+	ot = (struct imd_tuple *) (older + 1);
+	ot += data_bio->bi_sector % SECTORS_PER_IMD;
+
+	r = 0;
+
+	BUG_ON(bio_sectors(data_bio) > SECTORS_PER_IMD);
+
+	for (i = 0; i < bio_sectors(data_bio); i++) {
+		if (nt->crc == *(crc + i)) {
+			update_bi_info(data_bio, i, nt);
+		} else if (ot->crc == *(crc + i)){
+			update_bi_info(data_bio, i, ot);
+
+			/* dprint("no match from new\n");
+			dprint(" new: %d %04x\n", newer->last_updated,
+					nt->crc);
+			dprint(" old: %d %04x\n", older->last_updated,
+					ot->crc);
+			dprint(" real: %04x\n", *(crc + i)); */
+		} else {
+			printk(KERN_WARNING
+				"dm-csum: CRC error at sector %lld\n",
+				(unsigned long long)
+					(data_bio->bi_sector + i));
+			dprint("CRC: %llu o:%x n:%x r:%x\n",
+					(unsigned long long)
+						(data_bio->bi_sector + i),
+					ot->crc, nt->crc, *(crc + i));
+			r = -EIO;
+			break;
+		}
+		nt++;
+		ot++;
+	}
+
+	/* TODO: validate the imd sector CRC */
+
+exit:
+	bvec_kunmap_irq(imd_buf, &flags);
+
+	kfree(crc);
+
+	return r;
+}
+
+
+/* Work queue where the read/write processing code is run.
+ * TODO: Unify with the submission workqueue once we have decent locking. */
+static struct workqueue_struct *io_wq;
+
+/*
+ * READ handling (nice bios only)
+ *
+ * Reads are handled by reading the requested data, and the imd sector
+ * associated with it. When both requests are completed, the data checksum is
+ * calculated and compared against what's in the imd sector.
+ */
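+
+/* Rough flow (illustrative):
+ *
+ *   read_nice_bio()              clone/submit the data and imd reads
+ *     -> queue_read_complete()   bi_end_io for both bios, defers to io_wq
+ *     -> read_complete()         verify CRCs, end the original bio
+ */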
+
+/* Used to track pending reads */
+struct pending_read {
+	struct dm_target *ti;
+	struct csum_c *cc;
+	struct bio *orig_bio;
+
+	struct bio *data_bio;
+	struct bio *imd_bio;
+
+	bool error;
+
+	/* number of operations pending */
+	atomic_t nr_pending;
+
+	struct work_struct work;
+};
+
+static void read_nice_bio(struct dm_target *ti, struct bio *bio);
+static struct bio *prepare_data_read(struct bio *orig_bio,
+		struct block_device *bdev, sector_t sector, bio_end_io_t *cb,
+		void *private);
+static void queue_read_complete(struct bio *bio, int error);
+static void read_complete(struct work_struct *work);
+
+/* Read a nice bio */
+static void read_nice_bio(struct dm_target *ti, struct bio *bio)
+{
+	struct csum_c *cc = ti->private;
+	struct pending_read *pr;
+
+	pr = kmalloc(sizeof(*pr), GFP_NOIO);
+	if (pr == NULL)
+		goto error;
+
+	pr->ti = ti;
+	pr->cc = cc;
+	pr->orig_bio = bio;
+	pr->error = false;
+
+	pr->data_bio = prepare_data_read(pr->orig_bio, cc->data_dev->bdev,
+			cc->map_data_sector(ti, pr->orig_bio->bi_sector),
+			queue_read_complete, pr);
+	if (pr->data_bio == NULL)
+		goto error;
+
+	pr->imd_bio = prepare_imd_read(cc->imd_dev->bdev,
+			cc->get_imd_sector(ti, pr->orig_bio->bi_sector),
+			queue_read_complete, pr);
+	if (pr->imd_bio == NULL)
+		goto error;
+
+	too_many_bytes(pr->imd_bio);
+	too_many_bytes(pr->data_bio);
+
+	atomic_set(&pr->nr_pending, 2);
+
+	ddprint("read f 0:%p\n", pr->orig_bio);
+	submit_bio(pr->data_bio->bi_rw, pr->data_bio);
+	if (bio_flagged(pr->data_bio, BIO_EOPNOTSUPP))
+		BUG();
+
+	submit_bio(pr->imd_bio->bi_rw, pr->imd_bio);
+	if (bio_flagged(pr->imd_bio, BIO_EOPNOTSUPP))
+		BUG();
+	return;
+
+error:
+	//up(&wq_lock);
+	wq_lock_up();
+	bio_endio(bio, -ENOMEM);
+	return;
+}
+
+/* Prepare a new bio to read the data requested in orig_bio */
+static struct bio *prepare_data_read(struct bio *orig_bio,
+		struct block_device *bdev, sector_t sector, bio_end_io_t *cb,
+		void *private)
+{
+	struct bio *bio;
+
+	/* clone the bio so we don't override the original's bi_private and
+	 * bi_end_io */
+	bio = bio_clone(orig_bio, GFP_NOIO);
+	if (bio == NULL)
+		return NULL;
+
+	bio->bi_bdev = bdev;
+	bio->bi_sector = sector;
+	bio->bi_end_io = cb;
+	bio->bi_private = private;
+
+	return bio;
+}
+
+static void queue_read_complete(struct bio *bio, int error)
+{
+	struct pending_read *pr = bio->bi_private;
+
+	if (error || !bio_flagged(bio, BIO_UPTODATE)) {
+		dprint("q_read_c error\n");
+		dump_bio("bio", bio);
+		pr->error = true;
+	}
+
+	too_many_bytes(bio);
+
+	if (!atomic_dec_and_test(&pr->nr_pending))
+		return;
+
+	/* defer the completion so it's not run in interrupt context */
+	INIT_WORK(&(pr->work), read_complete);
+	queue_work(io_wq, &(pr->work));
+}
+
+static void read_complete(struct work_struct *work)
+{
+	int result = -EIO;
+	struct pending_read *pr;
+
+	pr = container_of(work, struct pending_read, work);
+
+	/* TODO: use decent locking */
+	//dprint("read f pre-up:%p\n", pr->orig_bio);
+	//up(&wq_lock);
+	//dprint("read f post-up:%p\n", pr->orig_bio);
+	wq_lock_up();
+
+	if (pr->error)
+		goto exit;
+
+	too_many_bytes(pr->imd_bio);
+	too_many_bytes(pr->orig_bio);
+	too_many_bytes(pr->data_bio);
+
+	/* it not only verifies the CRC, but also updates orig_bio's integrity
+	 * information
+	 * TODO: add an option for those who do not want the bio to fail on
+	 * CRC errors */
+	/* XXX: should we update bip on failed bios? */
+	result = verify_crc(pr->orig_bio, pr->imd_bio);
+
+exit:
+	if (pr->error)
+		result = -EIO;
+
+	/* free the page allocated in prepare_imd_read() */
+	__free_page(pr->imd_bio->bi_io_vec->bv_page);
+
+	/* XXX: is the ordering between this and bio_put(pr->data_bio)
+	 * important? I think not, but confirmation wouldn't hurt */
+	bio_endio(pr->orig_bio, result);
+
+	bio_put(pr->data_bio);
+	bio_put(pr->imd_bio);
+
+	ddprint("read f done:%p\n", pr->orig_bio);
+	kfree(pr);
+}
+
+
+/*
+ * WRITE handling (nice bios only)
+ */
+
+/* Used to track pending writes */
+struct pending_write {
+	struct dm_target *ti;
+	struct csum_c *cc;
+
+	struct bio *orig_bio;
+	struct bio *imd_bio;
+	struct bio *data_bio;
+
+	bool error;
+	atomic_t nr_pending;
+
+	struct work_struct work1;
+	struct work_struct work2;
+};
+
+/* Writes begin with write_nice_bio(), which submits the imd read. When that
+ * bio is done, write_stage1() gets called, which updates the imd data and
+ * then submits both the imd write and the data write. When those are
+ * completed, write_stage2() gets called, which finishes up and ends the
+ * original bio. To avoid running the completion code in interrupt context,
+ * the stage functions run through a workqueue. */
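+
+/* Rough flow (illustrative):
+ *
+ *   write_nice_bio()             submit the imd read
+ *     -> queue_write_stage1()    imd read completed, defer to io_wq
+ *     -> write_stage1()          update imd buffer, submit imd + data writes
+ *     -> queue_write_stage2()    both writes completed, defer to io_wq
+ *     -> write_stage2()          update integrity info, end the original bio
+ */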
+static void write_nice_bio(struct dm_target *ti, struct bio *bio);
+static void queue_write_stage1(struct bio *bio, int error);
+static void write_stage1(struct work_struct *work);
+static void queue_write_stage2(struct bio *bio, int error);
+static void write_stage2(struct work_struct *work);
+
+/* Write a nice bio */
+static void write_nice_bio(struct dm_target *ti, struct bio *bio)
+{
+	struct csum_c *cc = ti->private;
+	struct pending_write *pw;
+
+	pw = kmalloc(sizeof(*pw), GFP_NOIO);
+	if (pw == NULL) {
+		bio_endio(bio, -ENOMEM);
+		return;
+	}
+
+	pw->ti = ti;
+	pw->cc = cc;
+	pw->orig_bio = bio;
+	pw->data_bio = NULL;
+	pw->error = false;
+	atomic_set(&pw->nr_pending, 0);
+
+	pw->imd_bio = prepare_imd_read(cc->imd_dev->bdev,
+			cc->get_imd_sector(ti, pw->orig_bio->bi_sector),
+			queue_write_stage1, pw);
+	if (pw->imd_bio == NULL) {
+		kfree(pw);
+		bio_endio(bio, -ENOMEM);
+		return;
+	}
+
+	if (bio_sectors(pw->imd_bio) != 2) {
+		dump_bio("imd_bio", pw->imd_bio);
+		BUG();
+	}
+
+	ddprint("write 0:%p\n", bio);
+	submit_bio(pw->imd_bio->bi_rw, pw->imd_bio);
+	if (bio_flagged(pw->imd_bio, BIO_EOPNOTSUPP))
+		BUG();
+}
+
+static void queue_write_stage1(struct bio *imd_bio, int error)
+{
+	struct pending_write *pw = imd_bio->bi_private;
+
+	if (error || !bio_flagged(imd_bio, BIO_UPTODATE)) {
+		dprint("q_write_st1 error\n");
+		dump_bio("imd_bio", imd_bio);
+		pw->error = true;
+	} else if (bio_sectors(imd_bio) != 2) {
+		dump_bio("imd_bio", imd_bio);
+		BUG();
+	}
+
+	INIT_WORK(&(pw->work1), write_stage1);
+	queue_work(io_wq, &(pw->work1));
+}
+
+static void write_stage1(struct work_struct *work)
+{
+	int r;
+	int err = -EIO;
+	struct bio *data_bio;
+	struct pending_write *pw;
+
+	pw = container_of(work, struct pending_write, work1);
+
+	//dprint("write stage 1 %llu\n", (unsigned long long) pw->orig_bio->bi_sector);
+
+	if (pw->error)
+		goto error;
+
+	r = update_imd_bio(pw->orig_bio, pw->imd_bio);
+	if (r == -1) {
+		err = -ENOMEM;
+		goto error;
+	} else if (r == -2) {
+		printk(KERN_WARNING "dm-csum: consistency error updating"
+				" imd sector (cannot find older)\n");
+		err = -EIO;
+		goto error;
+	}
+
+	/* prepare bio for reuse */
+	pw->imd_bio->bi_rw |= WRITE;
+	pw->imd_bio->bi_end_io = queue_write_stage2;
+
+	data_bio = bio_clone(pw->orig_bio, GFP_NOIO);
+	if (data_bio == NULL) {
+		err = -ENOMEM;
+		goto error;
+	}
+
+	data_bio->bi_private = pw;
+	data_bio->bi_end_io = queue_write_stage2;
+	data_bio->bi_bdev = pw->cc->data_dev->bdev;
+	data_bio->bi_sector = pw->cc->map_data_sector(pw->ti,
+			pw->orig_bio->bi_sector);
+
+	/* data bio takes a barrier, so we know the imd write will have
+	 * completed before it hits the disk */
+	/* TODO: the underlying device might not support barriers
+	 * TODO: when data and imd are on separate devices, the barrier trick
+	 * is no longer useful */
+	//data_bio->bi_rw |= (1 << BIO_RW_BARRIER);
+
+	pw->data_bio = data_bio;
+
+
+	too_many_bytes(data_bio);
+	too_many_bytes(pw->imd_bio);
+
+	/* submit both bios at the end to simplify error handling; remember
+	 * the order is very important because of the barrier */
+	atomic_set(&pw->nr_pending, 2);
+	submit_bio(pw->imd_bio->bi_rw, pw->imd_bio);
+	if (bio_flagged(pw->imd_bio, BIO_EOPNOTSUPP))
+		BUG();
+	submit_bio(data_bio->bi_rw, data_bio);
+	if (bio_flagged(data_bio, BIO_EOPNOTSUPP))
+		BUG();
+	return;
+
+error:
+	//up(&wq_lock);
+	wq_lock_up();
+
+	bio_endio(pw->orig_bio, err);
+	__free_page(pw->imd_bio->bi_io_vec->bv_page);
+	bio_put(pw->imd_bio);
+	kfree(pw);
+	return;
+}
+
+static void queue_write_stage2(struct bio *bio, int error)
+{
+	struct pending_write *pw = bio->bi_private;
+
+	if (error || !bio_flagged(bio, BIO_UPTODATE)) {
+		dprint("q_write_st2 error\n");
+		dump_bio("bio", bio);
+		pw->error = true;
+	}
+
+	if (!atomic_dec_and_test(&pw->nr_pending))
+		return;
+
+	INIT_WORK(&(pw->work2), write_stage2);
+	queue_work(io_wq, &(pw->work2));
+}
+
+static void write_stage2(struct work_struct *work)
+{
+	struct pending_write *pw;
+
+	pw = container_of(work, struct pending_write, work2);
+
+	/* TODO: use decent locking */
+	//dprint("write f pre-up:%p\n", pw->orig_bio);
+	//up(&wq_lock);
+	//dprint("write f post-up:%p\n", pw->orig_bio);
+	wq_lock_up();
+
+	too_many_bytes_w(pw->imd_bio);
+
+	if (bio_integrity(pw->orig_bio))
+		set_bi_from_imd(pw->orig_bio, pw->imd_bio);
+
+	/* free the imd_bio resources */
+	__free_page(pw->imd_bio->bi_io_vec->bv_page);
+	bio_put(pw->imd_bio);
+
+	bio_put(pw->data_bio);
+
+	/* XXX: like read_complete(): is the order between this and
+	 * bio_put(pw->data_bio) important? */
+	bio_endio(pw->orig_bio, pw->error ? -EIO : 0);
+
+	ddprint("write f done:%p\n", pw->orig_bio);
+	kfree(pw);
+}
+
+
+/*
+ * Work queue to process bios.
+ *
+ * It is created in dm_csum_init(). It handles the bios queued by
+ * queue_nice_bio() and starts the read/write work for them.
+ *
+ * TODO: handle more than one pending bio, and dispatch more than one as long
+ * as they don't overlap. Maybe one workqueue per ctx? Or maybe delay the
+ * creation of the workqueue until the first ctx?
+ */
+
+static struct workqueue_struct *submit_wq;
+
+struct pending_work {
+	struct dm_target *ti;
+	struct bio *bio;
+	struct work_struct w;
+};
+
+static void process_nice_bio(struct work_struct *work)
+{
+	struct pending_work *pending;
+	struct dm_target *ti;
+	struct bio *bio;
+
+	pending = container_of(work, struct pending_work, w);
+
+	ti = pending->ti;
+	bio = pending->bio;
+
+	/* TODO: use decent locking
+	 * At the moment, this lock is up()ed at the final stage of the
+	 * read/write code, when the bio has been processed */
+	//dprint("process pre-lock b:%p\n", bio);
+	//down(&wq_lock);
+	//dprint("process post-lock b:%p\n", bio);
+	wq_lock_down();
+
+	switch (bio_data_dir(bio)) {
+	case READ:
+		read_nice_bio(ti, bio);
+		break;
+	case WRITE:
+		write_nice_bio(ti, bio);
+		break;
+	default:
+		dprint("Unknown direction\n");
+		BUG();
+		break;
+	}
+
+	ddprint("process done b:%p\n", bio);
+	kfree(pending);
+}
+
+static noinline int queue_nice_bio(struct dm_target *ti, struct bio *bio)
+{
+	struct pending_work *pending;
+
+	too_many_bytes_w(bio);
+
+	pending = kmalloc(sizeof(struct pending_work), GFP_NOIO);
+	if (pending == NULL)
+		return -ENOMEM;
+
+	pending->ti = ti;
+	pending->bio = bio;
+
+	INIT_WORK(&(pending->w), process_nice_bio);
+
+	ddprint("queueing work w:%p b:%p\n", &(pending->w), bio);
+	queue_work(submit_wq, &(pending->w));
+	ddprint("work queued w:%p b:%p\n", &(pending->w), bio);
+
+	return 0;
+}
+
+
+/*
+ * Evil bio handling
+ *
+ * Evil bios are split into nice ones in a direction-independent way, and then
+ * go through the direction-dependent code (which is prepared to deal with
+ * nice bios only, because it makes the code much simpler).
+ *
+ * When all the nice bios are completed, we end the original, evil bio.
+ */
+
+/* Determines if a bio is evil */
+static int bio_is_evil(struct dm_target *ti, struct bio *bio)
+{
+	sector_t mapped_first, mapped_last;
+
+	/* To detect whether a bio is evil, we check if the mapped range spans
+	 * more sectors than the bio itself, which happens when an imd sector
+	 * pair falls inside it */
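+	/* For example (assuming ti->begin == 0 and data_start == 0), a
+	 * 10-sector bio starting at relative sector 58 maps its first sector
+	 * to 61 and its last one (67) to 72; 72 - 61 == 11 != 9, so the bio
+	 * crosses an imd pair and is evil. */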
+	mapped_first = map_data_sector_same(ti, bio->bi_sector);
+	mapped_last = map_data_sector_same(ti,
+			bio->bi_sector + bio_sectors(bio) - 1);
+
+	return (mapped_last - mapped_first) != (bio_sectors(bio) - 1);
+}
+
+
+/* Used to track pending evil bios */
+struct pending_evil_bio {
+	struct csum_c *cc;
+
+	/* original evil bio */
+	struct bio *orig_bio;
+
+	/* number of bios pending */
+	atomic_t nr_pending;
+
+	/* were there any errors? */
+	bool error;
+
+};
+
+static int handle_evil_bio(struct dm_target *ti, struct bio *bio);
+static struct bio *prepare_nice_bio(struct pending_evil_bio *peb,
+		struct bio *bio, sector_t begin, sector_t size);
+static void evil_bio_complete(struct bio *bio, int error);
+
+/* Handle an evil bio, by splitting it into nice ones and processing them */
+static noinline int handle_evil_bio(struct dm_target *ti, struct bio *bio)
+{
+	int i, r;
+	sector_t first, last, prelude, postlude;
+	unsigned int nmiddle, submitted_bios, expected_bios;
+	struct pending_evil_bio *peb;
+	struct bio *new;
+
+	dprint("evil bio! s:%lu n:%u l:%u d:%lu \ti:%d o:%u\t\tp:%p\n",
+			bio->bi_sector, bio_sectors(bio), bio->bi_size,
+			bio_data_dir(bio),
+			bio->bi_idx, bio_iovec(bio)->bv_offset,
+			bio_iovec(bio)->bv_page);
+	too_many_bytes_w(bio);
+
+	peb = kmalloc(sizeof(*peb), GFP_NOIO);
+	if (peb == NULL)
+		return -ENOMEM;
+
+	peb->orig_bio = bio;
+	peb->error = false;
+	peb->cc = ti->private;
+
+	/* We will split the bio into:
+	 *  - optionally a "prelude bio" of sectors <= SECTORS_PER_IMD
+	 *  - 0 or more "middle bios" sectors == SECTORS_PER_IMD
+	 *  - a "postlude bio" <= SECTORS_PER_IMD
+	 *
+	 * TODO: there's room to simplify this math; we're keeping it explicit
+	 * for now
+	 */
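+	/* Worked example (assumed numbers): a 100-sector bio starting at
+	 * sector 50 gives first = 50, last = 150, prelude = 12 (sectors
+	 * 50..61), nmiddle = 1 (sectors 62..123) and postlude = 26 (sectors
+	 * 124..149). */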
+	first = bio->bi_sector;
+	last = bio->bi_sector + bio_sectors(bio);
+
+	/* How many sectors until the first cut */
+	prelude = dm_sector_div_up(first, SECTORS_PER_IMD)
+		* SECTORS_PER_IMD - first;
+
+	/* How many sectors from the last cut until last */
+	postlude = last - (dm_sector_div_up(last, SECTORS_PER_IMD) - 1)
+		* SECTORS_PER_IMD;
+
+	/* How many SECTORS_PER_IMD are between the first and last cuts */
+	nmiddle = ( (last - postlude) - (first + prelude) ) / SECTORS_PER_IMD;
+
+	expected_bios = 1 + nmiddle + 1;
+	atomic_set(&peb->nr_pending, expected_bios);
+
+	/*
+	dprint("  first:%lu last:%lu pre:%lu nm:%lu post:%lu pending:%lu\n",
+			first, last, prelude, nmiddle, postlude,
+			peb->nr_pending);
+	*/
+
+	submitted_bios = 0;
+
+	/* From now on, access to peb will be locked to avoid races with
+	 * evil_bio_complete() */
+
+	/* Submit the prelude bio */
+	if (prelude) {
+		new = prepare_nice_bio(peb, bio, first, prelude);
+		if (new == NULL) {
+			kfree(peb);
+			return -ENOMEM;
+		}
+
+		r = queue_nice_bio(ti, new);
+		if (r < 0)
+			goto prepare_error;
+
+		submitted_bios++;
+	}
+
+	/* Submit the middle bios */
+	for (i = 0; i < nmiddle; i++) {
+		new = prepare_nice_bio(peb, bio,
+				first + prelude + i * SECTORS_PER_IMD,
+				SECTORS_PER_IMD);
+		if (new == NULL)
+			goto prepare_error;
+
+		r = queue_nice_bio(ti, new);
+		if (r < 0)
+			goto prepare_error;
+
+		submitted_bios++;
+	}
+
+	/* Submit the postlude bio */
+	new = prepare_nice_bio(peb, bio, (last - postlude), postlude);
+	if (new == NULL) {
+		goto prepare_error;
+	}
+	r = queue_nice_bio(ti, new);
+	if (r < 0)
+		goto prepare_error;
+
+	submitted_bios++;
+
+	return 0;
+
+prepare_error:
+	/* There was an error in prepare_nice_bio(), but we already have some
+	 * in-flight bios that have been submitted and will call
+	 * evil_bio_complete() when they're done; decrement the expected
+	 * number of bios, and check if we're already done */
+	atomic_sub(expected_bios - submitted_bios, &peb->nr_pending);
+	peb->error = true;
+
+	if (atomic_read(&peb->nr_pending) == 0) {
+		kfree(peb);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/* Prepare a new nice bio cloned from the original one */
+static noinline struct bio *prepare_nice_bio(struct pending_evil_bio *peb,
+		struct bio *bio, sector_t begin, sector_t new_sectors)
+{
+	unsigned int segno, advance, sofar;
+	unsigned short bvec_count;
+	struct bio *new;
+	struct bio_vec *bvec;
+
+	new = bio_clone(bio, GFP_NOIO);
+	if (new == NULL)
+		return NULL;
+
+	/* Make the new bio start in the right idx and offset
+	 * TODO: this can be optimized because we're walking the same thing
+	 * over and over */
+	advance = (begin - bio->bi_sector) * 512;
+	sofar = 0;
+	segno = 0; /* will be set to bio->bi_idx by bio_for_each_segment */
+	bvec_count = 0;
+	bio_for_each_segment(bvec, new, segno) {
+		if (sofar + bvec->bv_len > advance) {
+			break;
+		}
+
+		sofar += bvec->bv_len;
+		bvec_count++;
+	}
+
+	new->bi_sector = begin;
+	new->bi_size = new_sectors * 512;
+
+	new->bi_idx = segno;
+	bio_iovec(new)->bv_offset += advance - sofar;
+	//bio_iovec(new)->bv_len -= advance - sofar;
+	bio_iovec(new)->bv_len =
+		min(new->bi_size, bio_iovec(new)->bv_len - (advance - sofar));
+	//new->bi_vcnt -= bvec_count; // XXX ACA: este -1 esta bien?
+
+	WARN_ON(bio_sectors(new) != new_sectors);
+
+	new->bi_private = peb;
+	new->bi_end_io = evil_bio_complete;
+
+	/* trim it so that the new bip_vec (which is shared with the original
+	 * bio) points to the right offset */
+	if (bio_integrity(bio))
+		bio_integrity_trim(new, begin - bio->bi_sector, new_sectors);
+
+	too_many_bytes_w(bio);
+	return new;
+}
+
+static void evil_bio_complete(struct bio *bio, int error)
+{
+	struct pending_evil_bio *peb = bio->bi_private;
+
+	if (error || !bio_flagged(bio, BIO_UPTODATE)) {
+		dprint("evil_b_c error\n");
+		dump_bio("bio", bio);
+		peb->error = true;
+	}
+
+	if (atomic_dec_and_test(&peb->nr_pending)) {
+		bio_endio(peb->orig_bio, peb->error ? -EIO : 0);
+		kfree(peb);
+	}
+
+	/* put the bio created with bio_clone() because we no longer care
+	 * about it */
+	bio_put(bio);
+}
+
+
+/*
+ * Device mapper
+ */
+
+/* Constructor: <data dev path> <data dev offset> \
+ * 	[ <integrity dev path> <integrity dev offset> ] */
+static int csum_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+	int err;
+	fmode_t mode;
+	unsigned long long data_offset, imd_offset;
+	sector_t data_dev_len;
+	struct csum_c *cc;
+
+	if (argc != 2 && argc != 4) {
+		ti->error = "Incorrect number of arguments";
+		return -EINVAL;
+	}
+
+	cc = kmalloc(sizeof(*cc), GFP_KERNEL);
+	if (cc == NULL) {
+		ti->error = "Cannot allocate context information";
+		return -ENOMEM;
+	}
+	cc->data_dev = cc->imd_dev = NULL;
+	cc->data_start = cc->imd_start = cc->imd_len = 0;
+
+	err = -EINVAL;
+
+	if (sscanf(argv[1], "%llu", &data_offset) != 1) {
+		ti->error = "Invalid data dev offset";
+		goto error;
+	}
+	cc->data_start = data_offset;
+
+	/* If we have both data and metadata on the same device, the
+	 * advertised size of the dm device will be slightly less than the
+	 * total, to account for the space dedicated to the metadata */
+	if (argc == 2) {
+		data_dev_len = ti->len + imd_sectors_needed(ti->len);
+	} else {
+		data_dev_len = ti->len;
+	}
+
+	mode = dm_table_get_mode(ti->table);
+	if (dm_get_device(ti, argv[0], cc->data_start, data_dev_len, mode,
+			&(cc->data_dev))) {
+		ti->error = "data device lookup failed";
+		goto error;
+	}
+
+	if (argc == 2) {
+		cc->map_data_sector = map_data_sector_same;
+		cc->get_imd_sector = get_imd_sector_same;
+		cc->imd_dev = cc->data_dev;
+	} else if (argc == 4) {
+		if (sscanf(argv[3], "%llu", &imd_offset) != 1) {
+			ti->error = "Invalid integrity dev offset";
+			goto error;
+		}
+		cc->imd_start = imd_offset;
+		cc->imd_len = imd_sectors_needed(ti->len);
+
+		if (dm_get_device(ti, argv[2], cc->imd_start,
+				cc->imd_len, mode, &(cc->imd_dev))) {
+			ti->error = "Integrity device lookup failed";
+			goto error;
+		}
+
+		cc->map_data_sector = map_data_sector_diff;
+		cc->get_imd_sector = get_imd_sector_diff;
+	}
+
+	ti->private = cc;
+
+	if (bi_register(ti) != 0) {
+		ti->error = "Couldn't register with bio-integrity";
+		goto error;
+	}
+
+	return 0;
+
+error:
+	if (cc->data_dev) {
+		if (cc->data_dev == cc->imd_dev) {
+			dm_put_device(ti, cc->data_dev);
+		} else {
+			dm_put_device(ti, cc->data_dev);
+			dm_put_device(ti, cc->imd_dev);
+		}
+	}
+	kfree(cc);
+	return err;
+}
+
+/* Destructor, undoes what was done in the constructor */
+static void csum_dtr(struct dm_target *ti)
+{
+	struct csum_c *cc = ti->private;
+
+	bi_unregister(ti);
+
+	if (cc->data_dev == cc->imd_dev) {
+		dm_put_device(ti, cc->data_dev);
+	} else {
+		dm_put_device(ti, cc->data_dev);
+		dm_put_device(ti, cc->imd_dev);
+	}
+
+	kfree(cc);
+}
+
+/* Operation mapping */
+static int csum_map(struct dm_target *ti, struct bio *bio,
+		      union map_info *map_context)
+{
+	int rv;
+
+	too_many_bytes_w(bio);
+
+	if (bio_is_evil(ti, bio))
+		rv = handle_evil_bio(ti, bio);
+	else
+		rv = queue_nice_bio(ti, bio);
+
+	if (rv < 0)
+		return rv;
+
+	return DM_MAPIO_SUBMITTED;
+}
+
+
+/*
+ * Target registration and module stuff
+ */
+
+static struct target_type csum_target = {
+	.name   = "csum",
+	.version = {1, 0, 0},
+	.module = THIS_MODULE,
+	.ctr    = csum_ctr,
+	.dtr	= csum_dtr,
+	.map    = csum_map,
+};
+
+static int __init dm_csum_init(void)
+{
+	int dm_rv;
+
+	submit_wq = create_workqueue("dm-csum-s");
+	if (submit_wq == NULL)
+		return -ENOMEM;
+
+	io_wq = create_workqueue("dm-csum-io");
+	if (io_wq == NULL) {
+		destroy_workqueue(submit_wq);
+		return -ENOMEM;
+	}
+
+	dm_rv = dm_register_target(&csum_target);
+	if (dm_rv < 0) {
+		DMERR("register failed: %d", dm_rv);
+		destroy_workqueue(submit_wq);
+		destroy_workqueue(io_wq);
+		return dm_rv;
+	}
+
+	return 0;
+}
+
+static void __exit dm_csum_exit(void)
+{
+	dm_unregister_target(&csum_target);
+	destroy_workqueue(submit_wq);
+	destroy_workqueue(io_wq);
+}
+
+module_init(dm_csum_init)
+module_exit(dm_csum_exit)
+
+MODULE_AUTHOR("Alberto Bertogli <albertito@blitiri.com.ar>");
+MODULE_DESCRIPTION(DM_NAME " checksumming I/O target");
+MODULE_LICENSE("GPL v2");
+