git » libjio » commit c02fcd2

[FORMAT CHANGE] Improve the on-disk transaction format

author Alberto Bertogli
2009-07-10 09:40:57 UTC
committer Alberto Bertogli
2009-07-11 03:33:41 UTC
parent f30c385b5288a682f0a16cf36d81a1c0fc23476f

[FORMAT CHANGE] Improve the on-disk transaction format

This patch changes the way transactions are stored on disk. The
fundamental reason for this change is to put the number of operations
contained in the transaction at the end.

That will allow us to write operations as they are added to the
transaction, instead of waiting until commit time, which can improve
performance on some operation-intensive workloads.

Other changes include the addition of a much needed version number (which
will hopefully keep the library backwards compatible in case of future
disk format changes), a more robust checksum algorithm, and defined
endianness of the stored data for portability purporses.

Signed-off-by: Alberto Bertogli <albertito@blitiri.com.ar>

UPGRADING +5 -0
doc/libjio.rst +10 -10
libjio/check.c +5 -91
libjio/checksum.c +84 -22
libjio/common.c +54 -0
libjio/common.h +4 -2
libjio/journal.c +279 -74
libjio/journal.h +4 -0
libjio/trans.c +6 -0
libjio/trans.h +0 -22
tests/behaviour/t_corruption.py +5 -2
tests/behaviour/tf.py +24 -12

diff --git a/UPGRADING b/UPGRADING
index dc7ded7..759a658 100644
--- a/UPGRADING
+++ b/UPGRADING
@@ -6,6 +6,11 @@ You should always clean all your files before upgrading. While I don't expect
 the transaction on-disk format to change, it's a good practise and it doesn't
 take much effort. When it's mandatory, it will be noted.
 
+-> 0.51 (On-disk format change)
+  - The way transactions are stored on disk has changed. It is mandatory that
+    you jfsck all your files before upgrading. Hopefully this will be the last
+    backwards-incompatible format change before 1.0.
+
 -> 0.50 (Big API change)
   - Structures are now opaque types:
     struct jfs -> jfs_t; jopen() returns a pointer to one, jclose() frees it.
diff --git a/doc/libjio.rst b/doc/libjio.rst
index 6bc4f3c..c166bf4 100644
--- a/doc/libjio.rst
+++ b/doc/libjio.rst
@@ -59,20 +59,20 @@ careful when doing strange things with files while working on them.
 The transaction file
 ~~~~~~~~~~~~~~~~~~~~
 
-The transaction file is composed of two main parts: the header and the
-payload.
+The transaction file is composed of three main parts: the header, the
+operations, and the trailer.
 
 The header holds basic information about the transaction itself, including the
-ID, some flags, and the amount of operations it includes. Then the payload has
-all the operations one after the other, divided in two parts: the first one
-includes static information about the operation (the length of the data, the
-offset of the file where it should be applied, etc.) and the data itself,
-which is saved by the library prior applying the commit, so transactions can
-be rollbacked.
+version, the transaction ID, and its flags.
 
-At the end of the transaction file, a checksum is stored, to detect journal
-corruption.
+Then the operation part has all the operations one after the other, prepending
+the operation data with a per-operation header that includes the length of the
+data and the offset of the file where it should be applied, and then the data
+itself.
 
+Finally, the trailer contains the number of operations included in it and a
+checksum of the whole file. Both fields are used to detect broken or corrupted
+transactions.
 
 The commit procedure
 --------------------
diff --git a/libjio/check.c b/libjio/check.c
index 03458ab..c39ce49 100644
--- a/libjio/check.c
+++ b/libjio/check.c
@@ -17,84 +17,10 @@
 
 #include "libjio.h"
 #include "common.h"
+#include "journal.h"
 #include "trans.h"
 
 
-/** Fill a transaction structure from a mmapped transaction file */
-static off_t fill_trans(unsigned char *map, off_t len, struct jtrans *ts)
-{
-	int i;
-	unsigned char *p;
-	struct joper *op, *tmp;
-	off_t translen;
-
-	if (len < J_DISKHEADSIZE)
-		return 0;
-
-	p = map;
-
-	ts->id = *( (uint32_t *) p);
-	p += 4;
-
-	ts->flags = *( (uint32_t *) p);
-	p += 4;
-
-	ts->numops = *( (uint32_t *) p);
-	p += 4;
-
-	translen = J_DISKHEADSIZE;
-
-	for (i = 0; i < ts->numops; i++) {
-		if (p + J_DISKOPHEADSIZE > map + len)
-			goto error;
-
-		op = malloc(sizeof(struct joper));
-		if (op == NULL)
-			goto error;
-
-		op->len = *( (uint32_t *) p);
-		p += 4;
-
-		op->plen = *( (uint32_t *) p);
-		p += 4;
-
-		op->offset = *( (uint64_t *) p);
-		p += 8;
-
-		if (p + op->len > map + len)
-			goto error;
-
-		op->buf = (void *) p;
-		p += op->len;
-
-		op->pdata = NULL;
-
-		if (ts->op == NULL) {
-			ts->op = op;
-			op->prev = NULL;
-			op->next = NULL;
-		} else {
-			for (tmp = ts->op; tmp->next != NULL; tmp = tmp->next)
-				;
-			tmp->next = op;
-			op->prev = tmp;
-			op->next = NULL;
-		}
-
-		translen += J_DISKOPHEADSIZE + op->len;
-	}
-
-	return translen;
-
-error:
-	while (ts->op != NULL) {
-		tmp = ts->op->next;
-		free(ts->op);
-		ts->op = tmp;
-	}
-	return 0;
-}
-
 /** Remove all the files in the journal directory (if any).
  *
  * @param name path to the file
@@ -158,7 +84,6 @@ enum jfsck_return jfsck(const char *name, const char *jdir,
 {
 	int tfd, rv, i, ret;
 	unsigned int maxtid;
-	uint32_t csum1, csum2;
 	char jlockfile[PATH_MAX], tname[PATH_MAX];
 	struct stat sinfo;
 	struct jfs fs;
@@ -167,7 +92,7 @@ enum jfsck_return jfsck(const char *name, const char *jdir,
 	DIR *dir;
 	struct dirent *dent;
 	unsigned char *map;
-	off_t filelen, translen, lr;
+	off_t filelen, lr;
 
 	tfd = -1;
 	filelen = 0;
@@ -310,23 +235,12 @@ enum jfsck_return jfsck(const char *name, const char *jdir,
 			map = NULL;
 			goto loop;
 		}
-		translen = fill_trans(map, filelen, curts);
-		if (translen == 0) {
-			res->broken++;
-			goto loop;
-		}
 
-		/* see if there's enough room for the checksum after the
-		 * transaction information */
-		if (filelen != translen + sizeof(uint32_t)) {
+		rv = fill_trans(map, filelen, curts);
+		if (rv == -1) {
 			res->broken++;
 			goto loop;
-		}
-
-		/* verify the checksum */
-		csum1 = checksum_map(map, filelen - (sizeof(uint32_t)));
-		csum2 = * (uint32_t *) (map + filelen - (sizeof(uint32_t)));
-		if (csum1 != csum2) {
+		} else if (rv == -2) {
 			res->corrupt++;
 			goto loop;
 		}
diff --git a/libjio/checksum.c b/libjio/checksum.c
index 250a96d..225a8af 100644
--- a/libjio/checksum.c
+++ b/libjio/checksum.c
@@ -1,7 +1,7 @@
 
 /*
  * Checksum functions
- * Based on RFC 1071, "Computing the Internet Checksum"
+ * Uses CRC32c, just because it's decent enough. As defined in RFC 3309.
  */
 
 #include <stddef.h>
@@ -9,10 +9,91 @@
 #include <sys/mman.h>
 #include "common.h"
 
+static uint32_t table[256] =
+{
+	0x00000000L, 0xF26B8303L, 0xE13B70F7L, 0x1350F3F4L,
+	0xC79A971FL, 0x35F1141CL, 0x26A1E7E8L, 0xD4CA64EBL,
+	0x8AD958CFL, 0x78B2DBCCL, 0x6BE22838L, 0x9989AB3BL,
+	0x4D43CFD0L, 0xBF284CD3L, 0xAC78BF27L, 0x5E133C24L,
+	0x105EC76FL, 0xE235446CL, 0xF165B798L, 0x030E349BL,
+	0xD7C45070L, 0x25AFD373L, 0x36FF2087L, 0xC494A384L,
+	0x9A879FA0L, 0x68EC1CA3L, 0x7BBCEF57L, 0x89D76C54L,
+	0x5D1D08BFL, 0xAF768BBCL, 0xBC267848L, 0x4E4DFB4BL,
+	0x20BD8EDEL, 0xD2D60DDDL, 0xC186FE29L, 0x33ED7D2AL,
+	0xE72719C1L, 0x154C9AC2L, 0x061C6936L, 0xF477EA35L,
+	0xAA64D611L, 0x580F5512L, 0x4B5FA6E6L, 0xB93425E5L,
+	0x6DFE410EL, 0x9F95C20DL, 0x8CC531F9L, 0x7EAEB2FAL,
+	0x30E349B1L, 0xC288CAB2L, 0xD1D83946L, 0x23B3BA45L,
+	0xF779DEAEL, 0x05125DADL, 0x1642AE59L, 0xE4292D5AL,
+	0xBA3A117EL, 0x4851927DL, 0x5B016189L, 0xA96AE28AL,
+	0x7DA08661L, 0x8FCB0562L, 0x9C9BF696L, 0x6EF07595L,
+	0x417B1DBCL, 0xB3109EBFL, 0xA0406D4BL, 0x522BEE48L,
+	0x86E18AA3L, 0x748A09A0L, 0x67DAFA54L, 0x95B17957L,
+	0xCBA24573L, 0x39C9C670L, 0x2A993584L, 0xD8F2B687L,
+	0x0C38D26CL, 0xFE53516FL, 0xED03A29BL, 0x1F682198L,
+	0x5125DAD3L, 0xA34E59D0L, 0xB01EAA24L, 0x42752927L,
+	0x96BF4DCCL, 0x64D4CECFL, 0x77843D3BL, 0x85EFBE38L,
+	0xDBFC821CL, 0x2997011FL, 0x3AC7F2EBL, 0xC8AC71E8L,
+	0x1C661503L, 0xEE0D9600L, 0xFD5D65F4L, 0x0F36E6F7L,
+	0x61C69362L, 0x93AD1061L, 0x80FDE395L, 0x72966096L,
+	0xA65C047DL, 0x5437877EL, 0x4767748AL, 0xB50CF789L,
+	0xEB1FCBADL, 0x197448AEL, 0x0A24BB5AL, 0xF84F3859L,
+	0x2C855CB2L, 0xDEEEDFB1L, 0xCDBE2C45L, 0x3FD5AF46L,
+	0x7198540DL, 0x83F3D70EL, 0x90A324FAL, 0x62C8A7F9L,
+	0xB602C312L, 0x44694011L, 0x5739B3E5L, 0xA55230E6L,
+	0xFB410CC2L, 0x092A8FC1L, 0x1A7A7C35L, 0xE811FF36L,
+	0x3CDB9BDDL, 0xCEB018DEL, 0xDDE0EB2AL, 0x2F8B6829L,
+	0x82F63B78L, 0x709DB87BL, 0x63CD4B8FL, 0x91A6C88CL,
+	0x456CAC67L, 0xB7072F64L, 0xA457DC90L, 0x563C5F93L,
+	0x082F63B7L, 0xFA44E0B4L, 0xE9141340L, 0x1B7F9043L,
+	0xCFB5F4A8L, 0x3DDE77ABL, 0x2E8E845FL, 0xDCE5075CL,
+	0x92A8FC17L, 0x60C37F14L, 0x73938CE0L, 0x81F80FE3L,
+	0x55326B08L, 0xA759E80BL, 0xB4091BFFL, 0x466298FCL,
+	0x1871A4D8L, 0xEA1A27DBL, 0xF94AD42FL, 0x0B21572CL,
+	0xDFEB33C7L, 0x2D80B0C4L, 0x3ED04330L, 0xCCBBC033L,
+	0xA24BB5A6L, 0x502036A5L, 0x4370C551L, 0xB11B4652L,
+	0x65D122B9L, 0x97BAA1BAL, 0x84EA524EL, 0x7681D14DL,
+	0x2892ED69L, 0xDAF96E6AL, 0xC9A99D9EL, 0x3BC21E9DL,
+	0xEF087A76L, 0x1D63F975L, 0x0E330A81L, 0xFC588982L,
+	0xB21572C9L, 0x407EF1CAL, 0x532E023EL, 0xA145813DL,
+	0x758FE5D6L, 0x87E466D5L, 0x94B49521L, 0x66DF1622L,
+	0x38CC2A06L, 0xCAA7A905L, 0xD9F75AF1L, 0x2B9CD9F2L,
+	0xFF56BD19L, 0x0D3D3E1AL, 0x1E6DCDEEL, 0xEC064EEDL,
+	0xC38D26C4L, 0x31E6A5C7L, 0x22B65633L, 0xD0DDD530L,
+	0x0417B1DBL, 0xF67C32D8L, 0xE52CC12CL, 0x1747422FL,
+	0x49547E0BL, 0xBB3FFD08L, 0xA86F0EFCL, 0x5A048DFFL,
+	0x8ECEE914L, 0x7CA56A17L, 0x6FF599E3L, 0x9D9E1AE0L,
+	0xD3D3E1ABL, 0x21B862A8L, 0x32E8915CL, 0xC083125FL,
+	0x144976B4L, 0xE622F5B7L, 0xF5720643L, 0x07198540L,
+	0x590AB964L, 0xAB613A67L, 0xB831C993L, 0x4A5A4A90L,
+	0x9E902E7BL, 0x6CFBAD78L, 0x7FAB5E8CL, 0x8DC0DD8FL,
+	0xE330A81AL, 0x115B2B19L, 0x020BD8EDL, 0xF0605BEEL,
+	0x24AA3F05L, 0xD6C1BC06L, 0xC5914FF2L, 0x37FACCF1L,
+	0x69E9F0D5L, 0x9B8273D6L, 0x88D28022L, 0x7AB90321L,
+	0xAE7367CAL, 0x5C18E4C9L, 0x4F48173DL, 0xBD23943EL,
+	0xF36E6F75L, 0x0105EC76L, 0x12551F82L, 0xE03E9C81L,
+	0x34F4F86AL, 0xC69F7B69L, 0xD5CF889DL, 0x27A40B9EL,
+	0x79B737BAL, 0x8BDCB4B9L, 0x988C474DL, 0x6AE7C44EL,
+	0xBE2DA0A5L, 0x4C4623A6L, 0x5F16D052L, 0xAD7D5351L,
+};
+
+/** Calculates the checksum of the given buffer, up to count bytes. Returns the
+ * checksum. The initial crc32 must be 0. */
+uint32_t checksum_buf(uint32_t crc32, const unsigned char *buf, size_t count)
+{
+	crc32 = ~crc32;
+
+	while (count--) {
+		crc32 = (crc32 >> 8) ^ table[(crc32 ^ *buf) & 0xFFL];
+		buf++;
+	}
+
+	return ~crc32;
+}
 
 /** Reads the contents of the given fd, up to len bytes, and stores the
  * checksum in csum. Returns 1 on success, 0 on error. */
-int checksum(int fd, size_t len, uint32_t *csum)
+int checksum_fd(int fd, size_t len, uint32_t *csum)
 {
 	uint8_t *map;
 
@@ -20,29 +101,10 @@ int checksum(int fd, size_t len, uint32_t *csum)
 	if (map == MAP_FAILED)
 		return 0;
 
-	*csum = checksum_map(map, len);
+	*csum = checksum_buf(0, map, len);
 
 	munmap(map, len);
 	return 1;
 }
 
-/** Calculates the checksum of the given buffer, up to count bytes. Returns
- * the checksum. */
-uint32_t checksum_map(uint8_t *map, size_t count)
-{
-	uint32_t sum = 0;
-
-	while( count > 1 )  {
-		sum += * (uint16_t *) map++;
-		count -= 2;
-	}
-
-	if( count > 0 )
-		sum += * (uint8_t *) map;
-
-	while (sum >> 16)
-		sum = (sum & 0xffff) + (sum >> 16);
-
-	return ~sum;
-}
 
diff --git a/libjio/common.c b/libjio/common.c
index 7a3cdf9..12c92ef 100644
--- a/libjio/common.c
+++ b/libjio/common.c
@@ -11,6 +11,7 @@
 #include <limits.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <arpa/inet.h>		/* htonl() and friends */
 
 #include "libjio.h"
 #include "common.h"
@@ -133,3 +134,56 @@ void get_jtfile(struct jfs *fs, unsigned int tid, char *jtfile)
 }
 
 
+/* The ntohll() and htonll() functions are not standard, so we define them
+ * using an UGLY trick because there is no standard way to check for
+ * endianness at runtime. */
+
+/** Convert a 64-bit value between network byte order and host byte order. */
+uint64_t ntohll(uint64_t x)
+{
+	static int endianness = 0;
+
+	/* determine the endianness by checking how htonl() behaves; use -1
+	 * for little endian and 1 for big endian */
+	if (endianness == 0) {
+		if (htonl(1) == 1)
+			endianness = 1;
+		else
+			endianness = -1;
+	}
+
+	if (endianness == 1) {
+		/* big endian */
+		return x;
+	}
+
+	/* little endian */
+	return ( ntohl( (x >> 32) & 0xFFFFFFFF ) | \
+			( (uint64_t) ntohl(x & 0xFFFFFFFF) ) << 32 );
+}
+
+/** Convert a 64-bit value between host byte order and network byte order. */
+uint64_t htonll(uint64_t x)
+{
+	static int endianness = 0;
+
+	/* determine the endianness by checking how htonl() behaves; use -1
+	 * for little endian and 1 for big endian */
+	if (endianness == 0) {
+		if (htonl(1) == 1)
+			endianness = 1;
+		else
+			endianness = -1;
+	}
+
+	if (endianness == 1) {
+		/* big endian */
+		return x;
+	}
+
+	/* little endian */
+	return ( htonl( (x >> 32) & 0xFFFFFFFF ) | \
+			( (uint64_t) htonl(x & 0xFFFFFFFF) ) << 32 );
+}
+
+
diff --git a/libjio/common.h b/libjio/common.h
index 4879eb5..d0f3ad8 100644
--- a/libjio/common.h
+++ b/libjio/common.h
@@ -74,9 +74,11 @@ ssize_t spread(int fd, void *buf, size_t count, off_t offset);
 ssize_t spwrite(int fd, const void *buf, size_t count, off_t offset);
 int get_jdir(const char *filename, char *jdir);
 void get_jtfile(struct jfs *fs, unsigned int tid, char *jtfile);
+uint64_t ntohll(uint64_t x);
+uint64_t htonll(uint64_t x);
 
-int checksum(int fd, size_t len, uint32_t *csum);
-uint32_t checksum_map(uint8_t *map, size_t count);
+int checksum_fd(int fd, size_t len, uint32_t *csum);
+uint32_t checksum_buf(uint32_t sum, const unsigned char *buf, size_t count);
 
 void autosync_check(struct jfs *fs);
 
diff --git a/libjio/journal.c b/libjio/journal.c
index 0666504..324a8f9 100644
--- a/libjio/journal.c
+++ b/libjio/journal.c
@@ -8,13 +8,13 @@
 #include <fcntl.h>		/* open() */
 #include <unistd.h>		/* f[data]sync(), close() */
 #include <stdlib.h>		/* malloc() and friends */
-#include <limits.h>		/* MAX_PATH */
+#include <limits.h>		/* PATH_MAX */
 #include <string.h>		/* memcpy() */
-#include <libgen.h>		/* basename(), dirname() */
 #include <stdio.h>		/* fprintf() */
-#include <dirent.h>		/* readdir() and friends */
 #include <errno.h>		/* errno */
-#include <sys/mman.h>		/* mmap() */
+#include <stdint.h>		/* uintX_t */
+#include <arpa/inet.h>		/* htonl() and friends */
+#include <netinet/in.h>		/* htonl() and friends (on some platforms) */
 
 #include "libjio.h"
 #include "common.h"
@@ -24,7 +24,87 @@
 
 
 /*
- * helper functions
+ * On-disk structures
+ *
+ * Each transaction will be stored on disk as a single file, composed of a
+ * header, operation information, and a trailer. The operation information is
+ * composed of repeated operation headers followed by their corresponding
+ * data, one for each operation. A special operation header containing all 0s
+ * marks the end of the operations.
+ * 
+ * Visually, something like this:
+ * 
+ *  +--------+---------+----------+---------+----------+-----+-----+---------+
+ *  | header | op1 hdr | op1 data | op2 hdr | op2 data | ... | eoo | trailer |
+ *  +--------+---------+----------+---------+----------+-----+-----+---------+
+ *             \                                             /
+ *              +--------------- operations ----------------+ 
+ *
+ * The details of each part can be seen on the following structures. All
+ * integers are stored in network byte order.
+ */
+
+/** Transaction file header */
+struct on_disk_hdr {
+	uint16_t ver;
+	uint16_t flags;
+	uint32_t trans_id;
+} __attribute__((packed));
+
+/** Transaction file operation header */
+struct on_disk_ophdr {
+	uint32_t len;
+	uint64_t offset;
+} __attribute__((packed));
+
+/** Transaction file trailer */
+struct on_disk_trailer {
+	uint32_t numops;
+	uint32_t checksum;
+} __attribute__((packed));
+
+
+/* Convert structs to/from host to network (disk) endian */
+
+static void hdr_hton(struct on_disk_hdr *hdr)
+{
+	hdr->ver = htons(hdr->ver);
+	hdr->flags = htons(hdr->flags);
+	hdr->trans_id = htonl(hdr->trans_id);
+}
+
+static void hdr_ntoh(struct on_disk_hdr *hdr)
+{
+	hdr->ver = ntohs(hdr->ver);
+	hdr->flags = ntohs(hdr->flags);
+	hdr->trans_id = ntohl(hdr->trans_id);
+}
+
+static void ophdr_hton(struct on_disk_ophdr *ophdr)
+{
+	ophdr->len = htonl(ophdr->len);
+	ophdr->offset = htonll(ophdr->offset);
+}
+
+static void ophdr_ntoh(struct on_disk_ophdr *ophdr)
+{
+	ophdr->len = ntohl(ophdr->len);
+	ophdr->offset = ntohll(ophdr->offset);
+}
+
+static void trailer_hton(struct on_disk_trailer *trailer) {
+	trailer->numops = htonl(trailer->numops);
+	trailer->checksum = htonl(trailer->checksum);
+}
+
+static void trailer_ntoh(struct on_disk_trailer *trailer) {
+	trailer->numops = ntohl(trailer->numops);
+	trailer->checksum = ntohl(trailer->checksum);
+}
+
+
+/*
+ * Helper functions
  */
 
 /** Get a new transaction id */
@@ -130,9 +210,8 @@ struct journal_op *journal_new(struct jtrans *ts)
 	int fd, id;
 	ssize_t rv;
 	char *name = NULL;
-	unsigned char buf_init[J_DISKHEADSIZE];
-	unsigned char *bufp;
-	struct journal_op *jop = NULL;
+	struct journal_op *jop;
+	struct on_disk_hdr hdr;
 
 	jop = malloc(sizeof(struct journal_op));
 	if (jop == NULL)
@@ -156,6 +235,7 @@ struct journal_op *journal_new(struct jtrans *ts)
 	jop->fd = fd;
 	jop->name = name;
 	jop->curpos = 0;
+	jop->csum = 0;
 	jop->ts = ts;
 	jop->fs = ts->fs;
 
@@ -167,22 +247,18 @@ struct journal_op *journal_new(struct jtrans *ts)
 	ts->id = id;
 
 	/* save the header */
-	bufp = buf_init;
-
-	memcpy(bufp, (void *) &(ts->id), 4);
-	bufp += 4;
+	hdr.ver = 1;
+	hdr.trans_id = id;
+	hdr.flags = ts->flags;
 
-	memcpy(bufp, (void *) &(ts->flags), 4);
-	bufp += 4;
-
-	memcpy(bufp, (void *) &(ts->numops), 4);
-	bufp += 4;
-
-	rv = spwrite(fd, buf_init, J_DISKHEADSIZE, 0);
-	if (rv != J_DISKHEADSIZE)
+	hdr_hton(&hdr);
+	rv = spwrite(fd, &hdr, sizeof(hdr), 0);
+	if (rv != sizeof(hdr))
 		goto unlink_error;
 
-	jop->curpos = J_DISKHEADSIZE;
+	jop->curpos = sizeof(hdr);
+	jop->csum = checksum_buf(jop->csum, (unsigned char *) &hdr,
+			sizeof(hdr));
 
 	fiu_exit_on("jio/commit/tf_header");
 
@@ -200,81 +276,105 @@ error:
 	return NULL;
 }
 
-/** Save the given transaction in the journal */
-int journal_save(struct journal_op *jop)
+/** Saves a single joper in the journal file */
+static int save_joper(struct journal_op *jop, const struct jtrans *ts,
+		struct joper *op)
 {
 	ssize_t rv;
-	uint32_t csum;
-	struct joper *op;
-	unsigned char hdr[J_DISKOPHEADSIZE];
-	unsigned char *hdrp;
-	const struct jtrans *ts = jop->ts;
+	struct on_disk_ophdr ophdr;
+
+	/* read the current content only if the transaction is not
+	 * marked as NOROLLBACK, and if the data is not there yet,
+	 * which is the normal case, but for rollbacking we fill it
+	 * ourselves */
+	if (!(ts->flags & J_NOROLLBACK) && (op->pdata == NULL)) {
+		op->pdata = malloc(op->len);
+		if (op->pdata == NULL)
+			goto error;
 
-	/* save each transacion in the file */
-	for (op = ts->op; op != NULL; op = op->next) {
-		/* read the current content only if the transaction is not
-		 * marked as NOROLLBACK, and if the data is not there yet,
-		 * which is the normal case, but for rollbacking we fill it
-		 * ourselves */
-		if (!(ts->flags & J_NOROLLBACK) && (op->pdata == NULL)) {
-			op->pdata = malloc(op->len);
-			if (op->pdata == NULL)
-				goto error;
-
-			op->plen = op->len;
-
-			rv = spread(ts->fs->fd, op->pdata, op->len,
-					op->offset);
-			if (rv < 0)
-				goto error;
-			if (rv < op->len) {
-				/* we are extending the file! */
-				/* ftruncate(ts->fs->fd, op->offset + op->len); */
-				op->plen = rv;
-			}
+		op->plen = op->len;
+
+		rv = spread(ts->fs->fd, op->pdata, op->len,
+				op->offset);
+		if (rv < 0)
+			goto error;
+		if (rv < op->len) {
+			/* we are extending the file! */
+			/* ftruncate(ts->fs->fd, op->offset + op->len); */
+			op->plen = rv;
 		}
+	}
 
-		/* save the operation's header */
-		hdrp = hdr;
+	/* save the operation's header */
+	ophdr.len = op->len;
+	ophdr.offset = op->offset;
 
-		memcpy(hdrp, (void *) &(op->len), 4);
-		hdrp += 4;
+	ophdr_hton(&ophdr);
+	rv = spwrite(jop->fd, &ophdr, sizeof(ophdr), jop->curpos);
+	if (rv != sizeof(ophdr))
+		goto error;
 
-		memcpy(hdrp, (void *) &(op->plen), 4);
-		hdrp += 4;
+	fiu_exit_on("jio/commit/tf_ophdr");
 
-		memcpy(hdrp, (void *) &(op->offset), 8);
-		hdrp += 8;
+	jop->curpos += sizeof(ophdr);
+	jop->csum = checksum_buf(jop->csum, (unsigned char *) &ophdr,
+			sizeof(ophdr));
 
-		rv = spwrite(jop->fd, hdr, J_DISKOPHEADSIZE, jop->curpos);
-		if (rv != J_DISKOPHEADSIZE)
-			goto error;
+	/* and save it to the disk */
+	rv = spwrite(jop->fd, op->buf, op->len, jop->curpos);
+	if (rv != op->len)
+		goto error;
 
-		fiu_exit_on("jio/commit/tf_ophdr");
+	jop->curpos += op->len;
+	jop->csum = checksum_buf(jop->csum, op->buf, op->len);
 
-		jop->curpos += J_DISKOPHEADSIZE;
+	return 0;
 
-		/* and save it to the disk */
-		rv = spwrite(jop->fd, op->buf, op->len, jop->curpos);
-		if (rv != op->len)
-			goto error;
+error:
+	return -1;
+}
+
+/** Save the given transaction in the journal */
+int journal_save(struct journal_op *jop)
+{
+	ssize_t rv;
+	struct joper *op;
+	const struct jtrans *ts = jop->ts;
+	struct on_disk_ophdr ophdr;
+	struct on_disk_trailer trailer;
 
-		jop->curpos += op->len;
+	/* save each transacion in the file */
+	for (op = ts->op; op != NULL; op = op->next) {
+		rv = save_joper(jop, ts, op);
+		if (rv != 0)
+			goto error;
 
 		fiu_exit_on("jio/commit/tf_opdata");
 	}
 
 	fiu_exit_on("jio/commit/tf_data");
 
-	/* compute and save the checksum (curpos is always small, so there's
-	 * no overflow possibility when we convert to size_t) */
-	if (!checksum(jop->fd, jop->curpos, &csum))
+	/* write the the empty ophdr to mark there are no more operations, and
+	 * then the trailer */
+	ophdr.len = 0;
+	ophdr.offset = 0;
+
+	ophdr_hton(&ophdr);
+	rv = spwrite(jop->fd, &ophdr, sizeof(ophdr), jop->curpos);
+	if (rv != sizeof(ophdr))
 		goto error;
 
-	rv = spwrite(jop->fd, &csum, sizeof(uint32_t), jop->curpos);
-	if (rv != sizeof(uint32_t))
+	jop->curpos += sizeof(ophdr);
+	jop->csum = checksum_buf(jop->csum, (unsigned char *) &ophdr,
+			sizeof(ophdr));
+
+	trailer.checksum = jop->csum;
+	trailer.numops = ts->numops;
+
+	trailer_hton(&trailer);
+	rv = spwrite(jop->fd, &trailer, sizeof(trailer), jop->curpos);
+	if (rv != sizeof(trailer))
 		goto error;
-	jop->curpos += sizeof(uint32_t);
 
 	/* this is a simple but efficient optimization: instead of doing
 	 * everything O_SYNC, we sync at this point only, this way we avoid
@@ -335,3 +435,108 @@ exit:
 }
 
 
+/** Fill a transaction structure from a mmapped transaction file. Useful for
+ * checking purposes.
+ * @returns 0 on success, -1 if the file was broken, -2 if the checksums didn't
+ *	match
+ */
+int fill_trans(unsigned char *map, off_t len, struct jtrans *ts)
+{
+	int rv, numops;
+	unsigned char *p;
+	struct joper *op, *tmp;
+	struct on_disk_hdr hdr;
+	struct on_disk_ophdr ophdr;
+	struct on_disk_trailer trailer;
+
+	rv = -1;
+
+	if (len < sizeof(hdr) + sizeof(ophdr) + sizeof(trailer))
+		return -1;
+
+	p = map;
+
+	memcpy(&hdr, p, sizeof(hdr));
+	p += sizeof(hdr);
+
+	hdr_ntoh(&hdr);
+	if (hdr.ver != 1)
+		return -1;
+
+	ts->id = hdr.trans_id;
+	ts->flags = hdr.flags;
+
+	numops = 0;
+	for (;;) {
+		if (p + sizeof(ophdr) > map + len)
+			goto error;
+
+		memcpy(&ophdr, p,  sizeof(ophdr));
+		p += sizeof(ophdr);
+
+		ophdr_ntoh(&ophdr);
+
+		if (ophdr.len == 0 && ophdr.offset == 0) {
+			/* This header marks the end of the operations */
+			break;
+		}
+
+		if (p + ophdr.len > map + len)
+			goto error;
+
+		op = malloc(sizeof(struct joper));
+		if (op == NULL)
+			goto error;
+
+		op->len = ophdr.len;
+		op->offset = ophdr.offset;
+
+		op->buf = (void *) p;
+		p += op->len;
+
+		op->pdata = NULL;
+
+		if (ts->op == NULL) {
+			ts->op = op;
+			op->prev = NULL;
+			op->next = NULL;
+		} else {
+			for (tmp = ts->op; tmp->next != NULL; tmp = tmp->next)
+				;
+			tmp->next = op;
+			op->prev = tmp;
+			op->next = NULL;
+		}
+
+		numops++;
+	}
+
+	if (p + sizeof(trailer) > map + len)
+		goto error;
+
+	memcpy(&trailer, p, sizeof(trailer));
+	p += sizeof(trailer);
+
+	trailer_ntoh(&trailer);
+
+	if (trailer.numops != numops)
+		goto error;
+
+	ts->numops = numops;
+
+	if (checksum_buf(0, map, len - sizeof(trailer)) != trailer.checksum) {
+		rv = -2;
+		goto error;
+	}
+
+	return 0;
+
+error:
+	while (ts->op != NULL) {
+		tmp = ts->op->next;
+		free(ts->op);
+		ts->op = tmp;
+	}
+	return rv;
+}
+
diff --git a/libjio/journal.h b/libjio/journal.h
index 5a4666c..765571b 100644
--- a/libjio/journal.h
+++ b/libjio/journal.h
@@ -2,6 +2,7 @@
 #ifndef _JOURNAL_H
 #define _JOURNAL_H
 
+#include <stdint.h>
 #include "libjio.h"
 
 
@@ -10,6 +11,7 @@ struct journal_op {
 	int fd;
 	char *name;
 	off_t curpos;
+	uint32_t csum;
 	struct jtrans *ts;
 	struct jfs *fs;
 };
@@ -20,5 +22,7 @@ struct journal_op *journal_new(struct jtrans *ts);
 int journal_save(struct journal_op *jop);
 int journal_free(struct journal_op *jop);
 
+int fill_trans(unsigned char *map, off_t len, struct jtrans *ts);
+
 #endif
 
diff --git a/libjio/trans.c b/libjio/trans.c
index c6fe458..4a4474e 100644
--- a/libjio/trans.c
+++ b/libjio/trans.c
@@ -87,6 +87,12 @@ int jtrans_add(struct jtrans *ts, const void *buf, size_t count, off_t offset)
 		return -1;
 	}
 
+	/* fail for 0 length transactions */
+	if (count == 0) {
+		pthread_mutex_unlock(&(ts->lock));
+		return 1;
+	}
+
 	if ((long long) ts->len + count > MAX_TSIZE) {
 		pthread_mutex_unlock(&(ts->lock));
 		return -1;
diff --git a/libjio/trans.h b/libjio/trans.h
index 366585e..e7fa689 100644
--- a/libjio/trans.h
+++ b/libjio/trans.h
@@ -49,27 +49,5 @@ struct jlinger {
 };
 
 
-/* on-disk structures */
-
-/* header (fixed length, defined below) */
-struct disk_header {
-	uint32_t id;		/* id */
-	uint32_t flags;		/* flags about this transaction */
-	uint32_t numops;	/* number of operations */
-};
-
-/* operation */
-struct disk_operation {
-	uint32_t len;		/* data length */
-	uint32_t plen;		/* previous data length */
-	uint64_t offset;	/* offset relative to the BOF */
-	char *prevdata;		/* previous data for rollback */
-};
-
-/* disk constants */
-#define J_DISKHEADSIZE	 12	/* length of disk_header */
-#define J_DISKOPHEADSIZE 16	/* length of disk_operation header */
-
-
 #endif
 
diff --git a/tests/behaviour/t_corruption.py b/tests/behaviour/t_corruption.py
index 76a50a5..ed1bedf 100644
--- a/tests/behaviour/t_corruption.py
+++ b/tests/behaviour/t_corruption.py
@@ -27,8 +27,11 @@ def test_c01():
 	n = run_with_tmp(f1)
 	assert content(n) == ''
 	tc = open(transpath(n, 1)).read()
-	# flip just one bit of the first byte
-	tc = chr((ord(tc[0]) & 0xFE) | (~ ord(tc[0]) & 0x1) & 0xFF) + tc[1:]
+	# flip just one bit in the middle byte
+	pos = len(tc) / 2
+	tc = tc[:pos] + \
+		chr((ord(tc[pos]) & 0xFE) | (~ ord(tc[pos]) & 0x1) & 0xFF) + \
+		tc[pos + 1:]
 	open(transpath(n, 1), 'w').write(tc)
 	fsck_verify(n, corrupt = 1)
 	assert content(n) == ''
diff --git a/tests/behaviour/tf.py b/tests/behaviour/tf.py
index 786f24c..20450ee 100644
--- a/tests/behaviour/tf.py
+++ b/tests/behaviour/tf.py
@@ -17,9 +17,10 @@ import struct
 import libjio
 
 
-# Useful constants, must match libjio.h
-DHS = 12	# disk header size
-DOHS = 16	# disk op header size
+# Useful constants, must match journal.h
+DHS = 8		# disk header size
+DOHS = 12	# disk op header size
+DTS = 8		# disk trailer size
 
 
 def tmppath():
@@ -159,9 +160,11 @@ class attrdict (dict):
 
 class TransFile (object):
 	def __init__(self, path = ''):
+		self.ver = 0
 		self.id = -1
 		self.flags = 0
 		self.numops = -1
+		self.checksum = -1
 		self.ops = []
 		self.path = path
 		if path:
@@ -171,30 +174,39 @@ class TransFile (object):
 		fd = open(self.path)
 
 		# header
-		hdrfmt = "III"
-		self.id, self.flags, self.numops = struct.unpack(hdrfmt,
+		hdrfmt = "!HHI"
+		self.ver, self.flags, self.id = struct.unpack(hdrfmt,
 				fd.read(struct.calcsize(hdrfmt)))
 
 		# operations (header only)
-		opfmt = "IIQ"
+		opfmt = "!IQ"
 		self.ops = []
-		for i in range(self.numops):
-			tlen, plen, offset = struct.unpack(opfmt,
+		while True:
+			tlen, offset = struct.unpack(opfmt,
 					fd.read(struct.calcsize(opfmt)))
+			if tlen == offset == 0:
+				break
 			payload = fd.read(tlen)
 			assert len(payload) == tlen
-			self.ops.append(attrdict(tlen = tlen, plen = plen,
-				offset = offset, payload = payload))
+			self.ops.append(attrdict(tlen = tlen, offset = offset,
+				payload = payload))
+
+		# trailer
+		trailerfmt = "!II"
+		self.numops, self.checksum = struct.unpack(trailerfmt,
+				fd.read(struct.calcsize(trailerfmt)))
 
 	def save(self):
 		# the lack of integrity checking in this function is
 		# intentional, so we can write broken transactions and see how
 		# jfsck() copes with them
 		fd = open(self.path, 'w')
-		fd.write(struct.pack("III", self.id, self.flags, self.numops))
+		fd.write(struct.pack("!HHI", 1, self.flags, self.id))
 		for o in self.ops:
-			fd.write(struct.pack("IIQs", o.tlen, o.plen, o.offset,
+			fd.write(struct.pack("!IQs", o.tlen, o.offset,
 				o.payload))
+		fd.write(struct.pack("!IQ", 0, 0))
+		fd.write(struct.pack("!II", self.numops, self.checksum))
 
 	def __repr__(self):
 		return '<TransFile %s: id:%d f:%s n:%d ops:%s>' % \