git » libjio » commit cc66396

This is a big rework of the commit procedure, which makes transactions really useful by bundling operations up and applying all of them atomically, which is what one would expect.

author Alberto Bertogli
2004-06-20 02:57:36 UTC
committer Alberto Bertogli
2007-07-15 13:02:29 UTC
parent edc7dc01863a248e52f8e7f30fb9134ed48d915b

This is a big rework of the commit procedure, which makes transactions really useful by bundling operations up and applying all of them atomically, which is what one would expect.

This is a big rework of the commit procedure, which makes transactions really
useful by bundling operations up and applying all of them atomically, which is
what one would expect.

So now you create a transaction, add some operations to it (writes to
different offsets) and then commit it as a whole.

It's a big patch; it could have been split into more parts (i.e. the move of
jfsck() is pretty much orthogonal to this), but as I'm changing code all over
the place this is easier to work with.

Makefile +1 -1
check.c +260 -0
common.c +54 -0
common.h +2 -0
libjio.h +39 -25
samples/jio3.c +16 -10
trans.c +211 -380
unix.c +19 -39

diff --git a/Makefile b/Makefile
index 8d068e6..a032e05 100644
--- a/Makefile
+++ b/Makefile
@@ -3,7 +3,7 @@ include Make.conf
 
 
 # objects to build
-OBJS = common.o trans.o unix.o ansi.o
+OBJS = common.o trans.o check.o unix.o ansi.o
 
 # rules
 default: all
diff --git a/check.c b/check.c
new file mode 100644
index 0000000..86c9f46
--- /dev/null
+++ b/check.c
@@ -0,0 +1,260 @@
+
+/*
+ * libjio - A library for Journaled I/O
+ * Alberto Bertogli (albertogli@telpin.com.ar)
+ *
+ * Recovery functions
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <limits.h>
+#include <string.h>
+#include <stdio.h>
+#include <dirent.h>
+#include <errno.h>
+#include <sys/mman.h>
+
+#include "libjio.h"
+#include "common.h"
+
+
+/* fill a transaction structure from a mmapped transaction file */
+static int fill_trans(unsigned char *map, off_t len, struct jtrans *ts)
+{
+	int i;
+	unsigned char *p;
+	struct joper *op, *tmp;
+
+	if (len < J_DISKHEADSIZE)
+		return 0;
+
+	p = map;
+
+	ts->id = *( (uint32_t *) p);
+	p += 4;
+
+	ts->flags = *( (uint32_t *) p);
+	p += 4;
+
+	ts->numops = *( (uint32_t *) p);
+	p += 4;
+
+	for (i = 0; i < ts->numops; i++) {
+		if (len < (p - map) + J_DISKOPHEADSIZE)
+			goto error;
+
+		op = malloc(sizeof(struct joper));
+		if (op == NULL)
+			goto error;
+
+		op->len = *( (uint32_t *) p);
+		p += 4;
+
+		op->plen = *( (uint32_t *) p);
+		p += 4;
+
+		op->offset = *( (uint64_t *) p);
+		p += 8;
+
+		if (len < (p - map) + op->plen)
+			goto error;
+
+		op->pdata = (void *) p;
+		p += op->plen;
+
+		if (ts->op == NULL) {
+			ts->op = op;
+			op->prev = NULL;
+			op->next = NULL;
+		} else {
+			for(tmp = ts->op; tmp->next != NULL; tmp = tmp->next)
+				;
+			tmp->next = op;
+			op->prev = tmp;
+			op->next = NULL;
+		}
+	}
+
+	return 1;
+
+error:
+	while (ts->op != NULL) {
+		tmp = ts->op->next;
+		free(ts->op);
+		ts->op = tmp;
+	}
+	return 0;
+}
+
+/* check the journal and rollback incomplete transactions */
+int jfsck(const char *name, struct jfsck_result *res)
+{
+	int fd, tfd, rv, i, maxtid;
+	char jdir[PATH_MAX], jlockfile[PATH_MAX], tname[PATH_MAX];
+	struct stat sinfo;
+	struct jfs fs;
+	struct jtrans *curts;
+	DIR *dir;
+	struct dirent *dent;
+	void *map;
+	off_t filelen;
+
+
+	fd = open(name, O_RDWR | O_SYNC | O_LARGEFILE);
+	if (fd < 0)
+		return J_ENOENT;
+
+	fs.fd = fd;
+	fs.name = (char *) name;
+
+	if (!get_jdir(name, jdir))
+		return J_ENOMEM;
+	rv = lstat(jdir, &sinfo);
+	if (rv < 0 || !S_ISDIR(sinfo.st_mode))
+		return J_ENOJOURNAL;
+
+	/* open the lock file, which is only used to complete the jfs
+	 * structure */
+	snprintf(jlockfile, PATH_MAX, "%s/%s", jdir, "lock");
+	rv = open(jlockfile, O_RDWR | O_CREAT, 0600);
+	if (rv < 0)
+		return J_ENOJOURNAL;
+	fs.jfd = rv;
+
+	dir = opendir(jdir);
+	if (dir == NULL)
+		return J_ENOJOURNAL;
+
+	/* loop for each file in the journal directory to find out the greater
+	 * transaction number */
+	maxtid = 0;
+	for (dent = readdir(dir); dent != NULL; dent = readdir(dir)) {
+		/* see if the file is named like a transaction, ignore
+		 * otherwise; as transactions are named as numbers > 0, a
+		 * simple atoi() is enough testing */
+		rv = atoi(dent->d_name);
+		if (rv <= 0)
+			continue;
+		if (rv > maxtid)
+			maxtid = rv;
+	}
+	closedir(dir);
+
+	/* rewrite the lockfile, writing the new maxtid on it, so that when we
+	 * rollback a transaction it doesn't step over existing ones */
+	rv = spwrite(fs.jfd, &maxtid, sizeof(maxtid), 0);
+	if (rv != sizeof(maxtid)) {
+		return J_ENOMEM;
+	}
+
+	/* we loop all the way up to the max transaction id */
+	for (i = 1; i <= maxtid; i++) {
+		curts = malloc(sizeof(struct jtrans));
+		if (curts == NULL)
+			return J_ENOMEM;
+
+		jtrans_init(&fs, curts);
+		curts->id = i;
+
+		/* open the transaction file, using i as its name, so we are
+		 * really looping in order (recovering transaction in a
+		 * different order as they were applied means instant
+		 * corruption) */
+		if (!get_jtfile(name, i, tname))
+			return J_ENOMEM;
+		tfd = open(tname, O_RDWR | O_SYNC | O_LARGEFILE, 0600);
+		if (tfd < 0) {
+			res->invalid++;
+			goto loop;
+		}
+
+		/* try to lock the transaction file, if it's locked then it is
+		 * currently being used so we skip it */
+		rv = plockf(tfd, F_TLOCK, 0, 0);
+		if (rv == -1) {
+			res->in_progress++;
+			goto loop;
+		}
+
+		filelen = lseek(tfd, 0, SEEK_END);
+		map = mmap(0, filelen, PROT_READ, MAP_SHARED, tfd, 0);
+		rv = fill_trans((unsigned char *) map, filelen, curts);
+		if (rv != 1) {
+			res->broken++;
+			goto loop;
+		}
+
+		rv = jtrans_rollback(curts);
+
+		munmap(map, filelen);
+
+		if (rv < 0) {
+			res->apply_error++;
+			goto loop;
+		}
+		res->rollbacked++;
+
+
+loop:
+		if (tfd >= 0) {
+			close(tfd);
+			tfd = -1;
+		}
+
+		free(curts);
+
+		res->total++;
+	}
+
+	close(fs.fd);
+	close(fs.jfd);
+
+	return 0;
+
+}
+
+/* remove all the files in the journal directory (if any) */
+int jfsck_cleanup(const char *name)
+{
+	char jdir[PATH_MAX], tfile[PATH_MAX*3];
+	DIR *dir;
+	struct dirent *dent;
+
+	if (!get_jdir(name, jdir))
+		return 0;
+
+	dir = opendir(jdir);
+	if (dir == NULL && errno == ENOENT)
+		/* it doesn't exist, so it's clean */
+		return 1;
+	else if (dir == NULL)
+		return 0;
+
+	for (dent = readdir(dir); dent != NULL; dent = readdir(dir)) {
+		/* we only care about transactions (named as numbers > 0) and
+		 * the lockfile (named "lock"); ignore everything else */
+		if (strcmp(dent->d_name, "lock") && atoi(dent->d_name) <= 0)
+			continue;
+
+		/* build the full path to the transaction file */
+		memset(tfile, 0, PATH_MAX * 3);
+		strcat(tfile, jdir);
+		strcat(tfile, "/");
+		strcat(tfile, dent->d_name);
+
+		/* the full filename is too large */
+		if (strlen(tfile) > PATH_MAX)
+			return 0;
+
+		/* and remove it */
+		unlink(tfile);
+	}
+	closedir(dir);
+
+	return 1;
+}
+
diff --git a/common.c b/common.c
index e0abfd3..6e39520 100644
--- a/common.c
+++ b/common.c
@@ -9,6 +9,11 @@
 #include <sys/types.h>
 #include <fcntl.h>
 #include <unistd.h>
+#include <string.h>
+#include <libgen.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
 
 #include "common.h"
 
@@ -90,3 +95,52 @@ ssize_t spwrite(int fd, const void *buf, size_t count, off_t offset)
 	return count;
 }
 
+/* build the journal directory name out of the filename */
+int get_jdir(const char *filename, char *jdir)
+{
+	char *base, *baset;
+	char *dir, *dirt;
+
+	baset = strdup(filename);
+	if (baset == NULL)
+		return 0;
+	base = basename(baset);
+
+	dirt = strdup(filename);
+	if (dirt == NULL)
+		return 0;
+	dir = dirname(dirt);
+
+	snprintf(jdir, PATH_MAX, "%s/.%s.jio", dir, base);
+
+	free(baset);
+	free(dirt);
+
+	return 1;
+}
+
+/* build the filename of a given transaction */
+int get_jtfile(const char *filename, int tid, char *jtfile)
+{
+	char *base, *baset;
+	char *dir, *dirt;
+
+	baset = strdup(filename);
+	if (baset == NULL)
+		return 0;
+	base = basename(baset);
+
+	dirt = strdup(filename);
+	if (dirt == NULL)
+		return 0;
+	dir = dirname(dirt);
+
+	snprintf(jtfile, PATH_MAX, "%s/.%s.jio/%d", dir, base, tid);
+
+	free(baset);
+	free(dirt);
+
+	return 1;
+}
+
+
diff --git a/common.h b/common.h
index a048f91..adcfcfc 100644
--- a/common.h
+++ b/common.h
@@ -12,6 +12,8 @@
 off_t plockf(int fd, int cmd, off_t offset, off_t len);
 ssize_t spread(int fd, void *buf, size_t count, off_t offset);
 ssize_t spwrite(int fd, const void *buf, size_t count, off_t offset);
+int get_jdir(const char *filename, char *jdir);
+int get_jtfile(const char *filename, int tid, char *jtfile);
 
 #endif
 
diff --git a/libjio.h b/libjio.h
index 07d54c4..8204f46 100644
--- a/libjio.h
+++ b/libjio.h
@@ -19,64 +19,77 @@ extern "C" {
 
 
 /* logical structures */
+
+/* the main file structure */
 struct jfs {
 	int fd;			/* main file descriptor */
 	char *name;		/* and its name */
 	int jfd;		/* journal's lock file descriptor */
-	int flags;		/* journal mode options used in jopen() */
+	int flags;		/* journal flags */
 	pthread_mutex_t lock;	/* a soft lock used in some operations */
 };
 
+/* a single operation */
+struct joper {
+	int locked;		/* is the region is locked? */
+	off_t offset;		/* operation's offset */
+	size_t len;		/* data length */
+	void *buf;		/* data */
+	size_t plen;		/* previous data length */
+	void *pdata;		/* previous data */
+	struct joper *prev;
+	struct joper *next;
+};
+
+/* a transaction */
 struct jtrans {
 	struct jfs *fs;		/* journal file structure to operate on */
 	char *name;		/* name of the transaction file */
 	int id;			/* transaction id */
 	int flags;		/* misc flags */
-	const void *buf;	/* buffer */
-	size_t len;		/* buffer lenght */
-	off_t offset;		/* file offset to operate on */
-	void *udata;		/* user-supplied data */
-	size_t ulen;		/* udata lenght */
-	void *pdata;		/* previous data, for rollback */
-	size_t plen;		/* pdata lenght */
+	unsigned int numops;	/* quantity of operations in the list */
+	pthread_mutex_t lock;	/* used to modify the operation list */
+	struct joper *op;	/* list of operations */
 };
 
 struct jfsck_result {
 	int total;		/* total transactions files we looked at */
 	int invalid;		/* invalid files in the journal directory */
 	int in_progress;	/* transactions in progress */
-	int broken_head;	/* transactions broken (header missing) */
-	int broken_body;	/* transactions broken (body missing) */
-	int load_error;		/* errors loading the transaction */
+	int broken;		/* transactions broken */
 	int apply_error;	/* errors applying the transaction */
 	int rollbacked;		/* transactions that were rollbacked */
 };
 
-/* on-disk structure */
-struct disk_trans {
-	
-	/* header (fixed lenght, defined below) */
+
+/* on-disk structures */
+
+/* header (fixed length, defined below) */
+struct disk_header {
 	uint32_t id;		/* id */
 	uint32_t flags;		/* flags about this transaction */
-	uint32_t len;		/* data lenght */
-	uint32_t plen;		/* previous data lenght */
-	uint32_t ulen;		/* user-supplied information lenght */
+	uint32_t numops;	/* number of operations */
+};
+
+/* operation */
+struct disk_operation {
+	uint32_t len;		/* data length */
+	uint32_t plen;		/* previous data length */
 	uint64_t offset;	/* offset relative to the BOF */
-	
-	/* payload (variable lenght) */
-	char *udata;		/* user-supplied data */
 	char *prevdata;		/* previous data for rollback */
 };
 
 
-/* core operations */
+/* core functions */
 int jopen(struct jfs *fs, const char *name, int flags, int mode, int jflags);
 void jtrans_init(struct jfs *fs, struct jtrans *ts);
+int jtrans_add(struct jtrans *ts, const void *buf, size_t count, off_t offset);
 int jtrans_commit(struct jtrans *ts);
 int jtrans_rollback(struct jtrans *ts);
 void jtrans_free(struct jtrans *ts);
 int jclose(struct jfs *fs);
 
+
 /* journal checker */
 int jfsck(const char *name, struct jfsck_result *res);
 int jfsck_cleanup(const char *name);
@@ -88,7 +101,7 @@ ssize_t jreadv(struct jfs *fs, struct iovec *vector, int count);
 ssize_t jwrite(struct jfs *fs, const void *buf, size_t count);
 ssize_t jpwrite(struct jfs *fs, const void *buf, size_t count, off_t offset);
 ssize_t jwritev(struct jfs *fs, const struct iovec *vector, int count);
-int jtruncate(struct jfs *fs, off_t lenght);
+int jtruncate(struct jfs *fs, off_t length);
 
 /* ANSI C stdio wrappers */
 struct jfs *jfopen(const char *path, const char *mode);
@@ -113,8 +126,9 @@ FILE *jfsopen(struct jfs *stream, const char *mode);
 #define J_COMMITED	1	/* mark a transaction as commited */
 #define J_ROLLBACKED	2	/* mark a transaction as rollbacked */
 
-/* disk_trans constants */
-#define J_DISKTFIXSIZE	 28	/* lenght of disk_trans' header */ 
+/* disk constants */
+#define J_DISKHEADSIZE	 12	/* length of disk_header */
+#define J_DISKOPHEADSIZE 16	/* length of disk_operation header */
 
 /* jfsck constants (return values) */
 #define J_ESUCCESS	0	/* success - shouldn't be used */
diff --git a/samples/jio3.c b/samples/jio3.c
index f239020..b5a9f5d 100644
--- a/samples/jio3.c
+++ b/samples/jio3.c
@@ -16,28 +16,34 @@ int main(int argc, char **argv)
 	struct jfs fs;
 	struct jtrans ts;
 
-	fd = jopen(&fs, "test1", O_RDWR | O_CREAT | O_TRUNC | O_SYNC, 0660, 0);
+	fd = jopen(&fs, "test1", O_RDWR | O_CREAT | O_SYNC, 0660, 0);
 	if (fd < 0)
 		perror("OPEN");
 
-#define str "ROLLBACKTEST!\n"
-
 	jtrans_init(&fs, &ts);
 
-	ts.offset = 0;
-	ts.buf = str;
-	ts.len = strlen(str);
-		
+#define str1 "1ROLLBACKTEST1!\n"
+	jtrans_add(&ts, str1, strlen(str1), 0);
+
+#define str2 "2ROLLBACKTEST2!\n"
+	jtrans_add(&ts, str2, strlen(str2), strlen(str1));
+
+#define str3 "3ROLLBACKTEST3!\n"
+	jtrans_add(&ts, str3, strlen(str3), strlen(str1) + strlen(str2));
+
+
 	rv = jtrans_commit(&ts);
-	if (rv != strlen(str))
+	if (rv != strlen(str1) + strlen(str2) + strlen(str3))
 		perror("COMMIT");
+	printf("COMMIT OK: %d\n", rv);
+
 
 	rv = jtrans_rollback(&ts);
-	if (rv != 0)
+	if (rv < 0)
 		perror("ROLLBACK");
+	printf("ROLLBACK OK: %d\n", rv);
 
 	return 0;
 
 }
 
-
diff --git a/trans.c b/trans.c
index 870c01a..1867a58 100644
--- a/trans.c
+++ b/trans.c
@@ -22,53 +22,9 @@
 #include "common.h"
 
 
-/* build the journal directory name out of the filename */
-static int get_jdir(const char *filename, char *jdir)
-{
-	char *base, *baset;
-	char *dir, *dirt;
-
-	baset = strdup(filename);
-	if (baset == NULL)
-		return 0;
-	base = basename(baset);
-
-	dirt = strdup(filename);
-	if (dirt == NULL)
-		return 0;
-	dir = dirname(dirt);
-
-	snprintf(jdir, PATH_MAX, "%s/.%s.jio", dir, base);
-
-	free(baset);
-	free(dirt);
-
-	return 1;
-}
-
-/* build the filename of a given transaction */
-static int get_jtfile(const char *filename, int tid, char *jtfile)
-{
-	char *base, *baset;
-	char *dir, *dirt;
-
-	baset = strdup(filename);
-	if (baset == NULL)
-		return 0;
-	base = basename(baset);
-
-	dirt = strdup(filename);
-	if (dirt == NULL)
-		return 0;
-	dir = dirname(dirt);
-
-	snprintf(jtfile, PATH_MAX, "%s/.%s.jio/%d", dir, base, tid);
-
-	free(baset);
-	free(dirt);
-
-	return 1;
-}
+/*
+ * helper functions
+ */
 
 /* gets a new transaction id */
 static unsigned int get_tid(struct jfs *fs)
@@ -158,53 +114,108 @@ void jtrans_init(struct jfs *fs, struct jtrans *ts)
 	ts->fs = fs;
 	ts->name = NULL;
 	ts->id = 0;
-	ts->flags = 0;
-	ts->buf = NULL;
-	ts->len = 0;
-	ts->offset = 0;
-	ts->udata = NULL;
-	ts->ulen = 0;
-	ts->pdata = NULL;
-	ts->plen = 0;
+	ts->flags = fs->flags;
+	ts->op = NULL;
+	ts->numops = 0;
+	pthread_mutex_init( &(ts->lock), NULL);
 }
 
+
 /* free the contents of a transaction structure */
 void jtrans_free(struct jtrans *ts)
 {
-	/* NOTE: we only really free the name and previous data, which are the
-	 * things _we_ allocate; the user data is caller stuff */
+	struct joper *tmpop;
+
 	ts->fs = NULL;
+
 	if (ts->name)
 		free(ts->name);
-	if (ts->pdata)
-		free(ts->pdata);
 
-	/* don't free ts itself, it's very common to allocate it in the stack,
-	 * so let the caller take care of it; and, after all, he was the one
-	 * doing the alloc in the first place */
+	while (ts->op != NULL) {
+		tmpop = ts->op->next;
+
+		if (ts->op->buf)
+			free(ts->op->buf);
+		if (ts->op->pdata)
+			free(ts->op->pdata);
+		free(ts->op);
+
+		ts->op = tmpop;
+	}
+}
+
+
+int jtrans_add(struct jtrans *ts, const void *buf, size_t count, off_t offset)
+{
+	struct joper *jop, *tmpop;
+
+	/* find the last operation in the transaction and create a new one at
+	 * the end */
+	pthread_mutex_lock(&(ts->lock));
+	if (ts->op == NULL) {
+		ts->op = malloc(sizeof(struct joper));
+		jop = ts->op;
+		jop->prev = NULL;
+	} else {
+		for (tmpop = ts->op; tmpop->next != NULL; tmpop = tmpop->next)
+			;
+		tmpop->next = malloc(sizeof(struct joper));
+		tmpop->next->prev = tmpop;
+		jop = tmpop->next;
+	}
+	pthread_mutex_unlock(&(ts->lock));
+
+	if (jop == NULL) {
+		/* malloc() failed */
+		return 0;
+	}
+
+	jop->buf = malloc(count);
+	if (jop->buf == NULL) {
+		free(jop);
+		return 0;
+	}
+
+	/* we copy the buffer because then the caller can reuse it */
+	memcpy(jop->buf, buf, count);
+	jop->len = count;
+	jop->offset = offset;
+	jop->next = NULL;
+	jop->plen = 0;
+	jop->pdata = NULL;
+	jop->locked = 0;
+
+	ts->numops++;
+
+	return 1;
 }
 
 /* commit a transaction */
 int jtrans_commit(struct jtrans *ts)
 {
-	int id, fd, rv, t;
+	int id, rv, fd = -1;
 	char *name;
 	unsigned char *buf_init, *bufp;
+	struct joper *op;
+	off_t curpos = 0;
+	size_t written = 0;
+
+	pthread_mutex_lock(&(ts->lock));
 
 	name = (char *) malloc(PATH_MAX);
 	if (name == NULL)
-		return -1;
+		goto exit;
 
 	id = get_tid(ts->fs);
 	if (id == 0)
-		return -1;
+		goto exit;
 
 	/* open the transaction file */
 	if (!get_jtfile(ts->fs->name, id, name))
-		return -1;
+		goto exit;
 	fd = open(name, O_RDWR | O_CREAT | O_TRUNC | O_LARGEFILE, 0600);
 	if (fd < 0)
-		return -1;
+		goto exit;
 
 	/* and lock it */
 	plockf(fd, F_LOCK, 0, 0);
@@ -212,73 +223,102 @@ int jtrans_commit(struct jtrans *ts)
 	ts->id = id;
 	ts->name = name;
 
-	/* lock the file region to work on */
-	if (!(ts->fs->flags & J_NOLOCK))
-		plockf(ts->fs->fd, F_LOCK, ts->offset, ts->len);
-
-	/* read the current content and fill in the transaction structure */
-	ts->pdata = malloc(ts->len);
-	if (ts->pdata == NULL)
-		goto exit;
-
-	ts->plen = ts->len;
-
-	rv = spread(ts->fs->fd, ts->pdata, ts->len, ts->offset);
-	if (rv < 0)
-		goto exit;
-	if (rv < ts->len) {
-		/* we are extending the file! use ftruncate() to do it */
-		ftruncate(ts->fs->fd, ts->offset + ts->len);
-		ts->plen = rv;
-	}
-
-	/* now save the transaction to the file, static data first */
-
-	buf_init = malloc(J_DISKTFIXSIZE);
+	/* save the header */
+	buf_init = malloc(J_DISKHEADSIZE);
 	if (buf_init == NULL)
-		return -1;
+		goto exit;
 
 	bufp = buf_init;
 
-	/* the sizes are put explicitly (instead of using sizeof()) because
-	 * they're really fixed and defined in the on-disk format */
 	memcpy(bufp, (void *) &(ts->id), 4);
 	bufp += 4;
 
 	memcpy(bufp, (void *) &(ts->flags), 4);
 	bufp += 4;
 
-	memcpy(bufp, (void *) &(ts->len), 4);
+	memcpy(bufp, (void *) &(ts->numops), 4);
 	bufp += 4;
 
-	memcpy(bufp, (void *) &(ts->plen), 4);
-	bufp += 4;
+	rv = spwrite(fd, buf_init, J_DISKHEADSIZE, 0);
+	if (rv != J_DISKHEADSIZE) {
+		free(buf_init);
+		goto exit;
+	}
 
-	memcpy(bufp, (void *) &(ts->ulen), 4);
-	bufp += 4;
+	free(buf_init);
 
-	memcpy(bufp, (void *) &(ts->offset), 8);
-	bufp += 8;
+	curpos = J_DISKHEADSIZE;
+
+	/* first of all lock all the regions we're going to work with;
+	 * otherwise there could be another transaction trying to write the
+	 * same spots and we could end up with interleaved writes, that could
+	 * break atomicity warantees if we need to rollback */
+	if (!(ts->flags & J_NOLOCK)) {
+		for (op = ts->op; op != NULL; op = op->next) {
+			rv = plockf(ts->fs->fd, F_LOCK, op->offset, op->len);
+			if (rv == -1)
+				/* note it can fail with EDEADLK */
+				goto exit;
+			op->locked = 1;
+		}
+	}
 
-	rv = spwrite(fd, buf_init, J_DISKTFIXSIZE, 0);
-	if (rv != J_DISKTFIXSIZE)
-		goto exit;
+	/* save each transacion in the file */
+	for (op = ts->op; op != NULL; op = op->next) {
+		/* read the current content only if it's not there yet, which
+		 * is the normal case, but for rollbacking we fill it
+		 * ourselves */
+		if (op->pdata == NULL) {
+			op->pdata = malloc(op->len);
+			if (op->pdata == NULL)
+				goto exit;
+
+			op->plen = op->len;
+
+			rv = spread(ts->fs->fd, op->pdata, op->len,
+					op->offset);
+			if (rv < 0)
+				goto exit;
+			if (rv < op->len) {
+				/* we are extending the file! */
+				/* ftruncate(ts->fs->fd, op->offset + op->len); */
+				op->plen = rv;
+			}
+		}
 
-	free(buf_init);
+		/* save the operation's header */
+		buf_init = malloc(J_DISKOPHEADSIZE);
+		if (buf_init == NULL)
+			goto exit;
 
+		bufp = buf_init;
 
-	/* and now the variable data */
+		memcpy(bufp, (void *) &(op->len), 4);
+		bufp += 4;
 
-	if (ts->udata) {
-		rv = spwrite(fd, ts->udata, ts->ulen, J_DISKTFIXSIZE);
-		if (rv != ts->ulen)
+		memcpy(bufp, (void *) &(op->plen), 4);
+		bufp += 4;
+
+		memcpy(bufp, (void *) &(op->offset), 8);
+		bufp += 8;
+
+		rv = spwrite(fd, buf_init, J_DISKOPHEADSIZE, curpos);
+		if (rv != J_DISKOPHEADSIZE) {
+			free(buf_init);
 			goto exit;
-	}
+		}
 
-	t = J_DISKTFIXSIZE + ts->ulen;
-	rv = spwrite(fd, ts->pdata, ts->plen, t);
-	if (rv != ts->plen)
-		goto exit;
+		free(buf_init);
+
+		curpos += J_DISKOPHEADSIZE;
+
+		/* and save it to the disk */
+		rv = spwrite(fd, op->pdata, op->plen, curpos);
+		if (rv != op->plen)
+			goto exit;
+
+		curpos += op->plen;
+	}
 
 	/* this is a simple but efficient optimization: instead of doing
 	 * everything O_SYNC, we sync at this point only, this way we avoid
@@ -288,9 +328,18 @@ int jtrans_commit(struct jtrans *ts)
 	fsync(fd);
 
 	/* now that we have a safe transaction file, let's apply it */
-	rv = spwrite(ts->fs->fd, ts->buf, ts->len, ts->offset);
-	if (rv != ts->len)
-		goto exit;
+	written = 0;
+	for (op = ts->op; op != NULL; op = op->next) {
+		rv = spwrite(ts->fs->fd, op->buf, op->len, op->offset);
+
+		plockf(ts->fs->fd, F_ULOCK, op->offset, op->len);
+		op->locked = 0;
+
+		if (rv != op->len)
+			goto exit;
+
+		written += rv;
+	}
 
 	/* the transaction has been applied, so we cleanup and remove it from
 	 * the disk */
@@ -303,13 +352,16 @@ int jtrans_commit(struct jtrans *ts)
 
 exit:
 	close(fd);
+	for (op = ts->op; op != NULL; op = op->next) {
+		if (op->locked)
+			plockf(ts->fs->fd, F_ULOCK, op->offset, op->len);
+	}
 
-	if (!(ts->fs->flags & J_NOLOCK))
-		plockf(ts->fs->fd, F_ULOCK, ts->offset, ts->len);
+	pthread_mutex_unlock(&(ts->lock));
 
-	/* return the lenght only if it was properly commited */
+	/* return the length only if it was properly commited */
 	if (ts->flags & J_COMMITED)
-		return ts->len;
+		return written;
 	else
 		return -1;
 
@@ -318,41 +370,61 @@ exit:
 /* rollback a transaction */
 int jtrans_rollback(struct jtrans *ts)
 {
-	int rv;
 	struct jtrans newts;
+	struct joper *op, *curop, *lop;
 
-	/* copy the old transaction to the new one */
-	jtrans_init(ts->fs, &newts);
+	/* FIXME: this looks like a mess! */
+
+	if (ts->op == NULL) {
+		/* we're trying to rollback an empty transaction */
+		return 0;
+	}
 
+	jtrans_init(ts->fs, &newts);
 	newts.flags = ts->flags;
-	newts.offset = ts->offset;
 
-	newts.buf = ts->pdata;
-	newts.len = ts->plen;
+	/* find the last operation */
+	for (op = ts->op; op->next != NULL; op = op->next)
+		;
 
-	if (ts->plen < ts->len) {
-		/* we extended the data in the previous transaction, so we
+	/* and traverse the list backwards */
+	for ( ; op != NULL; op = op->prev) {
+		/* if we extended the data in the previous transaction, we
 		 * should truncate it back */
 		/* DANGEROUS: this is one of the main reasons why rollbacking
 		 * is dangerous and should only be done with extreme caution:
 		 * if for some reason, after the previous transacton, we have
 		 * extended the file further, this will cut it back to what it
 		 * was; read the docs for more detail */
-		ftruncate(ts->fs->fd, ts->offset + ts->plen);
-
+		if (op->plen < op->len)
+			ftruncate(ts->fs->fd, op->offset + op->plen);
+
+		/* manually add the operation to the new transaction */
+		curop = malloc(sizeof(struct joper));
+		curop->offset = op->offset;
+		curop->len = op->plen;
+		curop->buf = op->pdata;
+		curop->plen = op->plen;
+		curop->pdata = op->pdata;
+		curop->locked = 0;
+
+		/* add the new transaction to the list */
+		if (newts.op == NULL) {
+			newts.op = curop;
+			curop->prev = NULL;
+			curop->next = NULL;
+		} else {
+			for (lop = newts.op; lop->next != NULL; lop = lop->next)
+				;
+			lop->next = curop;
+			curop->prev = lop;
+			curop->next = NULL;
+		}
 	}
 
-	newts.pdata = ts->pdata;
-	newts.plen = ts->plen;
-
-	newts.udata = ts->udata;
-	newts.ulen = ts->ulen;
-
-	rv = jtrans_commit(&newts);
-	return rv;
+	return jtrans_commit(&newts);
 }
 
-
 /*
  * basic operations
  */
@@ -392,7 +464,6 @@ int jopen(struct jfs *fs, const char *name, int flags, int mode, int jflags)
 	 * make it easier for them by taking care of it here. If performance
 	 * is essential, the jpread/jpwrite functions should be used, just as
 	 * real life. */
-
 	pthread_mutex_init( &(fs->lock), NULL);
 
 	if (!get_jdir(name, jdir))
@@ -440,243 +511,3 @@ int jclose(struct jfs *fs)
 	return 0;
 }
 
-
-/*
- * journal recovery
- */
-
-/* check the journal and replay the incomplete transactions */
-int jfsck(const char *name, struct jfsck_result *res)
-{
-	int fd, tfd, rv, i, maxtid;
-	char jdir[PATH_MAX], jlockfile[PATH_MAX], tname[PATH_MAX];
-	unsigned char *buf = NULL;
-	struct stat sinfo;
-	struct jfs fs;
-	struct jtrans *curts;
-	DIR *dir;
-	struct dirent *dent;
-	off_t offset;
-
-	fd = open(name, O_RDWR | O_SYNC | O_LARGEFILE);
-	if (fd < 0)
-		return J_ENOENT;
-
-	fs.fd = fd;
-	fs.name = (char *) name;
-
-	if (!get_jdir(name, jdir))
-		return J_ENOMEM;
-	rv = lstat(jdir, &sinfo);
-	if (rv < 0 || !S_ISDIR(sinfo.st_mode))
-		return J_ENOJOURNAL;
-
-	/* open the lock file, which is only used to complete the jfs
-	 * structure */
-	snprintf(jlockfile, PATH_MAX, "%s/%s", jdir, "lock");
-	rv = open(jlockfile, O_RDWR | O_CREAT, 0600);
-	if (rv < 0)
-		return J_ENOJOURNAL;
-	fs.jfd = rv;
-
-	dir = opendir(jdir);
-	if (dir == NULL)
-		return J_ENOJOURNAL;
-
-	/* loop for each file in the journal directory to find out the greater
-	 * transaction number */
-	maxtid = 0;
-	for (dent = readdir(dir); dent != NULL; dent = readdir(dir)) {
-		/* see if the file is named like a transaction, ignore
-		 * otherwise; as transactions are named as numbers > 0, a
-		 * simple atoi() is enough testing */
-		rv = atoi(dent->d_name);
-		if (rv <= 0)
-			continue;
-		if (rv > maxtid)
-			maxtid = rv;
-	}
-	closedir(dir);
-
-	/* rewrite the lockfile, writing the new maxtid on it, so that when we
-	 * rollback a transaction it doesn't step over existing ones */
-	rv = spwrite(fs.jfd, &maxtid, sizeof(maxtid), 0);
-	if (rv != sizeof(maxtid)) {
-		return J_ENOMEM;
-	}
-
-	/* we loop all the way up to the max transaction id */
-	for (i = 1; i <= maxtid; i++) {
-		curts = malloc(sizeof(struct jtrans));
-		if (curts == NULL)
-			return J_ENOMEM;
-
-		jtrans_init(&fs, curts);
-		curts->id = i;
-
-		/* open the transaction file, using i as its name, so we are
-		 * really looping in order (recovering transaction in a
-		 * different order as they were applied means instant
-		 * corruption) */
-		if (!get_jtfile(name, i, tname))
-			return J_ENOMEM;
-		tfd = open(tname, O_RDWR | O_SYNC | O_LARGEFILE, 0600);
-		if (tfd < 0) {
-			res->invalid++;
-			goto loop;
-		}
-
-		/* try to lock the transaction file, if it's locked then it is
-		 * currently being used so we skip it */
-		rv = plockf(tfd, F_TLOCK, 0, 0);
-		if (rv == -1) {
-			res->in_progress++;
-			goto loop;
-		}
-
-		/* load from disk, header first */
-		buf = (unsigned char *) malloc(J_DISKTFIXSIZE);
-		if (buf == NULL) {
-			res->load_error++;
-			goto loop;
-		}
-
-		rv = read(tfd, buf, J_DISKTFIXSIZE);
-		if (rv != J_DISKTFIXSIZE) {
-			res->broken_head++;
-			free(buf);
-			goto loop;
-		}
-
-		curts->flags = *( (uint32_t *) (buf + 4));
-		curts->len = *( (uint32_t *) (buf + 8));
-		curts->plen = *( (uint32_t *) (buf + 12));
-		curts->ulen = *( (uint32_t *) (buf + 16));
-		curts->offset = *( (uint64_t *) (buf + 20));
-
-		free(buf);
-
-		/* if we got here, the transaction was not applied, so we
-		 * check if the transaction file is complete (we only need to
-		 * rollback it) or not (so we can't do anything but ignore it)
-		 */
-
-		lstat(tname, &sinfo);
-		rv = J_DISKTFIXSIZE + curts->ulen + curts->plen;
-		if (sinfo.st_size != rv) {
-			/* the transaction file is incomplete, some of the
-			 * body is missing */
-			res->broken_body++;
-			goto loop;
-		}
-
-		/* we have a complete transaction file which commit was not
-		 * successful, so we read it to complete the transaction
-		 * structure and rollback it */
-		curts->pdata = malloc(curts->plen);
-		if (curts->pdata == NULL) {
-			res->load_error++;
-			goto loop;
-		}
-
-		curts->udata = malloc(curts->ulen);
-		if (curts->udata == NULL) {
-			res->load_error++;
-			goto loop;
-		}
-
-		/* user data */
-		offset = J_DISKTFIXSIZE;
-		rv = spread(tfd, curts->udata, curts->ulen, offset);
-		if (rv != curts->ulen) {
-			res->load_error++;
-			goto loop;
-		}
-
-		/* previous data */
-		offset = J_DISKTFIXSIZE + curts->ulen;
-		rv = spread(tfd, curts->pdata, curts->plen, offset);
-		if (rv != curts->plen) {
-			res->load_error++;
-			goto loop;
-		}
-
-		/* rollback */
-		rv = jtrans_rollback(curts);
-		if (rv < 0) {
-			res->apply_error++;
-			goto loop;
-		}
-		res->rollbacked++;
-
-		/* free the data we just allocated */
-		if (curts->plen) {
-			free(curts->pdata);
-			curts->pdata = NULL;
-		}
-		if (curts->ulen) {
-			free(curts->udata);
-			curts->udata = NULL;
-		}
-		if (curts->name) {
-			free(curts->name);
-			curts->name = NULL;
-		}
-
-loop:
-		if (tfd > 0)
-			close(tfd);
-
-		free(curts);
-
-		res->total++;
-	}
-
-	close(fs.fd);
-	close(fs.jfd);
-
-	return 0;
-
-}
-
-/* remove all the files in the journal directory (if any) */
-int jfsck_cleanup(const char *name)
-{
-	char jdir[PATH_MAX], tfile[PATH_MAX*3];
-	DIR *dir;
-	struct dirent *dent;
-
-	if (!get_jdir(name, jdir))
-		return 0;
-
-	dir = opendir(jdir);
-	if (dir == NULL && errno == ENOENT)
-		/* it doesn't exist, so it's clean */
-		return 1;
-	else if (dir == NULL)
-		return 0;
-
-	for (dent = readdir(dir); dent != NULL; dent = readdir(dir)) {
-		/* we only care about transactions (named as numbers > 0) and
-		 * the lockfile (named "lock"); ignore everything else */
-		if (strcmp(dent->d_name, "lock") && atoi(dent->d_name) <= 0)
-			continue;
-
-		/* build the full path to the transaction file */
-		memset(tfile, 0, PATH_MAX * 3);
-		strcat(tfile, jdir);
-		strcat(tfile, "/");
-		strcat(tfile, dent->d_name);
-
-		/* the full filename is too large */
-		if (strlen(tfile) > PATH_MAX)
-			return 0;
-
-		/* and remove it */
-		unlink(tfile);
-	}
-	closedir(dir);
-
-	return 1;
-}
-
diff --git a/unix.c b/unix.c
index 2d82655..bce8647 100644
--- a/unix.c
+++ b/unix.c
@@ -88,10 +88,7 @@ ssize_t jwrite(struct jfs *fs, const void *buf, size_t count)
 
 	jtrans_init(fs, &ts);
 	pos = lseek(fs->fd, 0, SEEK_CUR);
-	ts.offset = pos;
-
-	ts.buf = buf;
-	ts.len = count;
+	jtrans_add(&ts, buf, count, pos);
 
 	rv = jtrans_commit(&ts);
 
@@ -114,10 +111,7 @@ ssize_t jpwrite(struct jfs *fs, const void *buf, size_t count, off_t offset)
 	struct jtrans ts;
 
 	jtrans_init(fs, &ts);
-	ts.offset = offset;
-
-	ts.buf = buf;
-	ts.len = count;
+	jtrans_add(&ts, buf, count, offset);
 
 	rv = jtrans_commit(&ts);
 
@@ -129,43 +123,29 @@ ssize_t jpwrite(struct jfs *fs, const void *buf, size_t count, off_t offset)
 /* writev wrapper */
 ssize_t jwritev(struct jfs *fs, const struct iovec *vector, int count)
 {
-	int rv, i, bufp;
-	ssize_t sum;
-	char *buf;
-	off_t pos;
+	int rv, i;
+	size_t sum;
+	off_t ipos, t;
 	struct jtrans ts;
 
-	sum = 0;
-	for (i = 0; i < count; i++)
-		sum += vector[i].iov_len;
-
-	/* unify the buffers into one big chunk to commit */
-	/* FIXME: can't we do this more efficient? It ruins the whole purpose
-	 * of using writev()! maybe we should do one transaction per vector */
-	buf = malloc(sum);
-	if (buf == NULL)
-		return -1;
-	bufp = 0;
-
-	for (i = 0; i < count; i++) {
-		memcpy(buf + bufp, vector[i].iov_base, vector[i].iov_len);
-		bufp += vector[i].iov_len;
-	}
-
 	pthread_mutex_lock(&(fs->lock));
 
 	jtrans_init(fs, &ts);
-	pos = lseek(fs->fd, 0, SEEK_CUR);
-	ts.offset = pos;
+	ipos = lseek(fs->fd, 0, SEEK_CUR);
+	t = ipos;
 
-	ts.buf = buf;
-	ts.len = sum;
+	sum = 0;
+	for (i = 0; i < count; i++) {
+		jtrans_add(&ts, vector[i].iov_base, vector[i].iov_len, t);
+		sum += vector[i].iov_len;
+		t += vector[i].iov_len;
+	}
 
 	rv = jtrans_commit(&ts);
 
 	if (rv >= 0) {
 		/* if success, advance the file pointer */
-		lseek(fs->fd, count, SEEK_CUR);
+		lseek(fs->fd, sum, SEEK_CUR);
 	}
 
 	pthread_mutex_unlock(&(fs->lock));
@@ -177,14 +157,14 @@ ssize_t jwritev(struct jfs *fs, const struct iovec *vector, int count)
 }
 
 /* truncate a file - be careful with this */
-int jtruncate(struct jfs *fs, off_t lenght)
+int jtruncate(struct jfs *fs, off_t length)
 {
 	int rv;
 
-	/* lock from lenght to the end of file */
-	plockf(fs->fd, F_LOCK, lenght, 0);
-	rv = ftruncate(fs->fd, lenght);
-	plockf(fs->fd, F_ULOCK, lenght, 0);
+	/* lock from length to the end of file */
+	plockf(fs->fd, F_LOCK, length, 0);
+	rv = ftruncate(fs->fd, length);
+	plockf(fs->fd, F_ULOCK, length, 0);
 
 	return rv;
 }