author | Alberto Bertogli
<albertito@blitiri.com.ar> 2009-04-06 03:17:19 UTC |
committer | Alberto Bertogli
<albertito@blitiri.com.ar> 2009-04-12 13:50:36 UTC |
parent | 0f91cd19551e7b7de45b80f431430f7d34d83339 |
libjio/Makefile | +1 | -1 |
libjio/check.c | +0 | -2 |
libjio/journal.c | +318 | -0 |
libjio/journal.h | +23 | -0 |
libjio/libjio.h | +2 | -3 |
libjio/trans.c | +18 | -257 |
diff --git a/libjio/Makefile b/libjio/Makefile index c3ab13e..192a18e 100644 --- a/libjio/Makefile +++ b/libjio/Makefile @@ -39,7 +39,7 @@ endif # objects to build -OBJS = checksum.o common.o trans.o check.o unix.o ansi.o +OBJS = checksum.o common.o trans.o check.o journal.o unix.o ansi.o # rules default: all diff --git a/libjio/check.c b/libjio/check.c index 2fabde5..4b84dd5 100644 --- a/libjio/check.c +++ b/libjio/check.c @@ -293,8 +293,6 @@ loop: if (map != NULL) munmap(map, filelen); - if (curts->name) - free(curts->name); while (curts->op != NULL) { tmpop = curts->op->next; if (curts->op->pdata) diff --git a/libjio/journal.c b/libjio/journal.c new file mode 100644 index 0000000..d9edf65 --- /dev/null +++ b/libjio/journal.c @@ -0,0 +1,318 @@ + +/* + * Internal journal + */ + +#include <sys/types.h> /* [s]size_t */ +#include <sys/stat.h> /* open() */ +#include <fcntl.h> /* open() */ +#include <unistd.h> /* f[data]sync(), close() */ +#include <stdlib.h> /* malloc() and friends */ +#include <limits.h> /* MAX_PATH */ +#include <string.h> /* memcpy() */ +#include <libgen.h> /* basename(), dirname() */ +#include <stdio.h> /* fprintf() */ +#include <dirent.h> /* readdir() and friends */ +#include <errno.h> /* errno */ +#include <sys/mman.h> /* mmap() */ + +#include "libjio.h" +#include "common.h" +#include "compat.h" +#include "journal.h" + + +/* + * helper functions + */ + +/* gets a new transaction id */ +static unsigned int get_tid(struct jfs *fs) +{ + unsigned int curid, rv; + + /* lock the whole file */ + plockf(fs->jfd, F_LOCKW, 0, 0); + + /* read the current max. 
curid */ + curid = *(fs->jmap); + + fiu_do_on("jio/get_tid/overflow", curid = -1); + + /* increment it and handle overflows */ + rv = curid + 1; + if (rv == 0) + goto exit; + + /* write to the file descriptor */ + *(fs->jmap) = rv; + +exit: + plockf(fs->jfd, F_UNLOCK, 0, 0); + return rv; +} + +/* frees a transaction id */ +static void free_tid(struct jfs *fs, unsigned int tid) +{ + unsigned int curid, i; + char name[PATH_MAX]; + + /* lock the whole file */ + plockf(fs->jfd, F_LOCKW, 0, 0); + + /* read the current max. curid */ + curid = *(fs->jmap); + + /* if we're the max tid, scan the directory looking up for the new + * max; the detailed description can be found in the "doc/" dir */ + if (tid == curid) { + /* look up the new max. */ + for (i = curid - 1; i > 0; i--) { + get_jtfile(fs, i, name); + if (access(name, R_OK | W_OK) == 0) { + break; + } else if (errno != EACCES) { + /* Real error, stop looking for a new max. It + * doesn't hurt us because it's ok if the max + * is higher than it could be */ + break; + } + } + + /* and save it */ + *(fs->jmap) = i; + } + + plockf(fs->jfd, F_UNLOCK, 0, 0); + return; +} + +/* fsync()s a directory */ +static int already_warned_about_sync = 0; +static int fsync_dir(int fd) +{ + int rv; + + rv = fsync(fd); + + if (rv != 0 && (errno == EINVAL || errno == EBADF)) { + /* it seems to be legal that fsync() on directories is not + * implemented, so if this fails with EINVAL or EBADF, just + * call a global sync(); which is awful (and might still + * return before metadata is done) but it seems to be the + * saner choice; otherwise we just fail */ + sync(); + rv = 0; + + if (!already_warned_about_sync) { + fprintf(stderr, "libjio warning: falling back on " \ + "sync() for directory syncing\n"); + already_warned_about_sync = 1; + } + } + + return rv; +} + + +/* + * Journal functions + */ + +/* Creates a new transaction in the journal, returns a pointer to an opaque + * jop_t (that is freed using journal_free), or NULL if there was 
an error. + * The transaction cannot be modified until journal_free() is called. */ +struct journal_op *journal_new(struct jtrans *ts) +{ + int fd, id; + ssize_t rv; + char *name = NULL; + unsigned char buf_init[J_DISKHEADSIZE]; + unsigned char *bufp; + struct journal_op *jop = NULL; + + jop = malloc(sizeof(struct journal_op)); + if (jop == NULL) + goto error; + + name = (char *) malloc(PATH_MAX); + if (name == NULL) + goto error; + + id = get_tid(ts->fs); + if (id == 0) + goto error; + + /* open the transaction file */ + get_jtfile(ts->fs, id, name); + fd = open(name, O_RDWR | O_CREAT | O_TRUNC, 0600); + if (fd < 0) + goto error; + + jop->id = id; + jop->fd = fd; + jop->name = name; + jop->curpos = 0; + jop->ts = ts; + + fiu_exit_on("jio/commit/created_tf"); + + /* and lock it, just in case */ + plockf(fd, F_LOCKW, 0, 0); + + ts->id = id; + + /* save the header */ + bufp = buf_init; + + memcpy(bufp, (void *) &(ts->id), 4); + bufp += 4; + + memcpy(bufp, (void *) &(ts->flags), 4); + bufp += 4; + + memcpy(bufp, (void *) &(ts->numops), 4); + bufp += 4; + + rv = spwrite(fd, buf_init, J_DISKHEADSIZE, 0); + if (rv != J_DISKHEADSIZE) { + free(buf_init); + goto unlink_error; + } + + jop->curpos = J_DISKHEADSIZE; + + fiu_exit_on("jio/commit/tf_header"); + + return jop; + +unlink_error: + unlink(name); + free_tid(ts->fs, ts->id); + close(fd); + +error: + if (name) + free(name); + if (jop) + free(jop); + + return NULL; +} + +/* Saves the transaction in the journal */ +int journal_save(struct journal_op *jop) +{ + ssize_t rv; + uint32_t csum; + struct joper *op; + unsigned char hdr[J_DISKOPHEADSIZE]; + unsigned char *hdrp; + const struct jtrans *ts = jop->ts; + + /* save each transacion in the file */ + for (op = ts->op; op != NULL; op = op->next) { + /* read the current content only if the transaction is not + * marked as NOROLLBACK, and if the data is not there yet, + * which is the normal case, but for rollbacking we fill it + * ourselves */ + if (!(ts->flags & 
J_NOROLLBACK) && (op->pdata == NULL)) { + op->pdata = malloc(op->len); + if (op->pdata == NULL) + goto error; + + op->plen = op->len; + + rv = spread(ts->fs->fd, op->pdata, op->len, + op->offset); + if (rv < 0) + goto error; + if (rv < op->len) { + /* we are extending the file! */ + /* ftruncate(ts->fs->fd, op->offset + op->len); */ + op->plen = rv; + } + } + + /* save the operation's header */ + hdrp = hdr; + + memcpy(hdrp, (void *) &(op->len), 4); + hdrp += 4; + + memcpy(hdrp, (void *) &(op->plen), 4); + hdrp += 4; + + memcpy(hdrp, (void *) &(op->offset), 8); + hdrp += 8; + + rv = spwrite(jop->fd, hdr, J_DISKOPHEADSIZE, jop->curpos); + if (rv != J_DISKOPHEADSIZE) + goto error; + + fiu_exit_on("jio/commit/tf_ophdr"); + + jop->curpos += J_DISKOPHEADSIZE; + + /* and save it to the disk */ + rv = spwrite(jop->fd, op->buf, op->len, jop->curpos); + if (rv != op->len) + goto error; + + jop->curpos += op->len; + + fiu_exit_on("jio/commit/tf_opdata"); + } + + fiu_exit_on("jio/commit/tf_data"); + + /* compute and save the checksum (curpos is always small, so there's + * no overflow possibility when we convert to size_t) */ + if (!checksum(jop->fd, jop->curpos, &csum)) + goto error; + + rv = spwrite(jop->fd, &csum, sizeof(uint32_t), jop->curpos); + if (rv != sizeof(uint32_t)) + goto error; + jop->curpos += sizeof(uint32_t); + + /* this is a simple but efficient optimization: instead of doing + * everything O_SYNC, we sync at this point only, this way we avoid + * doing a lot of very small writes; in case of a crash the + * transaction file is only useful if it's complete (ie. after this + * point) so we only flush here (both data and metadata) */ + if (fsync(jop->fd) != 0) + goto error; + if (fsync_dir(ts->fs->jdirfd) != 0) + goto error; + + fiu_exit_on("jio/commit/tf_sync"); + + return 0; + +error: + return -1; +} + +/* Frees a journal operation. + * NOTE: It can't assume the save completed successfully, so we can call it + * when journal_save() fails. 
*/ +int journal_free(struct journal_op *jop) +{ + unlink(jop->name); + + fiu_exit_on("jio/commit/pre_ok_free_tid"); + free_tid(jop->ts->fs, jop->ts->id); + + close(jop->fd); + + if (jop->name) + free(jop->name); + + free(jop); + + return 0; +} + + diff --git a/libjio/journal.h b/libjio/journal.h new file mode 100644 index 0000000..fa1129c --- /dev/null +++ b/libjio/journal.h @@ -0,0 +1,23 @@ + +#ifndef _JOURNAL_H +#define _JOURNAL_H + +#include "libjio.h" + + +struct journal_op { + int id; + int fd; + char *name; + off_t curpos; + struct jtrans *ts; +}; + +typedef struct journal_op jop_t; + +struct journal_op *journal_new(struct jtrans *ts); +int journal_save(struct journal_op *jop); +int journal_free(struct journal_op *jop); + +#endif + diff --git a/libjio/libjio.h b/libjio/libjio.h index b092a48..4c916ca 100644 --- a/libjio/libjio.h +++ b/libjio/libjio.h @@ -55,7 +55,6 @@ struct joper { /* a transaction */ struct jtrans { struct jfs *fs; /* journal file structure to operate on */ - char *name; /* name of the transaction file */ int id; /* transaction id */ uint32_t flags; /* transaction flags */ unsigned int numops; /* quantity of operations in the list */ @@ -65,9 +64,9 @@ struct jtrans { }; /* lingered transaction */ +struct journal_op; struct jlinger { - int id; /* transaction id */ - char *name; /* name of the transaction file */ + struct journal_op *jop; struct jlinger *next; }; diff --git a/libjio/trans.c b/libjio/trans.c index f1f5a0c..795b5d8 100644 --- a/libjio/trans.c +++ b/libjio/trans.c @@ -22,100 +22,7 @@ #include "libjio.h" #include "common.h" #include "compat.h" - - -/* - * helper functions - */ - -/* gets a new transaction id */ -static unsigned int get_tid(struct jfs *fs) -{ - unsigned int curid, rv; - - /* lock the whole file */ - plockf(fs->jfd, F_LOCKW, 0, 0); - - /* read the current max. 
curid */ - curid = *(fs->jmap); - - fiu_do_on("jio/get_tid/overflow", curid = -1); - - /* increment it and handle overflows */ - rv = curid + 1; - if (rv == 0) - goto exit; - - /* write to the file descriptor */ - *(fs->jmap) = rv; - -exit: - plockf(fs->jfd, F_UNLOCK, 0, 0); - return rv; -} - -/* frees a transaction id */ -static void free_tid(struct jfs *fs, unsigned int tid) -{ - unsigned int curid, i; - char name[PATH_MAX]; - - /* lock the whole file */ - plockf(fs->jfd, F_LOCKW, 0, 0); - - /* read the current max. curid */ - curid = *(fs->jmap); - - /* if we're the max tid, scan the directory looking up for the new - * max; the detailed description can be found in the "doc/" dir */ - if (tid == curid) { - /* look up the new max. */ - for (i = curid - 1; i > 0; i--) { - get_jtfile(fs, i, name); - if (access(name, R_OK | W_OK) == 0) { - break; - } else if (errno != EACCES) { - /* Real error, stop looking for a new max. It - * doesn't hurt us because it's ok if the max - * is higher than it could be */ - break; - } - } - - /* and save it */ - *(fs->jmap) = i; - } - - plockf(fs->jfd, F_UNLOCK, 0, 0); - return; -} - -/* fsync()s a directory */ -static int already_warned_about_sync = 0; -static int fsync_dir(int fd) -{ - int rv; - - rv = fsync(fd); - - if (rv != 0 && (errno == EINVAL || errno == EBADF)) { - /* it seems to be legal that fsync() on directories is not - * implemented, so if this fails with EINVAL or EBADF, just - * call a global sync(); which is awful (and might still - * return before metadata is done) but it seems to be the - * saner choice; otherwise we just fail */ - sync(); - rv = 0; - - if (!already_warned_about_sync) { - fprintf(stderr, "libjio warning: falling back on " \ - "sync() for directory syncing\n"); - already_warned_about_sync = 1; - } - } - - return rv; -} +#include "journal.h" /* @@ -128,7 +35,6 @@ void jtrans_init(struct jfs *fs, struct jtrans *ts) pthread_mutexattr_t attr; ts->fs = fs; - ts->name = NULL; ts->id = 0; ts->flags = 
fs->flags; ts->op = NULL; @@ -148,9 +54,6 @@ void jtrans_free(struct jtrans *ts) ts->fs = NULL; - if (ts->name) - free(ts->name); - while (ts->op != NULL) { tmpop = ts->op->next; @@ -243,14 +146,10 @@ int jtrans_add(struct jtrans *ts, const void *buf, size_t count, off_t offset) /* commit a transaction */ ssize_t jtrans_commit(struct jtrans *ts) { - int id, fd = -1; ssize_t rv; - uint32_t csum; - char *name; - unsigned char *buf_init, *bufp; struct joper *op; struct jlinger *linger; - off_t curpos = 0; + jop_t *jop; size_t written = 0; pthread_mutex_lock(&(ts->lock)); @@ -263,56 +162,6 @@ ssize_t jtrans_commit(struct jtrans *ts) if (ts->flags & J_RDONLY) goto exit; - name = (char *) malloc(PATH_MAX); - if (name == NULL) - goto exit; - - id = get_tid(ts->fs); - if (id == 0) - goto exit; - - /* open the transaction file */ - get_jtfile(ts->fs, id, name); - fd = open(name, O_RDWR | O_CREAT | O_TRUNC, 0600); - if (fd < 0) - goto exit; - - fiu_exit_on("jio/commit/created_tf"); - - /* and lock it */ - plockf(fd, F_LOCKW, 0, 0); - - ts->id = id; - ts->name = name; - - /* save the header */ - buf_init = malloc(J_DISKHEADSIZE); - if (buf_init == NULL) - goto unlink_exit; - - bufp = buf_init; - - memcpy(bufp, (void *) &(ts->id), 4); - bufp += 4; - - memcpy(bufp, (void *) &(ts->flags), 4); - bufp += 4; - - memcpy(bufp, (void *) &(ts->numops), 4); - bufp += 4; - - rv = spwrite(fd, buf_init, J_DISKHEADSIZE, 0); - if (rv != J_DISKHEADSIZE) { - free(buf_init); - goto unlink_exit; - } - - fiu_exit_on("jio/commit/tf_header"); - - free(buf_init); - - curpos = J_DISKHEADSIZE; - /* first of all lock all the regions we're going to work with; * otherwise there could be another transaction trying to write the * same spots and we could end up with interleaved writes, that could @@ -323,98 +172,19 @@ ssize_t jtrans_commit(struct jtrans *ts) lr = plockf(ts->fs->fd, F_LOCKW, op->offset, op->len); if (lr == -1) /* note it can fail with EDEADLK */ - goto unlink_exit; + goto unlock_exit; 
op->locked = 1; } } - /* save each transacion in the file */ - for (op = ts->op; op != NULL; op = op->next) { - /* read the current content only if the transaction is not - * marked as NOROLLBACK, and if the data is not there yet, - * which is the normal case, but for rollbacking we fill it - * ourselves */ - if (!(ts->flags & J_NOROLLBACK) && (op->pdata == NULL)) { - op->pdata = malloc(op->len); - if (op->pdata == NULL) - goto unlink_exit; - - op->plen = op->len; - - rv = spread(ts->fs->fd, op->pdata, op->len, - op->offset); - if (rv < 0) - goto unlink_exit; - if (rv < op->len) { - /* we are extending the file! */ - /* ftruncate(ts->fs->fd, op->offset + op->len); */ - op->plen = rv; - } - } - - /* save the operation's header */ - buf_init = malloc(J_DISKOPHEADSIZE); - if (buf_init == NULL) - goto unlink_exit; - - bufp = buf_init; - - memcpy(bufp, (void *) &(op->len), 4); - bufp += 4; - - memcpy(bufp, (void *) &(op->plen), 4); - bufp += 4; - - memcpy(bufp, (void *) &(op->offset), 8); - bufp += 8; - - rv = spwrite(fd, buf_init, J_DISKOPHEADSIZE, curpos); - if (rv != J_DISKOPHEADSIZE) { - free(buf_init); - goto unlink_exit; - } - - fiu_exit_on("jio/commit/tf_ophdr"); - - free(buf_init); - - curpos += J_DISKOPHEADSIZE; - - /* and save it to the disk */ - rv = spwrite(fd, op->buf, op->len, curpos); - if (rv != op->len) - goto unlink_exit; - - curpos += op->len; - - fiu_exit_on("jio/commit/tf_opdata"); - } - - fiu_exit_on("jio/commit/tf_data"); + jop = journal_new(ts); + if (jop == NULL) + goto unlock_exit; - /* compute and save the checksum (curpos is always small, so there's - * no overflow possibility when we convert to size_t) */ - if (!checksum(fd, curpos, &csum)) + rv = journal_save(jop); + if (rv < 0) goto unlink_exit; - rv = spwrite(fd, &csum, sizeof(uint32_t), curpos); - if (rv != sizeof(uint32_t)) - goto unlink_exit; - curpos += sizeof(uint32_t); - - /* this is a simple but efficient optimization: instead of doing - * everything O_SYNC, we sync at this point 
only, this way we avoid - * doing a lot of very small writes; in case of a crash the - * transaction file is only useful if it's complete (ie. after this - * point) so we only flush here (both data and metadata) */ - if (fsync(fd) != 0) - goto unlink_exit; - if (fsync_dir(ts->fs->jdirfd) != 0) { - goto unlink_exit; - } - - fiu_exit_on("jio/commit/tf_sync"); - /* now that we have a safe transaction file, let's apply it */ written = 0; for (op = ts->op; op != NULL; op = op->next) { @@ -433,24 +203,20 @@ ssize_t jtrans_commit(struct jtrans *ts) if (linger == NULL) goto rollback_exit; - linger->id = id; - linger->name = strdup(name); + linger->jop = jop; pthread_mutex_lock(&(ts->fs->ltlock)); linger->next = ts->fs->ltrans; ts->fs->ltrans = linger; pthread_mutex_unlock(&(ts->fs->ltlock)); } else { - if (fdatasync(ts->fs->fd) != 0) + rv = journal_free(jop); + if (rv != 0) goto rollback_exit; - - /* the transaction has been applied, so we cleanup and remove - * it from the disk */ - unlink(name); - fiu_exit_on("jio/commit/pre_ok_free_tid"); - free_tid(ts->fs, ts->id); } + jop = NULL; + /* mark the transaction as committed, _after_ it was removed */ ts->flags = ts->flags | J_COMMITTED; @@ -481,13 +247,10 @@ rollback_exit: } unlink_exit: - if (!(ts->flags & J_COMMITTED)) { - unlink(name); - free_tid(ts->fs, ts->id); - } - - close(fd); + if (jop) + journal_free(jop); +unlock_exit: /* always unlock everything at the end; otherwise we could have * half-overlapping transactions applying simultaneously, and if * anything goes wrong it would be possible to break consistency */ @@ -719,15 +482,13 @@ int jsync(struct jfs *fs) return rv; pthread_mutex_lock(&(fs->ltlock)); + ltmp = fs->ltrans; while (fs->ltrans != NULL) { - free_tid(fs, fs->ltrans->id); fiu_exit_on("jio/jsync/pre_unlink"); - unlink(fs->ltrans->name); - free(fs->ltrans->name); + journal_free(fs->ltrans->jop); ltmp = fs->ltrans->next; free(fs->ltrans); - fs->ltrans = ltmp; }