git » libjio » commit 64757bf

libjio: Mark the journal as broken when we should not continue committing

author Alberto Bertogli
2009-07-29 21:05:13 UTC
committer Alberto Bertogli
2009-07-30 17:11:59 UTC
parent 488fbecb3ce7075e96f1688bd5fb3c4f3b15d6dd

libjio: Mark the journal as broken when we should not continue committing

When we have very serious errors that can cause consistency problems if we
continue to operate on the file, mark the journal as broken by creating a
file named "broken". journal_new() will automatically fail if the file is
present, so no further transactions will be allowed.

Transactions that were being committed when we set the flag are not a
problem because they affect different portions of the file.

This protects from the following scenario:

 - Commit data X to a range A-B:
   - Write the transaction 1 to the journal
   - Write the data to the file
   - Attempt to remove the journal file: fail
 - Commit data Y to a range A-B: success
 - Crash
 - jfsck():
   - Finds transaction 1, writes data X to the file which corrupts things
     because data Y is the one that should be there.

We also return -2 in those cases, which means the user should abort
immediately, but this measure can help to prevent problems when the user
is not careful or can't stop further commits.

Signed-off-by: Alberto Bertogli <albertito@blitiri.com.ar>

libjio/check.c +14 -2
libjio/journal.c +35 -3
tests/behaviour/t_corruption.py +30 -0

diff --git a/libjio/check.c b/libjio/check.c
index 4cd97b4..965c177 100644
--- a/libjio/check.c
+++ b/libjio/check.c
@@ -91,7 +91,7 @@ enum jfsck_return jfsck(const char *name, const char *jdir,
 {
 	int tfd, rv, i, ret;
 	unsigned int maxtid;
-	char jlockfile[PATH_MAX], tname[PATH_MAX];
+	char jlockfile[PATH_MAX], tname[PATH_MAX], brokenname[PATH_MAX];
 	struct stat sinfo;
 	struct jfs fs;
 	struct jtrans *curts;
@@ -222,6 +222,19 @@ enum jfsck_return jfsck(const char *name, const char *jdir,
 		goto exit;
 	}
 
+	/* remove the broken mark so we can call jtrans_commit() */
+	snprintf(brokenname, PATH_MAX, "%s/broken", fs.jdir);
+	rv = access(brokenname, F_OK);
+	if (rv == 0) {
+		if (unlink(brokenname) != 0) {
+			ret = J_EIO;
+			goto exit;
+		}
+	} else if (errno != ENOENT) {
+		ret = J_EIO;
+		goto exit;
+	}
+
 	/* verify (and possibly fix) all the transactions */
 	for (i = 1; i <= maxtid; i++) {
 		curts = jtrans_new(&fs, 0);
@@ -337,6 +350,5 @@ exit:
 		munmap(fs.jmap, sizeof(unsigned int));
 
 	return ret;
-
 }
 
diff --git a/libjio/journal.c b/libjio/journal.c
index 41b292f..7e5c41d 100644
--- a/libjio/journal.c
+++ b/libjio/journal.c
@@ -225,6 +225,31 @@ static int corrupt_journal_file(struct journal_op *jop)
 	return 0;
 }
 
+/** Mark the journal as broken. To do so, we just create a file named "broken"
+ * inside the journal directory. Used internally to mark severe journal errors
+ * that should prevent further journal use to avoid potential corruption, like
+ * failures to remove transaction files. The mark is removed by jfsck(). */
+static int mark_broken(struct jfs *fs)
+{
+	char broken_path[PATH_MAX];
+	int fd;
+
+	snprintf(broken_path, PATH_MAX, "%s/broken", fs->jdir);
+	fd = creat(broken_path, 0600);
+	close(fd);
+
+	return fd >= 0;
+}
+
+/** Check if the journal is broken */
+static int is_broken(struct jfs *fs)
+{
+	char broken_path[PATH_MAX];
+
+	snprintf(broken_path, PATH_MAX, "%s/broken", fs->jdir);
+	return access(broken_path, F_OK) == 0;
+}
+
 
 /*
  * Journal functions
@@ -237,10 +262,13 @@ struct journal_op *journal_new(struct jfs *fs, unsigned int flags)
 	int fd, id;
 	ssize_t rv;
 	char *name = NULL;
-	struct journal_op *jop;
+	struct journal_op *jop = NULL;
 	struct on_disk_hdr hdr;
 	struct iovec iov[1];
 
+	if (is_broken(fs))
+		goto error;
+
 	jop = malloc(sizeof(struct journal_op));
 	if (jop == NULL)
 		goto error;
@@ -414,13 +442,17 @@ int journal_free(struct journal_op *jop, int do_unlink)
 		 * unlink failed, so we attempt to truncate it, and if that
 		 * fails we corrupt it as a last resort. */
 		if (ftruncate(jop->fd, 0) != 0) {
-			if (corrupt_journal_file(jop) != 0)
+			if (corrupt_journal_file(jop) != 0) {
+				mark_broken(jop->fs);
 				goto exit;
+			}
 		}
 	}
 
-	if (fsync_dir(jop->fs->jdirfd) != 0)
+	if (fsync_dir(jop->fs->jdirfd) != 0) {
+		mark_broken(jop->fs);
 		goto exit;
+	}
 
 	fiu_exit_on("jio/commit/pre_ok_free_tid");
 	free_tid(jop->fs, jop->id);
diff --git a/tests/behaviour/t_corruption.py b/tests/behaviour/t_corruption.py
index 4d228ab..0a9d270 100644
--- a/tests/behaviour/t_corruption.py
+++ b/tests/behaviour/t_corruption.py
@@ -145,3 +145,33 @@ def test_c07():
 	assert content(n) == ''
 	cleanup(n)
 
+def test_c08():
+	"broken journal"
+	c = gencontent()
+
+	f, jf = bitmp(jflags = 0)
+	n = f.name
+
+	def f1(f, jf):
+		fiu.enable("jio/commit/tf_sync")
+		jf.write(c)
+
+	run_forked(f1, f, jf)
+
+	assert content(n) == ''
+	open(jiodir(n) + '/broken', 'w+')
+
+	def f2(f, jf):
+		try:
+			jf.pwrite(c, 200)
+		except IOError:
+			return
+		raise RuntimeError
+
+	run_forked(f2, f, jf)
+
+	fsck_verify(n, reapplied = 1)
+	assert content(n) == c
+	assert not os.path.exists(jiodir(n) + '/broken')
+	cleanup(n)
+