author | Alberto Bertogli
<albertito@gmail.com> 2007-07-15 11:43:40 UTC |
committer | Alberto Bertogli
<albertito@gmail.com> 2007-07-15 11:43:40 UTC |
Make.conf | +17 | -0 |
Makefile | +43 | -0 |
README | +31 | -0 |
doc/LICENSE | +186 | -0 |
doc/TODO | +7 | -0 |
doc/big_transactions | +9 | -0 |
doc/jiofsck | +12 | -0 |
doc/libjio.3 | +172 | -0 |
doc/libjio.h | +104 | -0 |
doc/libjio.lyx | +450 | -0 |
doc/threads | +20 | -0 |
jiofsck.c | +66 | -0 |
libjio.c | +773 | -0 |
libjio.h | +104 | -0 |
samples/build | +4 | -0 |
samples/clean | +2 | -0 |
samples/jio1.c | +62 | -0 |
samples/jio2.c | +73 | -0 |
samples/jio3.c | +43 | -0 |
tests/.1.jio/1 | +0 | -0 |
tests/.1.jio/2 | +1 | -0 |
tests/.1.jio/3 | +0 | -0 |
tests/.1.jio/4 | +0 | -0 |
tests/.1.jio/5 | +0 | -0 |
tests/.1.jio/6 | +0 | -0 |
tests/.1.jio/8 | +0 | -0 |
tests/.1.jio/desc | +14 | -0 |
tests/.1.jio/lock | +0 | -0 |
tests/1 | +1 | -0 |
diff --git a/Make.conf b/Make.conf new file mode 100644 index 0000000..b4696a8 --- /dev/null +++ b/Make.conf @@ -0,0 +1,17 @@ + +VERSION="0.10" + +CC = gcc +CFLAGS += -Wall -O6 \ + -D_LARGEFILE_SOURCE=1 -D_LARGEFILE64_SOURCE=1 \ + -D_LFS_LARGEFILE=1 -D_LFS64_LARGEFILE=1 + +ifdef DEBUG +CFLAGS += -g -pg -fprofile-arcs -ftest-coverage +endif + +# prefix for installing the binaries +PREFIX=/usr/local + + + diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..52ef03c --- /dev/null +++ b/Makefile @@ -0,0 +1,43 @@ + +include Make.conf + + +# objects to build +OBJS = libjio.o + +# rules +default: all + +all: shared static jiofsck + +shared: libjio.o + $(CC) -shared libjio.o -o libjio.so + +static: libjio.o + $(AR) cr libjio.a libjio.o + +jiofsck: jiofsck.o static + $(CC) jiofsck.o libjio.a -lpthread -o jiofsck + +install: all + install -g root -o root -m 0755 libjio.so $(PREFIX)/lib + install -g root -o root -m 0644 libjio.a $(PREFIX)/lib + install -g root -o root -m 0644 libjio.h $(PREFIX)/include + install -g root -o root -m 0775 jiofsck $(PREFIX)/bin + install -g root -o root -m 0644 -d $(PREFIX)/man/man3 + install -g root -o root -m 0644 doc/libjio.3 $(PREFIX)/man/man3/ + @echo + @echo "Please run ldconfig to update your library cache" + @echo + +.c.o: + $(CC) $(CFLAGS) $(INCLUDES) -c $< -o $@ + + +clean: + rm -f libjio.o libjio.a libjio.so jiofsck.o jiofsck + rm -f *.bb *.bbg *.da *.gcov gmon.out + + +.PHONY: default all shared static install clean + diff --git a/README b/README new file mode 100644 index 0000000..a04ea5f --- /dev/null +++ b/README @@ -0,0 +1,31 @@ + +libjio - A library for Journalled I/O +Alberto Bertogli (albertogli@telpin.com.ar) +------------------------------------------- + +As the name says, this is a simple library to do journalled, +transaction-oriented I/O. 
+ +It provides a very simple transaction api to commit and rollback transactions, +and on top of that a unix-alike set of functions to perform most regular +operations (ie. open, read, write). + +On the disk, the file you work on is exactly like a regular one, but a +special directory is created to store in-flight transactions. + +This allows both simple file manipulation, recovery and debugging because +everything is isolated. + +There's a more detailed document about the library itself in doc/libjio.ps +(or, alternatively, the ascii version doc/libjio.txt), and a manpage where you +will find the programming reference. + + +It is licensed under the Open Software License version 2.0. + +Comments and patches are always welcome; please send them to my email address, +albertogli@telpin.com.ar. + +Thanks, + Alberto + diff --git a/doc/LICENSE b/doc/LICENSE new file mode 100644 index 0000000..ef6283f --- /dev/null +++ b/doc/LICENSE @@ -0,0 +1,186 @@ + +This project, 'libjio', is copyrighted by Alberto Bertogli and licensed under +the Open Software License version 2.0 as obtained from www.opensource.org (and +included here-in for easy reference) (that license itself is copyrighted by +Larry Rosen). + +Note that the "Original Work" that this license covers is only the library +itself. Thus just the act of linking/importing this library into another +program does NOT in itself make that program considered a derivative work of +this Original Work. + + Alberto Bertogli + 21 February 2004 + +------------------------------------------------------------------------- + + +Open Software License + v. 2.0 + +This Open Software License (the "License") applies to any original work of +authorship (the "Original Work") whose owner (the "Licensor") has placed the +following notice immediately following the copyright notice for the Original +Work: + + Licensed under the Open Software License version 2.0 + + +1) Grant of Copyright License. 
Licensor hereby grants You a world-wide, +royalty-free, non-exclusive, perpetual, sublicenseable license to do the +following: + + a) to reproduce the Original Work in copies; + + b) to prepare derivative works ("Derivative Works") based upon the + Original Work; + + c) to distribute copies of the Original Work and Derivative Works to + the public, with the proviso that copies of Original Work or + Derivative Works that You distribute shall be licensed under the + Open Software License; + + d) to perform the Original Work publicly; and + + e) to display the Original Work publicly. + +2) Grant of Patent License. Licensor hereby grants You a world-wide, +royalty-free, non-exclusive, perpetual, sublicenseable license, under patent +claims owned or controlled by the Licensor that are embodied in the Original +Work as furnished by the Licensor, to make, use, sell and offer for sale the +Original Work and Derivative Works. + +3) Grant of Source Code License. The term "Source Code" means the preferred +form of the Original Work for making modifications to it and all available +documentation describing how to modify the Original Work. Licensor hereby +agrees to provide a machine-readable copy of the Source Code of the Original +Work along with each copy of the Original Work that Licensor distributes. +Licensor reserves the right to satisfy this obligation by placing a +machine-readable copy of the Source Code in an information repository +reasonably calculated to permit inexpensive and convenient access by You for +as long as Licensor continues to distribute the Original Work, and by +publishing the address of that information repository in a notice immediately +following the copyright notice that applies to the Original Work. + +4) Exclusions From License Grant. 
Neither the names of Licensor, nor the names +of any contributors to the Original Work, nor any of their trademarks or +service marks, may be used to endorse or promote products derived from this +Original Work without express prior written permission of the Licensor. +Nothing in this License shall be deemed to grant any rights to trademarks, +copyrights, patents, trade secrets or any other intellectual property of +Licensor except as expressly stated herein. No patent license is granted to +make, use, sell or offer to sell embodiments of any patent claims other than +the licensed claims defined in Section 2. No right is granted to the +trademarks of Licensor even if such marks are included in the Original Work. +Nothing in this License shall be interpreted to prohibit Licensor from +licensing under different terms from this License any Original Work that +Licensor otherwise would have a right to license. + +5) External Deployment. The term "External Deployment" means the use or +distribution of the Original Work or Derivative Works in any way such that the +Original Work or Derivative Works may be used by anyone other than You, +whether the Original Work or Derivative Works are distributed to those persons +or made available as an application intended for use over a computer network. +As an express condition for the grants of license hereunder, You agree that +any External Deployment by You of a Derivative Work shall be deemed a +distribution and shall be licensed to all under the terms of this License, as +prescribed in section 1(c) herein. + +6) Attribution Rights. You must retain, in the Source Code of any Derivative +Works that You create, all copyright, patent or trademark notices from the +Source Code of the Original Work, as well as any notices of licensing and any +descriptive text identified therein as an "Attribution Notice." 
You must cause +the Source Code for any Derivative Works that You create to carry a prominent +Attribution Notice reasonably calculated to inform recipients that You have +modified the Original Work. + +7) Warranty of Provenance and Disclaimer of Warranty. Licensor warrants that +the copyright in and to the Original Work and the patent rights granted herein +by Licensor are owned by the Licensor or are sublicensed to You under the +terms of this License with the permission of the contributor(s) of those +copyrights and patent rights. Except as expressly stated in the immediately +proceeding sentence, the Original Work is provided under this License on an +"AS IS" BASIS and WITHOUT WARRANTY, either express or implied, including, +without limitation, the warranties of NON-INFRINGEMENT, MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY OF THE +ORIGINAL WORK IS WITH YOU. This DISCLAIMER OF WARRANTY constitutes an +essential part of this License. No license to Original Work is granted +hereunder except under this disclaimer. + +8) Limitation of Liability. Under no circumstances and under no legal theory, +whether in tort (including negligence), contract, or otherwise, shall the +Licensor be liable to any person for any direct, indirect, special, +incidental, or consequential damages of any character arising as a result of +this License or the use of the Original Work including, without limitation, +damages for loss of goodwill, work stoppage, computer failure or malfunction, +or any and all other commercial damages or losses. This limitation of +liability shall not apply to liability for death or personal injury resulting +from Licensor's negligence to the extent applicable law prohibits such +limitation. Some jurisdictions do not allow the exclusion or limitation of +incidental or consequential damages, so this exclusion and limitation may not +apply to You. + +9) Acceptance and Termination. 
If You distribute copies of the Original Work +or a Derivative Work, You must make a reasonable effort under the +circumstances to obtain the express assent of recipients to the terms of this +License. Nothing else but this License (or another written agreement between +Licensor and You) grants You permission to create Derivative Works based upon +the Original Work or to exercise any of the rights granted in Section 1 +herein, and any attempt to do so except under the terms of this License (or +another written agreement between Licensor and You) is expressly prohibited by +U.S. copyright law, the equivalent laws of other countries, and by +international treaty. Therefore, by exercising any of the rights granted to +You in Section 1 herein, You indicate Your acceptance of this License and all +of its terms and conditions. This License shall terminate immediately and you +may no longer exercise any of the rights granted to You by this License upon +Your failure to honor the proviso in Section 1(c) herein. + +10) Termination for Patent Action. This License shall terminate automatically +and You may no longer exercise any of the rights granted to You by this +License as of the date You commence an action, including a cross-claim or +counterclaim, for patent infringement (i) against Licensor with respect to a +patent applicable to software or (ii) against any entity with respect to a +patent applicable to the Original Work (but excluding combinations of the +Original Work with other software or hardware). + +11) Jurisdiction, Venue and Governing Law. Any action or suit relating to this +License may be brought only in the courts of a jurisdiction wherein the +Licensor resides or in which Licensor conducts its primary business, and under +the laws of that jurisdiction excluding its conflict-of-law provisions. The +application of the United Nations Convention on Contracts for the +International Sale of Goods is expressly excluded. 
Any use of the Original +Work outside the scope of this License or after its termination shall be +subject to the requirements and penalties of the U.S. Copyright Act, 17 U.S.C. +101 et seq., the equivalent laws of other countries, and international treaty. +This section shall survive the termination of this License. + +12) Attorneys Fees. In any action to enforce the terms of this License or +seeking damages relating thereto, the prevailing party shall be entitled to +recover its costs and expenses, including, without limitation, reasonable +attorneys' fees and costs incurred in connection with such action, including +any appeal of such action. This section shall survive the termination of this +License. + +13) Miscellaneous. This License represents the complete agreement concerning +the subject matter hereof. If any provision of this License is held to be +unenforceable, such provision shall be reformed only to the extent necessary +to make it enforceable. + +14) Definition of "You" in This License. "You" throughout this License, +whether in upper or lower case, means an individual or a legal entity +exercising rights under, and complying with all of the terms of, this License. +For legal entities, "You" includes any entity that controls, is controlled by, +or is under common control with you. For purposes of this definition, +"control" means (i) the power, direct or indirect, to cause the direction or +management of such entity, whether by contract or otherwise, or (ii) ownership +of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial +ownership of such entity. + +15) Right to Use. You may use the Original Work in all ways not otherwise +restricted or conditioned by this License or by law, and Licensor promises not +to interfere with or be responsible for such uses by You. + +This license is Copyright (C) 2003 Lawrence E. Rosen. All rights reserved. +Permission is hereby granted to copy and distribute this license without +modification. 
This license may not be modified without the express written +permission of its copyright owner. diff --git a/doc/TODO b/doc/TODO new file mode 100644 index 0000000..639d149 --- /dev/null +++ b/doc/TODO @@ -0,0 +1,7 @@ + + * allow to store the journal somewhere else (or just leave it as-is, and + let the user do a simple symlink of the journal directory?) + * make jfsck return a list of fixed transactions + * more testing on j{read|write}v() + * more samples and integration inside the build system + * a better manpage diff --git a/doc/big_transactions b/doc/big_transactions new file mode 100644 index 0000000..3c5392d --- /dev/null +++ b/doc/big_transactions @@ -0,0 +1,9 @@ + +If you have to create big transactions, instead of creating a huge buffer you +can mmap a temporary file and periodically sync it; and when you're done, just +jtrans_commit() the whole thing. + +This would be a quite efficient way, without any performance penalty and a +very simple approach; I originally thought of doing this on the journal, but +it had many drawbacks that made it much expensive, slower and complex. + diff --git a/doc/jiofsck b/doc/jiofsck new file mode 100644 index 0000000..b42aeb2 --- /dev/null +++ b/doc/jiofsck @@ -0,0 +1,12 @@ + +Note that jfsck does not warantee that all the transactions are fully +completed, it can only do so if you run it without any other process accessing +the journal. + +If you want to see this, you can take a look at the struct jfsck_result. It +include a field named in_progress which tell the number of transactions that +were in progress at the moment of checking, and as such weren't checked. + +Be aware that the counter is not atomic, as two checkers can be running at the +same time. 
+ diff --git a/doc/libjio.3 b/doc/libjio.3 new file mode 100644 index 0000000..b8815f8 --- /dev/null +++ b/doc/libjio.3 @@ -0,0 +1,172 @@ +.TH libjio 3 "21/Feb/2004" +.SH NAME +libjio - A library for Journalled I/O + +.SH FUNCTIONS + +.B #include <libjio.h> + +.BI "int jopen(struct jfs *" fs ", char *" name ", int " flags ", int " mode ", int " jflags " ); + +.BI "ssize_t jread(struct jfs *" fs ", void *" buf ", size_t " count " ); + +.BI "ssize_t jpread(struct jfs *" fs ", void *" buf ", size_t " count ", off_t " offset " ); + +.BI "ssize_t jreadv(struct jfs *" fs ", struct iovec *" vector ", int " count " ); + +.BI "ssize_t jwrite(struct jfs *" fs ", void *" buf ", size_t " count " ); + +.BI "ssize_t jpwrite(struct jfs *" fs ", void *" buf ", size_t " count ", off_t " offset " ); + +.BI "ssize_t jwritev(struct jfs *" fs ", struct iovec *" vector ", int " count " ); + +.BI "int jtruncate(struct jfs *" fs ", off_t " lenght " ); + +.BI "int jclose(struct jfs *" fs " ); + +.BI "void jtrans_init(struct jfs *" fs " , struct jtrans *" ts " ); + +.BI "int jtrans_commit(struct jtrans *" ts " ); + +.BI "int jtrans_rollback(struct jtrans *" ts " ); + +.BI "void jtrans_free(struct jtrans *" ts " ); + +.BI "int jfsck(char *" name ", struct jfsck_result *" res " ); + +.SH STRUCTURES +.PP +.RS +.NF +struct jfs +{ + int fd; /* main file descriptor */ + char *name; /* and its name */ + int jfd; /* journal's lock file descriptor */ + int flags; /* journal mode options used in jopen() */ + pthread_mutex_t lock; /* a soft lock used in some operations */ + } +.FI +.RE + +.RS +.NF +struct jtrans +{ + struct jfs *fs; /* journal file structure to operate on */ + char *name; /* name of the transaction file */ + int id; /* transaction id */ + int flags; /* misc flags */ + void *buf; /* buffer */ + size_t len; /* buffer lenght */ + off_t offset; /* file offset to operate on */ + void *udata; /* user-supplied data */ + size_t ulen; /* udata lenght */ + void *pdata; /* previous data, for 
rollback */ + size_t plen; /* pdata lenght */ + } +.FI +.RE + +.RS +.NF +struct jfsck_result +{ + int total; /* total transactions files we looked at */ + int invalid; /* invalid files in the journal directory */ + int in_progress; /* transactions in progress */ + int broken_head; /* transactions broken (header missing) */ + int broken_body; /* transactions broken (body missing) */ + int load_error; /* errors loading the transaction */ + int apply_error; /* errors applying the transaction */ + int reapplied; /* transactions that were re-applied */ + } +.FI +.RE + +.SH DESCRIPTION + +libjio is a library to do transaction-oriented journalled I/O. This manpage +describes its C API very briefly; further information can be found in the +documentation that comes along with the library itself, or on the web at +http://auriga.wearlab.de/~alb/libjio. + +We can group the functions into three groups: a common one, with functions +common to the other two; a low-level one, which consists of jtrans_commit and +jtrans_rollback. They provide a method for manipulating transactions, which are +defined in a structure named struct jtrans (described above). + +The second group mimics somehow the traditional UNIX API by providing similar +interfaces to read(), write(), and their friends. + +.SH COMMON API + +Most functions reference somehow the structures described above, especially +struct jfs and struct jtrans. They represent a file to operate on and a single +transaction, respectively. To open a file, you should use the jopen() call, +which is just like the normal open() call but affects a pointer to a struct +jfs. To close a file, use jclose(). They're exactly like the open() and +close() functions but use a struct jfs instead of a file descriptor; take a +look at their manpages if you have any doubts about how to use them. + +There is one function that differs from the rest, which is jfsck(). It is used +to perform journal checking and recovery in case of a crash. 
It must be +performed when nobody else is using the file (like in the case of a filesystem +which can't be mounted), and it returns 0 on success or -1 in case of a +failure. If it succeeds, the jfsck_result structure summarizes the outcome +of the operation. There is also a program named jiofsck which is just a simple +human frontend to this function. + +.SH HIGH LEVEL API + +The high level API, as explained before, consists of the functions jread(), +jpread(), jreadv(), jwrite(), jpwrite(), jwritev(), jtruncate(). In most cases +you will only need to use these, because they're simple and familiar. + +They are all exactly like the UNIX equivalent (if you still don't get it, take +the initial 'j' out), and behave the same way, with the only exception that +instead of a file descriptor you need to pass a pointer to a struct jfs (just +like jopen() and jclose()). Again, I will not duplicate the manpage for all +these functions, just refer to the regular UNIX versions to see how to use +them, they all have the same semantics and behave the same way. + +.SH LOW LEVEL API + +The low level functions are the ones which manipulate transactions directly; +they are four: jtrans_init(), jtrans_commit(), jtrans_rollback() and +jtrans_free(). These are intended to be used in special situations where your +application needs direct control over the transactions. + +jtrans_init() and jtrans_free() just initialize and free a given transaction, +the former should be called prior to any use, and the latter when you want to +destroy a transaction. Note that jtrans_free() is not a disk operation, but +only frees the pointers that were previously allocated by the library; all +disk operations are performed by the other two functions. They have no return +value. + +jtrans_commit() is in charge of committing the given transaction (whose data +was completed by you, and is described in the STRUCTURES section), and after +its return the data has been saved to the disk atomically. 
It returns the +number of bytes written or -1 if there was an error. + +jtrans_rollback() reverses a transaction that was applied with +jtrans_commit(), and leaves the file as it was before applying it. Be very +very careful with this function, it's quite dangerous if you don't know for +sure that you're doing the right thing. It returns as jtrans_commit(). + +.SH BUGS + +None that I'm aware of, but if you find one please let me know at +albertogli@telpin.com.ar. + +.SH SEE ALSO + +.BR open (2), +.BR read (2), +.BR write (2), +.BR readv (2), +.BR writev (2), +.BR pread (2), +.BR pwrite (2), +.BR ftruncate (2), +.BR close (2) diff --git a/doc/libjio.h b/doc/libjio.h new file mode 100644 index 0000000..4241ab0 --- /dev/null +++ b/doc/libjio.h @@ -0,0 +1,104 @@ + +/* + * libjio - A library for Journalled I/O + * Alberto Bertogli (albertogli@telpin.com.ar) + */ + +#ifndef _LIBJIO_H +#define _LIBJIO_H + +#include <stdint.h> +#include <sys/types.h> +#include <sys/uio.h> +#include <pthread.h> + + +/* logical structures */ +struct jfs { + int fd; /* main file descriptor */ + char *name; /* and its name */ + int jfd; /* journal's lock file descriptor */ + int flags; /* journal mode options used in jopen() */ + pthread_mutex_t lock; /* a soft lock used in some operations */ +}; + +struct jtrans { + struct jfs *fs; /* journal file structure to operate on */ + char *name; /* name of the transaction file */ + int id; /* transaction id */ + int flags; /* misc flags */ + void *buf; /* buffer */ + size_t len; /* buffer lenght */ + off_t offset; /* file offset to operate on */ + void *udata; /* user-supplied data */ + size_t ulen; /* udata lenght */ + void *pdata; /* previous data, for rollback */ + size_t plen; /* pdata lenght */ +}; + +struct jfsck_result { + int total; /* total transactions files we looked at */ + int invalid; /* invalid files in the journal directory */ + int in_progress; /* transactions in progress */ + int broken_head; /* transactions broken (header missing) 
*/ + int broken_body; /* transactions broken (body missing) */ + int load_error; /* errors loading the transaction */ + int apply_error; /* errors applying the transaction */ + int reapplied; /* transactions that were re-applied */ +}; + +/* on-disk structure */ +struct disk_trans { + + /* header (fixed lenght, defined below) */ + uint32_t id; /* id */ + uint32_t flags; /* flags about this transaction */ + uint32_t len; /* data lenght */ + uint32_t ulen; /* user-supplied information lenght */ + uint64_t offset; /* offset relative to the BOF */ + + /* payload (variable lenght) */ + char *udata; /* user-supplied data */ + char *prevdata; /* previous data, optional, for rollback */ + char *data; /* data */ +}; + + +/* basic operations */ +int jopen(struct jfs *fs, char *name, int flags, int mode, int jflags); +ssize_t jread(struct jfs *fs, void *buf, size_t count); +ssize_t jpread(struct jfs *fs, void *buf, size_t count, off_t offset); +ssize_t jreadv(struct jfs *fs, struct iovec *vector, int count); +ssize_t jwrite(struct jfs *fs, void *buf, size_t count); +ssize_t jpwrite(struct jfs *fs, void *buf, size_t count, off_t offset); +ssize_t jwritev(struct jfs *fs, struct iovec *vector, int count); +int jtruncate(struct jfs *fs, off_t lenght); +int jclose(struct jfs *fs); + +/* transaction operations */ +void jtrans_init(struct jfs *fs, struct jtrans *ts); +int jtrans_commit(struct jtrans *ts); +int jtrans_rollback(struct jtrans *ts); +void jtrans_free(struct jtrans *ts); + +/* journal checker */ +int jfsck(char *name, struct jfsck_result *res); + + +/* jfs constants */ +#define J_NOLOCK 1 /* don't lock the file before operating on it */ + +/* jtrans constants */ +#define J_COMMITED 1 /* mark a transaction as commited */ +#define J_ROLLBACKED 2 /* mark a transaction as rollbacked */ + +/* disk_trans constants */ +#define J_DISKTFIXSIZE 24 /* lenght of disk_trans' header */ + +/* jfsck constants (return values) */ +#define J_ESUCCESS 0 /* success - shouldn't be used */ 
+#define J_ENOENT 1 /* no such file */ +#define J_ENOJOURNAL 2 /* no journal associated */ + +#endif + diff --git a/doc/libjio.lyx b/doc/libjio.lyx new file mode 100644 index 0000000..6af7ed0 --- /dev/null +++ b/doc/libjio.lyx @@ -0,0 +1,450 @@ +#LyX 1.3 created this file. For more info see http://www.lyx.org/ +\lyxformat 221 +\textclass article +\language english +\inputencoding auto +\fontscheme default +\graphics default +\paperfontsize default +\papersize Default +\paperpackage a4 +\use_geometry 0 +\use_amsmath 0 +\use_natbib 0 +\use_numerical_citations 0 +\paperorientation portrait +\secnumdepth 3 +\tocdepth 3 +\paragraph_separation indent +\defskip medskip +\quotes_language english +\quotes_times 2 +\papercolumns 1 +\papersides 1 +\paperpagestyle default + +\layout Title + +libjio - A library for journalled I/O +\layout Author + +Alberto Bertogli (albertogli@telpin.com.ar) +\layout Standard + + +\begin_inset LatexCommand \tableofcontents{} + +\end_inset + + +\layout Section + +Introduction +\layout Standard + + +\emph on +libjio +\emph default + is a library for doing journalled transaction-oriented I/O, providing atomicity + warantees and a simple to use but powerful API. +\layout Standard + +This document explains the design of the library, how it works internally + and why it works that way. + You should read it even if you don't plan to do use the library in strange + ways, it provides (or at least tries to =) an insight view on how the library + performs its job, which can be very valuable knowledge when working with + it. +\layout Standard + +To the user, libjio provides two groups of functions, one UNIX-alike that + implements the journalled versions of the classic functions ( +\emph on +open() +\emph default +, +\emph on +read() +\emph default +, +\emph on +write() +\emph default + and friends); and a lower-level one that center on transactions and allows + the user to manipulate them directly by providing means of commiting and + rollbacking. 
+ The former, as expected, are based on the latter and interact safely with + them. + Besides, it's designed in a way that allows efficient and safe interaction + with I/O performed from outside the library in case you want to. +\layout Standard + +The following sections describe different concepts and procedures that the + library bases its work on. + It's not intended to be a replacement for reading the source code: please do + so if you have any doubts, it's not big at all (less than 800 lines, including + comments) and I hope it's readable enough. + If you think that's not the case, please let me know and I'll try to give + you a hand. +\layout Section + +General on-disk data organization +\layout Standard + +On the disk, the file you are working on will look exactly as you expect + and hasn't got a single bit different from what you would get using the + regular API. + But, besides the working file, you will find a directory named after it + where the journaling information lives. + +\layout Standard + +Inside, there are two kinds of files: the lock file and transaction files. + The first one is used as a general lock and holds the next transaction + ID to assign, and there is only one; the second one holds one transaction, + which is composed of a header of fixed size and a variable-size payload, + and there can be as many of them as there are in-flight transactions. + +\layout Standard + +This imposes some restrictions on the kind of operations you can perform + over a file while it's currently being used: you can't move it (because + the journal directory name depends on the filename) and you can't unlink + it (for similar reasons). + Some other operations, like truncating, are also done outside the library + and the user is expected to do them atomically when no transactions are + currently being done. 
+ +\layout Standard + +These warnings are no different from a normal simultaneous use under classic + UNIX environments, but they are here to remind you that even though the library + guarantees a lot and eases many things for its user (especially in complex + cases, like multiple threads using the file at the same time), you should + still be careful when doing strange things with files while working on + them. + +\layout Subsection + +The transaction file +\layout Standard + +The transaction file is composed of two main parts: the header and the payload. +\layout Standard + +The header holds basic information about the transaction itself, including + the ID, some flags, the offset to commit to and the length of the data. + The payload holds the data, in three parts: user-defined data, previous + data, and real data. +\layout Standard + +User-defined data is not used by the library itself, but it's a space where + the user can save private data that can be useful later. + Previous data is saved by the library prior to applying the commit, so transaction +s can be rolled back. + Real data is just the data to save to the disk, and it is saved because + if a crash occurs while we are applying the transaction we can recover + gracefully. +\layout Section + +The commit procedure +\layout Standard + +We call "commit" the action of +\emph on +safely +\emph default + and +\emph on +atomically +\emph default + writing some given data to the disk. +\layout Standard + +The former, +\emph on +safely +\emph default +, means that after a commit has been done we can assume the data will not + get lost and can be retrieved, unless of course some major event happens + (like a hardware failure). + For us, this means that the data was effectively written to the disk and + if a crash occurs after the commit operation has returned, the operation + will be complete and data will be available from the file. 
+\layout Standard + +The latter, +\emph on +atomically +\emph default +, guarantees that the operation is either completely done, or not done at + all. + This is a really common word, especially if you have worked with multiprocessing +, and should be quite familiar. + We implement atomicity by combining fine-grained locks and journalling, + which can assure us both to be able to recover from crashes, and to have + exclusive access to a portion of the file without having any other transaction + overlap it. +\layout Standard + +Well, so much for talking, now let's get real; libjio applies commits in + a very simple and straightforward way, inside +\emph on +jtrans_commit() +\emph default +: +\layout Itemize + +Lock the section where the commit takes place +\layout Itemize + +Open the transaction file +\layout Itemize + +Write the header +\layout Itemize + +Write the user data (if any) +\layout Itemize + +Read the previous data from the file +\layout Itemize + +Write the previous data in the transaction +\layout Itemize + +Write the data to commit to the transaction file +\layout Itemize + +Write the data to the file +\layout Itemize + +Mark the transaction as committed by setting a flag in the header +\layout Itemize + +Unlink the transaction file +\layout Itemize + +Unlock the section where the commit takes place +\layout Standard + +This may look like a lot of steps, but they're not as many as it seems + inside the code, and they allow recovery from interruptions in every step + of the way (or even in the middle of a step). +\layout Section + +The rollback procedure +\layout Standard + +First of all, rollbacking is like +\begin_inset Quotes eld +\end_inset + +undo +\begin_inset Quotes erd +\end_inset + + a commit: return the data to the state it had exactly before a given commit + was applied. + Due to the way we handle commits, doing this operation becomes quite simple + and straightforward. 
+\layout Standard + +In the previous section we said that each transaction held, besides the + data to commit to the disk, the data that was on it before committing. + That data is saved precisely to be able to roll back. + So, to roll back a transaction all that has to be done is recover that +\begin_inset Quotes eld +\end_inset + +previous data +\begin_inset Quotes erd +\end_inset + + from the transaction we want to roll back, and save it to the disk. + In the end, this ends up being a new transaction with the previous data + as the new one, so we do that: create a new transaction structure, fill + in the data from the transaction we want to roll back, and commit it. + All this is performed by +\emph on +jtrans_rollback() +\emph default +. +\layout Standard + +By doing this we can provide the same guarantees a commit has, it's really + fast, eases the recovery, and the code is simple and clean. + What a deal. +\layout Standard + +But be aware that rolling back is dangerous. + And I really mean it: you should +\series bold +\emph on +only +\series default +\emph default + do it if you're really sure it's ok. + Consider, for instance, that you commit transaction A, then B, and then + you roll back A. + If A and B happen to touch the same portion of the file, the rollback will, + of course, not return the state previous to B, but previous to A. + If it's not done safely, this can lead to major corruption. + Now, if you add to this transactions that extend the file (and thus rolling back + truncates it back), you not only have corruption but data loss. + So, again, be aware, I can't stress this enough, +\series bold +\emph on +rollback only if you really really know what you are doing +\series default +\emph default +.
+\layout Section + +The recovery procedure +\layout Standard + +Recovering from crashes is done by the +\emph on +jfsck() +\emph default + call (or the program +\emph on +jiofsck +\emph default + which is just a simple invocation of that function), which opens the file + and goes through all transactions in the journal (remember that transactions + are removed from the journal directory after they're applied), loading + and recommitting them if possible. + There are several steps where it can fail: there could be no journal, a + given transaction file might be corrupted, incomplete, and so on; but in + the end, there are two cases regarding each transaction: either it's complete + and can be reapplied, or not. +\layout Standard + +In the case the transaction is not complete, there is no possibility that + it has been partially applied to the disk: remember that, from the commit + procedure, we only apply the transaction +\emph on +after +\emph default + saving it in the journal, so there is really nothing left to be done. +\layout Standard + +If the transaction is complete, we only need to recommit: if the transaction + was either not applied at all, partially applied or completely applied, + it makes no difference as we are now capable of completing it, and do so. +\layout Standard + +In any case, after making the recovery you can simply remove the journal + entirely and let the library create a new one, and you can be sure that + transaction atomicity was preserved. +\layout Section + +High-level functions +\layout Standard + +We call +\emph on +high-level functions +\emph default + the ones provided by the library that emulate the good old UNIX file + manipulation calls. + Most of them are just wrappers around commits, and implement proper locking + when operating in order to allow simultaneous operations (either across + threads or processes).
+ They are described in detail in the manual pages, we'll only list them + here for completeness: +\layout Itemize + +jopen() +\layout Itemize + +jread(), jpread(), jreadv() +\layout Itemize + +jwrite(), jpwrite(), jwritev() +\layout Itemize + +jtruncate() +\layout Itemize + +jclose() +\layout Section + +ACID (or How does libjio fit into theory) +\layout Standard + +I haven't read much theory about this, and the library was implemented basically + by common sense and not theoretical study. + +\layout Standard + +However, I'm aware that database people like ACID (well, that's not news + for anybody ;), which they say means "Atomicity, Consistency, Isolation, + Durability" (yeah, right!). + +\layout Standard + +So, even though libjio is not a purely database thing, it can be used to achieve + those attributes in a simple and efficient way. + +\layout Standard + +Let's take a look one by one: +\layout Itemize + +Atomicity: In a transaction involving two or more discrete pieces of information +, either all of the pieces are committed or none are. + This has been talked about before and we've seen how the library achieves this + point, mostly based on locks and relying on a commit procedure. +\layout Itemize + +Consistency: A transaction either creates a new and valid state of data, + or, if any failure occurs, returns all data to its state before the transaction + was started. + This, like atomicity, has been discussed before, especially in the recovery + section, when we saw how in case of a crash we end up with a fully applied + transaction, or no transaction applied at all. +\layout Itemize + +Isolation: A transaction in process and not yet committed must remain isolated + from any other transaction. + This comes as a side effect of doing proper locking on the sections each + transaction affects, and guarantees that there can't be two transactions + working on the same section at the same time.
+\layout Itemize + +Durability: Committed data is saved by the system such that, even in the + event of a failure and system restart, the data is available in its correct + state. + For this point we rely on the disk as a method of permanent storage, and + expect that when we do synchronous I/O, data is safely written and can be + recovered after a crash. +\layout Section + +Working from outside +\layout Standard + +If you want, and are careful enough, you can safely do I/O without using + the library. + Here I'll give you some general guidelines that you need to follow in order + to prevent corruption. + Of course you can bend or break them according to your use, this is just + a general overview on how to interact from outside. + +\layout Itemize + +Lock the sections you want to use: the library, as we have already explained, + relies on fcntl locking; so, if you intend to operate on parts of the file + while using it, you should lock them. + +\layout Itemize + +Don't truncate, unlink or rename: these operations have serious implications + when they're done while using the library, because the library itself assumes + that names don't change, and files don't disappear beneath it. + It could potentially lead to corruption, although most of the time you + would just get errors from every call. +\the_end diff --git a/doc/threads b/doc/threads new file mode 100644 index 0000000..bf4b6fb --- /dev/null +++ b/doc/threads @@ -0,0 +1,20 @@ + +The library is entirely threadsafe. + +This will make some people who worked with threads a bit concerned, because +everybody knows that if a file descriptor is shared among threads, and two +threads decide to read/write/perform any op that moves the file pointer, a +mess is waiting to happen. And almost all operations do touch the file pointer. + +But don't worry, the library is _truly_ threadsafe: it uses pread/pwrite, +which do not touch the file pointer, and allows working on the same file +simultaneously without concerns.
Besides, it slightly improves performance by +having less locking, less system calls, lower overhead and less calculation to +perform the operation. + + +Still, bear in mind that if you decide to work on the file outside libjio you +need to lockf() the sections you're going to work on, because libjio relies on +lockf() locking to warantee atomicity. + + diff --git a/jiofsck.c b/jiofsck.c new file mode 100644 index 0000000..062dd3a --- /dev/null +++ b/jiofsck.c @@ -0,0 +1,66 @@ + +/* + * jiofsck - A journal checker and recovery tool for libjio + * Alberto Bertogli (albertogli@telpin.com.ar) + */ + +#include <stdio.h> +#include <string.h> +#include "libjio.h" + + +void usage() +{ + printf("Use: jiofsck FILE\n\n"); + printf("Where FILE is the name of the file" + "which you want to check the journal from.\n"); +} + +int main(int argc, char **argv) +{ + int rv; + char *file; + struct jfsck_result res; + + if (argc != 2) { + usage(); + return 1; + } + + file = argv[1]; + + memset(&res, 0, sizeof(res)); + + printf("Checking journal: "); + rv = jfsck(file, &res); + + if (rv == J_ENOENT) { + printf("No such file or directory\n"); + return 1; + } else if (rv == J_ENOJOURNAL) { + printf("No journal associated to the file, " + "or journal empty\n"); + return 1; + } + + printf("done\n"); + + printf("Journal checking results\n"); + printf("------------------------\n\n"); + + printf("Total:\t\t %d\n", res.total); + printf("Invalid:\t %d\n", res.invalid); + printf("In progress:\t %d\n", res.in_progress); + printf("Broken head:\t %d\n", res.broken_head); + printf("Broken body:\t %d\n", res.broken_body); + printf("Load error:\t %d\n", res.load_error); + printf("Apply error:\t %d\n", res.apply_error); + printf("Reapplied:\t %d\n", res.reapplied); + printf("\n"); + + printf("You can now safely remove the journal directory completely\n" + "to start a new journal.\n"); + + return 0; +} + diff --git a/libjio.c b/libjio.c new file mode 100644 index 0000000..d2db87c --- /dev/null +++ 
b/libjio.c @@ -0,0 +1,773 @@ + +/* + * libjio - A library for Journalled I/O + * Alberto Bertogli (albertogli@telpin.com.ar) + */ + +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <unistd.h> +#include <stdlib.h> +#include <limits.h> +#include <string.h> +#include <libgen.h> +#include <stdio.h> +#include <dirent.h> +#include <sys/uio.h> + +#include "libjio.h" + + +/* + * small util functions + */ + +/* like lockf, but lock always from the beginning of the file */ +static off_t plockf(int fd, int cmd, off_t offset, off_t len) +{ + struct flock fl; + int op; + + if (cmd == F_LOCK) { + fl.l_type = F_WRLCK; + op = F_SETLKW; + } else if (cmd == F_ULOCK) { + fl.l_type = F_UNLCK; + op = F_SETLKW; + } else if (cmd == F_TLOCK) { + fl.l_type = F_WRLCK; + op = F_SETLK; + } else + return 0; + + fl.l_whence = SEEK_SET; + fl.l_start = offset; + fl.l_len = len; + + return fcntl(fd, op, &fl); +} + +/* like pread but either fails, or return a complete read; if we return less + * than count is because EOF was reached */ +static ssize_t spread(int fd, void *buf, size_t count, off_t offset) +{ + int rv, c; + + c = 0; + + while (c < count) { + rv = pread(fd, buf + c, count - c, offset + c); + + if (rv == count) + /* we're done */ + return count; + else if (rv < 0) + /* error */ + return rv; + else if (rv == 0) + /* got EOF */ + return c; + + /* incomplete read, keep on reading */ + c += rv; + } + + return count; +} + +/* like spread() but for pwrite() */ +static ssize_t spwrite(int fd, void *buf, size_t count, off_t offset) +{ + int rv, c; + + c = 0; + + while (c < count) { + rv = pwrite(fd, buf + c, count - c, offset + c); + + if (rv == count) + /* we're done */ + return count; + else if (rv <= 0) + /* error/nothing was written */ + return rv; + + /* incomplete write, keep on writing */ + c += rv; + } + + return count; +} + +/* build the journal directory name out of the filename */ +static void get_jdir(char *filename, char *jdir) +{ + char *base; + char 
*dir; + + base = basename(strdup(filename)); + dir = dirname(strdup(filename)); + + snprintf(jdir, PATH_MAX, "%s/.%s.jio", dir, base); +} + +/* build the filename of a given transaction */ +static void get_jtfile(char *filename, int tid, char *jtfile) +{ + char *base; + char *dir; + + base = basename(strdup(filename)); + dir = dirname(strdup(filename)); + + snprintf(jtfile, PATH_MAX, "%s/.%s.jio/%d", dir, base, tid); +} + +/* gets a new transaction id */ +static unsigned int get_tid(struct jfs *fs) +{ + unsigned int curid; + int r, rv; + + /* lock the whole file */ + plockf(fs->jfd, F_LOCK, 0, 0); + + /* read the current max. curid */ + r = spread(fs->jfd, &curid, sizeof(curid), 0); + if (r != sizeof(curid)) { + rv = 0; + goto exit; + } + + /* increment it and handle overflows */ + rv = curid + 1; + if (rv == 0) + rv = 1; + + /* write to the file descriptor */ + r = spwrite(fs->jfd, &rv, sizeof(rv), 0); + if (r != sizeof(curid)) { + rv = 0; + goto exit; + } + +exit: + plockf(fs->jfd, F_ULOCK, 0, 0); + return rv; +} + +/* frees a transaction id */ +static void free_tid(struct jfs *fs, unsigned int tid) +{ + unsigned int curid, i; + int r; + char name[PATH_MAX]; + + /* lock the whole file */ + plockf(fs->jfd, F_LOCK, 0, 0); + + /* read the current max. curid */ + r = spread(fs->jfd, &curid, sizeof(curid), 0); + if (r != sizeof(curid)) { + goto exit; + } + + if (tid < curid) { + /* we're not freeing the max. curid, so we just return */ + goto exit; + } else { + /* look up the new max. 
*/ + for (i = curid - 1; i > 0; i--) { + get_jtfile(fs->name, i, name); + if (access(name, R_OK | W_OK) == 0) { + curid = i; + break; + } + } + + /* and save it */ + r = spwrite(fs->jfd, &i, sizeof(i), 0); + if (r != sizeof(curid)) { + goto exit; + } + } + +exit: + plockf(fs->jfd, F_ULOCK, 0, 0); + return; +} + + +/* + * transaction functions + */ + +/* initialize a transaction structure */ +void jtrans_init(struct jfs *fs, struct jtrans *ts) +{ + ts->fs = fs; + ts->name = NULL; + ts->id = 0; + ts->flags = 0; + ts->buf = NULL; + ts->len = 0; + ts->offset = 0; + ts->udata = NULL; + ts->ulen = 0; + ts->pdata = NULL; + ts->plen = 0; +} + +/* free a transaction structure */ +void jtrans_free(struct jtrans *ts) +{ + /* NOTE: we only really free the name and previous data, which are the + * things _we_ allocate; the user data is caller stuff */ + ts->fs = NULL; + if (ts->name) + free(ts->name); + if (ts->pdata) + free(ts->pdata); + free(ts); +} + +/* commit a transaction */ +int jtrans_commit(struct jtrans *ts) +{ + int id, fd, rv, t; + char *name; + void *buf_init, *bufp; + + name = (char *) malloc(PATH_MAX); + + id = get_tid(ts->fs); + if (id == 0) + return -1; + + /* open the transaction file */ + get_jtfile(ts->fs->name, id, name); + fd = open(name, O_RDWR | O_CREAT | O_TRUNC | O_LARGEFILE, 0600); + if (fd < 0) + return -1; + + /* and lock it */ + plockf(fd, F_LOCK, 0, 0); + + ts->id = id; + ts->name = name; + + /* lock the file region to work on */ + if (!(ts->fs->flags & J_NOLOCK)) + plockf(ts->fs->fd, F_LOCK, ts->offset, ts->len); + + /* first the static data */ + + buf_init = malloc(J_DISKTFIXSIZE); + bufp = buf_init; + + memcpy(bufp, (void *) &(ts->id), sizeof(ts->id)); + bufp += 4; + + memcpy(bufp, (void *) &(ts->flags), sizeof(ts->flags)); + bufp += 4; + + memcpy(bufp, (void *) &(ts->len), sizeof(ts->len)); + bufp += 4; + + memcpy(bufp, (void *) &(ts->ulen), sizeof(ts->ulen)); + bufp += 4; + + memcpy(bufp, (void *) &(ts->offset), sizeof(ts->offset)); + bufp += 
8; + + rv = spwrite(fd, buf_init, J_DISKTFIXSIZE, 0); + if (rv != J_DISKTFIXSIZE) + goto exit; + + free(buf_init); + + + /* and now the variable part */ + + if (ts->udata) { + rv = spwrite(fd, ts->udata, ts->ulen, J_DISKTFIXSIZE); + if (rv != ts->ulen) + goto exit; + } + + ts->pdata = malloc(ts->len); + ts->plen = ts->len; + + /* copy the current content into the transaction file */ + rv = spread(ts->fs->fd, ts->pdata, ts->len, ts->offset); + if (rv < 0) + goto exit; + if (rv < ts->len) { + /* we are extending the file! use ftruncate() to do it */ + ftruncate(ts->fs->fd, ts->offset + ts->len); + + ts->plen = rv; + + } + + t = J_DISKTFIXSIZE + ts->ulen; + rv = spwrite(fd, ts->pdata, ts->len, t); + if (rv != ts->len) + goto exit; + + /* save the new data in the transaction file */ + t = J_DISKTFIXSIZE + ts->ulen + ts->plen; + rv = spwrite(fd, ts->buf, ts->len, t); + if (rv != ts->len) + goto exit; + + /* this is a simple but efficient optimization: instead of doing + * everything O_SYNC, we sync at this point only, this way we avoid + * doing a lot of very small writes; in case of a crash the + * transaction file is only useful if it's complete (ie. 
after this + * point) so we only flush here */ + fsync(fd); + + /* now that we have a safe transaction file, let's apply it */ + rv = spwrite(ts->fs->fd, ts->buf, ts->len, ts->offset); + if (rv != ts->len) + goto exit; + + /* mark the transaction as commited */ + ts->flags = ts->flags | J_COMMITED; + + /* the transaction has been applied, so we cleanup and remove it from + * the disk */ + free_tid(ts->fs, ts->id); + unlink(name); + +exit: + close(fd); + + if (!(ts->fs->flags & J_NOLOCK)) + plockf(ts->fs->fd, F_ULOCK, ts->offset, ts->len); + + /* return the lenght only if it was properly commited */ + if (ts->flags & J_COMMITED) + return ts->len; + else + return -1; + +} + +/* rollback a transaction */ +int jtrans_rollback(struct jtrans *ts) +{ + int rv; + struct jtrans newts; + + /* copy the old transaction to the new one */ + jtrans_init(ts->fs, &newts); + + newts.name = malloc(strlen(ts->name)); + strcpy(newts.name, ts->name); + newts.flags = ts->flags; + newts.offset = ts->offset; + + newts.buf = ts->pdata; + newts.len = ts->plen; + + if (ts->plen < ts->len) { + /* we extended the data in the previous transaction, so we + * should truncate it back */ + /* DANGEROUS: this is one of the main reasons why rollbacking + * is dangerous and should only be done with extreme caution: + * if for some reason, after the previous transacton, we have + * extended the file further, this will cut it back to what it + * was; read the docs for more detail */ + ftruncate(ts->fs->fd, ts->offset + ts->plen); + + } + + newts.pdata = ts->buf; + newts.plen = ts->len; + + newts.udata = ts->udata; + newts.ulen = ts->ulen; + + rv = jtrans_commit(&newts); + return rv; +} + + +/* + * basic operations + */ + +/* open a file */ +int jopen(struct jfs *fs, char *name, int flags, int mode, int jflags) +{ + int fd, jfd, rv; + unsigned int t; + char jdir[PATH_MAX], jlockfile[PATH_MAX]; + struct stat sinfo; + + fd = open(name, flags, mode); + if (fd < 0) + return -1; + + fs->fd = fd; + fs->name = 
name; + fs->flags = jflags; + + pthread_mutex_init( &(fs->lock), NULL); + + get_jdir(name, jdir); + rv = mkdir(jdir, 0750); + rv = lstat(jdir, &sinfo); + if (rv < 0 || !S_ISDIR(sinfo.st_mode)) + return -1; + + snprintf(jlockfile, PATH_MAX, "%s/%s", jdir, "lock"); + if (access(jlockfile, F_OK) != 0) { + /* file doesn't exists, create it */ + jfd = open(jlockfile, O_RDWR | O_CREAT | O_SYNC, 0600); + } else { + jfd = open(jlockfile, O_RDWR | O_SYNC, 0600); + } + if (jfd < 0) + return -1; + + /* initialize the lock file by writing the first tid to it, but only + * if its empty, otherwise there is a race if two processes call + * jopen() simultaneously and both initialize the file */ + plockf(jfd, F_LOCK, 0, 0); + lstat(jlockfile, &sinfo); + if (sinfo.st_size == 0) { + t = 1; + rv = write(jfd, &t, sizeof(t)); + if (rv != sizeof(t)) { + plockf(jfd, F_ULOCK, 0, 0); + return -1; + } + } + plockf(jfd, F_ULOCK, 0, 0); + + fs->jfd = jfd; + + return fd; +} + +/* read wrapper */ +ssize_t jread(struct jfs *fs, void *buf, size_t count) +{ + int rv; + pthread_mutex_lock(&(fs->lock)); + lockf(fs->fd, F_LOCK, count); + rv = read(fs->fd, buf, count); + lockf(fs->fd, F_ULOCK, -count); + pthread_mutex_unlock(&(fs->lock)); + + return rv; +} + +/* pread wrapper */ +ssize_t jpread(struct jfs *fs, void *buf, size_t count, off_t offset) +{ + int rv; + plockf(fs->fd, F_LOCK, offset, count); + rv = pread(fs->fd, buf, count, offset); + plockf(fs->fd, F_ULOCK, offset, count); + + return rv; +} + +/* readv wrapper */ +ssize_t jreadv(struct jfs *fs, struct iovec *vector, int count) +{ + int rv, i; + size_t sum; + + sum = 0; + for (i = 0; i < count; i++) + sum += vector[i].iov_len; + + pthread_mutex_lock(&(fs->lock)); + lockf(fs->fd, F_LOCK, sum); + rv = readv(fs->fd, vector, count); + lockf(fs->fd, F_ULOCK, -sum); + pthread_mutex_unlock(&(fs->lock)); + + return rv; +} + +/* write wrapper */ +ssize_t jwrite(struct jfs *fs, void *buf, size_t count) +{ + int rv; + off_t pos; + struct jtrans ts; + + 
pthread_mutex_lock(&(fs->lock)); + + jtrans_init(fs, &ts); + pos = lseek(fs->fd, 0, SEEK_CUR); + ts.offset = pos; + + ts.buf = buf; + ts.len = count; + + rv = jtrans_commit(&ts); + + pthread_mutex_unlock(&(fs->lock)); + return rv; +} + +/* pwrite wrapper */ +ssize_t jpwrite(struct jfs *fs, void *buf, size_t count, off_t offset) +{ + int rv; + struct jtrans ts; + + pthread_mutex_lock(&(fs->lock)); + + jtrans_init(fs, &ts); + ts.offset = offset; + + ts.buf = buf; + ts.len = count; + + rv = jtrans_commit(&ts); + + pthread_mutex_unlock(&(fs->lock)); + return rv; +} + +/* writev wrapper */ +ssize_t jwritev(struct jfs *fs, struct iovec *vector, int count) +{ + int rv, i, bufp; + ssize_t sum; + char *buf; + off_t pos; + struct jtrans ts; + + sum = 0; + for (i = 0; i < count; i++) + sum += vector[i].iov_len; + + /* unify the buffers into one big chunk to commit */ + /* FIXME: can't we do this more efficient? It ruins the whole purpose + * of using writev() :\ + * maybe we should do one transaction per vector */ + buf = malloc(sum); + bufp = 0; + + for (i = 0; i < count; i++) { + memcpy(buf + bufp, vector[i].iov_base, vector[i].iov_len); + bufp += vector[i].iov_len; + } + + pthread_mutex_lock(&(fs->lock)); + + jtrans_init(fs, &ts); + pos = lseek(fs->fd, 0, SEEK_CUR); + ts.offset = pos; + + ts.buf = buf; + ts.len = sum; + + rv = jtrans_commit(&ts); + + pthread_mutex_unlock(&(fs->lock)); + return rv; + +} + +/* truncate a file - be careful with this */ +int jtruncate(struct jfs *fs, off_t lenght) +{ + int rv; + + /* lock from lenght to the end of file */ + plockf(fs->fd, F_LOCK, lenght, 0); + rv = ftruncate(fs->fd, lenght); + plockf(fs->fd, F_ULOCK, lenght, 0); + + return rv; +} + +/* close a file */ +int jclose(struct jfs *fs) +{ + if (close(fs->fd)) + return -1; + if (close(fs->jfd)) + return -1; + return 0; +} + + +/* + * journal recovery + */ + +/* check the journal and replay the incomplete transactions */ +int jfsck(char *name, struct jfsck_result *res) +{ + int fd, 
jfd, tfd, rv, i, maxtid; + char jdir[PATH_MAX], jlockfile[PATH_MAX], tname[PATH_MAX]; + char *buf = NULL; + struct stat sinfo; + struct jfs fs; + struct jtrans *curts; + DIR *dir; + off_t offset; + + fd = open(name, O_RDWR | O_SYNC | O_LARGEFILE); + if (fd < 0) + return J_ENOENT; + + fs.fd = fd; + fs.name = name; + + get_jdir(name, jdir); + rv = lstat(jdir, &sinfo); + if (rv < 0 || !S_ISDIR(sinfo.st_mode)) + return J_ENOJOURNAL; + + snprintf(jlockfile, PATH_MAX, "%s/%s", jdir, "lock"); + jfd = open(jlockfile, O_RDWR | O_SYNC, 0600); + if (jfd < 0) + return J_ENOJOURNAL; + + lstat(jlockfile, &sinfo); + if (sinfo.st_size == 0) + return J_ENOJOURNAL; + + plockf(jfd, F_LOCK, 0, 0); + rv = spread(jfd, &maxtid, sizeof(maxtid), 0); + if (rv != sizeof(maxtid)) { + return J_ENOJOURNAL; + } + plockf(jfd, F_ULOCK, 0, 0); + + fs.jfd = jfd; + + dir = opendir(jdir); + if (dir == NULL) + return J_ENOJOURNAL; + + /* we loop all the way up to the max transaction id */ + for (i = 1; i <= maxtid; i++) { + curts = malloc(sizeof(struct jtrans)); + jtrans_init(&fs, curts); + curts->id = i; + + /* open the transaction file, using i as its name, so we are + * really looping in order (recovering transaction in a + * different order as they were applied means instant + * corruption) */ + get_jtfile(name, i, tname); + tfd = open(tname, O_RDWR | O_SYNC | O_LARGEFILE, 0600); + if (tfd < 0) { + res->invalid++; + goto loop; + } + + /* try to lock the transaction file, if it's locked then it is + * currently being used so we skip it */ + rv = plockf(fd, F_TLOCK, 0, 0); + if (rv == -1) { + res->in_progress++; + goto loop; + } + + curts->name = tname; + + /* load from disk, header first */ + buf = (char *) malloc(J_DISKTFIXSIZE); + rv = read(tfd, buf, J_DISKTFIXSIZE); + if (rv != J_DISKTFIXSIZE) { + res->broken_head++; + goto loop; + } + + curts->flags = (int) *(buf + 4); + curts->len = (size_t) *(buf + 8); + curts->ulen = (size_t) *(buf + 16); + curts->offset = (off_t) *(buf + 20); + + /* if we 
got here, the transaction was not applied, so we + * check if the transaction file is complete (we only need to + * apply it) or not (so we can't do anything but ignore it) */ + + lstat(tname, &sinfo); + rv = J_DISKTFIXSIZE + curts->len + curts->ulen + curts->plen; + if (sinfo.st_size != rv) { + /* the transaction file is incomplete, some of the + * body is missing */ + res->broken_body++; + goto loop; + } + + /* we have a complete transaction file which commit was not + * successful, so we read it to complete the transaction + * structure and apply it again */ + curts->buf = malloc(curts->len); + curts->pdata = malloc(curts->plen); + curts->udata = malloc(curts->ulen); + + /* user data */ + offset = J_DISKTFIXSIZE; + rv = spread(tfd, curts->udata, curts->ulen, offset); + if (rv != curts->ulen) { + printf("ULEN\n"); + res->load_error++; + goto loop; + } + + /* previous data */ + offset = J_DISKTFIXSIZE + curts->ulen; + rv = spread(tfd, curts->pdata, curts->plen, offset); + if (rv != curts->plen) { + printf("PLEN\n"); + res->load_error++; + goto loop; + } + + /* real data */ + offset = J_DISKTFIXSIZE + curts->ulen + curts->plen; + rv = spread(tfd, curts->buf, curts->len, offset); + if (rv != curts->len) { + res->load_error++; + goto loop; + } + + /* apply */ + rv = jtrans_commit(curts); + if (rv < 0) { + res->apply_error++; + goto loop; + } + res->reapplied++; + + /* free the data we just allocated */ + if (curts->len) + free(curts->buf); + if (curts->plen) + free(curts->pdata); + if (curts->ulen) + free(curts->udata); + +loop: + if (tfd > 0) + close(tfd); + + res->total++; + if (buf) + free(buf); + free(curts); + } + + return 0; + +} + + diff --git a/libjio.h b/libjio.h new file mode 100644 index 0000000..4241ab0 --- /dev/null +++ b/libjio.h @@ -0,0 +1,104 @@ + +/* + * libjio - A library for Journalled I/O + * Alberto Bertogli (albertogli@telpin.com.ar) + */ + +#ifndef _LIBJIO_H +#define _LIBJIO_H + +#include <stdint.h> +#include <sys/types.h> +#include <sys/uio.h> 
+#include <pthread.h> + + +/* logical structures */ +struct jfs { + int fd; /* main file descriptor */ + char *name; /* and its name */ + int jfd; /* journal's lock file descriptor */ + int flags; /* journal mode options used in jopen() */ + pthread_mutex_t lock; /* a soft lock used in some operations */ +}; + +struct jtrans { + struct jfs *fs; /* journal file structure to operate on */ + char *name; /* name of the transaction file */ + int id; /* transaction id */ + int flags; /* misc flags */ + void *buf; /* buffer */ + size_t len; /* buffer lenght */ + off_t offset; /* file offset to operate on */ + void *udata; /* user-supplied data */ + size_t ulen; /* udata lenght */ + void *pdata; /* previous data, for rollback */ + size_t plen; /* pdata lenght */ +}; + +struct jfsck_result { + int total; /* total transactions files we looked at */ + int invalid; /* invalid files in the journal directory */ + int in_progress; /* transactions in progress */ + int broken_head; /* transactions broken (header missing) */ + int broken_body; /* transactions broken (body missing) */ + int load_error; /* errors loading the transaction */ + int apply_error; /* errors applying the transaction */ + int reapplied; /* transactions that were re-applied */ +}; + +/* on-disk structure */ +struct disk_trans { + + /* header (fixed lenght, defined below) */ + uint32_t id; /* id */ + uint32_t flags; /* flags about this transaction */ + uint32_t len; /* data lenght */ + uint32_t ulen; /* user-supplied information lenght */ + uint64_t offset; /* offset relative to the BOF */ + + /* payload (variable lenght) */ + char *udata; /* user-supplied data */ + char *prevdata; /* previous data, optional, for rollback */ + char *data; /* data */ +}; + + +/* basic operations */ +int jopen(struct jfs *fs, char *name, int flags, int mode, int jflags); +ssize_t jread(struct jfs *fs, void *buf, size_t count); +ssize_t jpread(struct jfs *fs, void *buf, size_t count, off_t offset); +ssize_t jreadv(struct jfs *fs, 
struct iovec *vector, int count); +ssize_t jwrite(struct jfs *fs, void *buf, size_t count); +ssize_t jpwrite(struct jfs *fs, void *buf, size_t count, off_t offset); +ssize_t jwritev(struct jfs *fs, struct iovec *vector, int count); +int jtruncate(struct jfs *fs, off_t lenght); +int jclose(struct jfs *fs); + +/* transaction operations */ +void jtrans_init(struct jfs *fs, struct jtrans *ts); +int jtrans_commit(struct jtrans *ts); +int jtrans_rollback(struct jtrans *ts); +void jtrans_free(struct jtrans *ts); + +/* journal checker */ +int jfsck(char *name, struct jfsck_result *res); + + +/* jfs constants */ +#define J_NOLOCK 1 /* don't lock the file before operating on it */ + +/* jtrans constants */ +#define J_COMMITED 1 /* mark a transaction as committed */ +#define J_ROLLBACKED 2 /* mark a transaction as rolled back */ + +/* disk_trans constants */ +#define J_DISKTFIXSIZE 24 /* length in bytes of disk_trans' fixed header (4 x uint32_t + 1 x uint64_t) */ + +/* jfsck constants (return values) */ +#define J_ESUCCESS 0 /* success - shouldn't be used */ +#define J_ENOENT 1 /* no such file */ +#define J_ENOJOURNAL 2 /* no journal associated */ + +#endif + diff --git a/samples/build b/samples/build new file mode 100755 index 0000000..736293c --- /dev/null +++ b/samples/build @@ -0,0 +1,4 @@ +gcc -Wall -O6 -I.. -lpthread jio1.c ../libjio.a -o jio1 +gcc -Wall -O6 -I.. -lpthread jio2.c ../libjio.a -o jio2 +gcc -Wall -O6 -I..
-lpthread jio3.c ../libjio.a -o jio3 + diff --git a/samples/clean b/samples/clean new file mode 100755 index 0000000..b5715c8 --- /dev/null +++ b/samples/clean @@ -0,0 +1,2 @@ +rm -rf jio1 jio2 jio3 test1 .test1.jio + diff --git a/samples/jio1.c b/samples/jio1.c new file mode 100644 index 0000000..4296ac6 --- /dev/null +++ b/samples/jio1.c @@ -0,0 +1,62 @@ + + +#include <stdio.h> +#include <string.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <unistd.h> + +#include <libjio.h> + +#define str "TESTTESTTEST1234\n" + +/* Write str to "test1" once through libjio. Returns 1 if the file could not be opened, 0 otherwise (a failed write is only reported). */ +int jio(void) +{ + int fd, rv; + struct jfs fs; + + fd = jopen(&fs, "test1", O_RDWR | O_CREAT | O_TRUNC | O_SYNC, 0660, 0); + if (fd < 0) { + perror("OPEN"); + return 1; /* do not touch fs after a failed jopen() */ + } + + rv = jwrite(&fs, str, strlen(str)); + if (rv != strlen(str)) + perror("WRITE"); + + jclose(&fs); /* pair every successful jopen() with a jclose() */ + + return 0; + +} + +/* Write str to "test1" once with plain open()/write(). Returns 1 if the file could not be opened, 0 otherwise (a failed write is only reported). */ +int classic(void) +{ + int fd, rv; + + fd = open("test1", O_RDWR | O_CREAT | O_TRUNC | O_SYNC, 0660); + if (fd < 0) { + perror("OPEN"); + return 1; /* nothing to write to */ + } + + rv = write(fd, str, strlen(str)); + if (rv != strlen(str)) + perror("WRITE"); + + close(fd); /* do not leak the descriptor */ + + return 0; + +} + + +/* Usage: jio1 [c|j] -- run the plain write() variant (c) or the libjio variant (j) once. */ +int main(int argc, char **argv) { + if (argc != 2) { + printf("Use: jio1 [c|j]\n"); + return 1; + } + + if (*argv[1] == 'c') + classic(); + else if (*argv[1] == 'j') + jio(); + else + printf("Use: jio1 [c|j]\n"); + + return 0; +} diff --git a/samples/jio2.c b/samples/jio2.c new file mode 100644 index 0000000..9d6d008 --- /dev/null +++ b/samples/jio2.c @@ -0,0 +1,73 @@ + + +#include <stdio.h> +#include <string.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <unistd.h> +#include <stdlib.h> + +#include <libjio.h> + + +#define str "TESTTESTTEST1234\n" + +/* Write str to "test1" once through libjio; main() calls this N times, so everything opened here must be closed here. Returns 1 if the file could not be opened, 0 otherwise (a failed write is only reported). */ +int jio(void) +{ + int fd, rv; + struct jfs fs; + + fd = jopen(&fs, "test1", O_RDWR | O_CREAT | O_TRUNC | O_SYNC, 0660, 0); + if (fd < 0) { + perror("OPEN"); + return 1; /* do not touch fs after a failed jopen() */ + } + + rv = jwrite(&fs, str, strlen(str)); + if (rv != strlen(str)) + perror("WRITE"); + + jclose(&fs); /* pair every successful jopen() with a jclose() */ + + return 0; + +} + +int classic(void) +{ + int fd, rv; + + fd = open("test1", O_RDWR | O_CREAT | O_TRUNC |
O_SYNC, 0660); + if (fd < 0) { + perror("OPEN"); + return 1; /* nothing to write to */ + } + + rv = write(fd, str, strlen(str)); + if (rv != strlen(str)) + perror("WRITE"); + + close(fd); /* called N times from main(): do not leak one descriptor per call */ + + return 0; + +} + + +/* Usage: jio2 [c|j] N -- run the plain write() variant (c) or the libjio variant (j) N times. */ +int main(int argc, char **argv) { + int i; + int N; + + /* both the mode and N are required; argv[2] is read below */ + if (argc != 3) { + printf("Use: jio2 [c|j] N\n"); + return 1; + } + + N = 0; + N = atoi(argv[2]); + + if (*argv[1] == 'c') + for (i = 0; i < N; i++) + classic(); + else if (*argv[1] == 'j') + for (i = 0; i < N; i++) + jio(); + else + printf("Use: jio2 [c|j] N\n"); + + return 0; +} + diff --git a/samples/jio3.c b/samples/jio3.c new file mode 100644 index 0000000..f239020 --- /dev/null +++ b/samples/jio3.c @@ -0,0 +1,43 @@ + + +#include <stdio.h> +#include <string.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <unistd.h> + +#include <libjio.h> + + +/* Commit one transaction that writes str at offset 0 of "test1", then roll it back. */ +int main(int argc, char **argv) +{ + int fd, rv; + struct jfs fs; + struct jtrans ts; + + fd = jopen(&fs, "test1", O_RDWR | O_CREAT | O_TRUNC | O_SYNC, 0660, 0); + if (fd < 0) { + perror("OPEN"); + return 1; /* do not start a transaction on an fs that failed to open */ + } + +#define str "ROLLBACKTEST!\n" + + jtrans_init(&fs, &ts); + + ts.offset = 0; + ts.buf = str; + ts.len = strlen(str); + + rv = jtrans_commit(&ts); + if (rv != strlen(str)) + perror("COMMIT"); + + rv = jtrans_rollback(&ts); + if (rv != 0) + perror("ROLLBACK"); + + return 0; + +} + + diff --git a/tests/.1.jio/1 b/tests/.1.jio/1 new file mode 100644 index 0000000..67b2ffa Binary files /dev/null and b/tests/.1.jio/1 differ diff --git a/tests/.1.jio/2 b/tests/.1.jio/2 new file mode 100644 index 0000000..96bdcc1 --- /dev/null +++ b/tests/.1.jio/2 @@ -0,0 +1 @@ +Not so random but yet broken transaction diff --git a/tests/.1.jio/3 b/tests/.1.jio/3 new file mode 100644 index 0000000..069bc8d Binary files /dev/null and b/tests/.1.jio/3 differ diff --git a/tests/.1.jio/4 b/tests/.1.jio/4 new file mode 100644 index 0000000..02b727b Binary files /dev/null and b/tests/.1.jio/4 differ diff --git a/tests/.1.jio/5 b/tests/.1.jio/5 new file mode 100644 index 0000000..f9b5055 Binary files /dev/null and
b/tests/.1.jio/5 differ diff --git a/tests/.1.jio/6 b/tests/.1.jio/6 new file mode 100644 index 0000000..e69de29 diff --git a/tests/.1.jio/8 b/tests/.1.jio/8 new file mode 100644 index 0000000..dc5c2bb Binary files /dev/null and b/tests/.1.jio/8 differ diff --git a/tests/.1.jio/desc b/tests/.1.jio/desc new file mode 100644 index 0000000..8b11482 --- /dev/null +++ b/tests/.1.jio/desc @@ -0,0 +1,14 @@ + +tid description + +1 random data +2 simple ascii text +3 valid transaction (stores 'TESTTESTTEST1234') +4 header only +5 header and a bit of data +6 empty +7 (doesn't exist) +8 valid but oversized + +This leads to a file with 'TESTTESTTEST1234' on it. + diff --git a/tests/.1.jio/lock b/tests/.1.jio/lock new file mode 100644 index 0000000..38424fc Binary files /dev/null and b/tests/.1.jio/lock differ diff --git a/tests/1 b/tests/1 new file mode 100644 index 0000000..de6c6a5 --- /dev/null +++ b/tests/1 @@ -0,0 +1 @@ +TESTTESTTEST1234