#!/usr/bin/python
# A backup script
# Alberto Bertogli (albertogli@telpin.com.ar)
# Version 0.03
import sys
import os
import sha
import cPickle
import re
from stat import *
#
# constants
#
PSIZE = 4 * 1024
VERSION = "0.03"
#
# classes
#
# file_info functions are not included directly in the class to avoid memory
# waste, which is about 1k per file.
def finfo_load(finfo):
    "Loads data from the file: lstat metadata, type character, content hash."
    s = os.lstat(finfo.fullname)
    finfo.stat = s
    # classify the file by its mode bits
    if S_ISREG(s.st_mode):
        finfo.type = 'r'
    elif S_ISLNK(s.st_mode):
        finfo.type = 'l'
        # the link target is needed to compare and recreate symlinks;
        # the original code wrongly did this in the FIFO branch (where
        # readlink() always fails) and left linkto unset for symlinks
        finfo.linkto = os.readlink(finfo.fullname)
    elif S_ISCHR(s.st_mode):
        finfo.type = 'c'
        finfo.rdev = s.st_rdev
    elif S_ISBLK(s.st_mode):
        finfo.type = 'b'
        finfo.rdev = s.st_rdev
    elif S_ISFIFO(s.st_mode):
        finfo.type = 'f'
    elif S_ISDIR(s.st_mode):
        finfo.type = 'd'
    else:
        # unknown type (e.g. unix socket); put_file() drops these entries
        finfo.type = 'u'
    finfo.mtime = s.st_mtime
    finfo.atime = s.st_atime
    finfo.size = s.st_size
    finfo.mode = s.st_mode
    finfo.uid = s.st_uid
    finfo.gid = s.st_gid
    if finfo.type == 'r':
        # only regular files have contents worth hashing
        finfo.hash = finfo.hash_file()
def finfo_cmp_mdata(finfo, other):
    "Compares metadata to other; 1 when all fields match, 0 otherwise."
    same = (finfo.mtime == other.mtime
            and finfo.mode == other.mode
            and finfo.uid == other.uid
            and finfo.gid == other.gid)
    if same:
        return 1
    return 0
def finfo_cmp_data(finfo, other):
    "Compares data to other; 1 when contents match, 0 otherwise."
    # the common fields first
    for attr in ('size', 'hash', 'type'):
        if getattr(finfo, attr) != getattr(other, attr):
            return 0
    # then the type-specific ones
    if finfo.type in ('b', 'c') and finfo.rdev != other.rdev:
        return 0
    if finfo.type == 'l' and finfo.linkto != other.linkto:
        return 0
    return 1
def finfo_copy_file_reg_raw(finfo, dst, psize=None):
    """Copy a regular file, byte for byte.

    psize: read block size; defaults to the module-level PSIZE.
    """
    if psize is None:
        psize = PSIZE
    # binary mode on both ends: the data may not be text, and text mode
    # would corrupt it on platforms that translate line endings (and
    # fail to decode arbitrary bytes on Python 3)
    sfile = open(finfo.fullname, 'rb')
    try:
        dfile = open(dst, 'wb')
        try:
            # the data, one block at a time
            data = sfile.read(psize)
            while data:
                dfile.write(data)
                data = sfile.read(psize)
        finally:
            dfile.close()
    finally:
        # try/finally so a write error doesn't leak the descriptors
        sfile.close()
def finfo_copy_file_reg_bzip2(finfo, dst, psize=None):
    """Copy a regular file; the destination is written bz2 compressed.

    psize: read block size; defaults to the module-level PSIZE.
    """
    import bz2
    if psize is None:
        psize = PSIZE
    # binary mode on both ends: the source may hold arbitrary bytes and
    # the compressed output certainly does (the original text-mode opens
    # would corrupt it outside unix/Python 2)
    sfile = open(finfo.fullname, 'rb')
    try:
        dfile = open(dst, 'wb')
        try:
            bcomp = bz2.BZ2Compressor()
            data = sfile.read(psize)
            while data:
                dfile.write(bcomp.compress(data))
                data = sfile.read(psize)
            # flush whatever the compressor still buffers
            dfile.write(bcomp.flush())
        finally:
            dfile.close()
    finally:
        sfile.close()
def finfo_copy_file_reg_gzip(finfo, dst, psize=None):
    """Copy a regular file; the destination is written gzip compressed.

    psize: read block size; defaults to the module-level PSIZE.
    """
    import gzip
    if psize is None:
        psize = PSIZE
    # binary mode: gzip streams are binary, and the source may be too
    sfile = open(finfo.fullname, 'rb')
    try:
        dfile = gzip.open(dst, 'wb')
        try:
            data = sfile.read(psize)
            while data:
                dfile.write(data)
                data = sfile.read(psize)
        finally:
            dfile.close()
    finally:
        # try/finally so a write error doesn't leak the descriptors
        sfile.close()
# finfo_copy_file_reg is selected at runtime from the --copy-mode command
# line option (see the configuration section at the bottom of the file),
# e.g.:
#finfo_copy_file_reg = finfo_copy_file_reg_gzip
def finfo_copy_file_link(finfo, dst):
    "Recreate the symbolic link at dst, pointing at the same target."
    os.symlink(os.readlink(finfo.fullname), dst)
def finfo_copy_file_dev(finfo, dst):
    "Recreate a device node at dst with the stored mode and device number."
    # split the stored rdev into major/minor and rebuild it explicitly
    devnum = os.makedev(os.major(finfo.rdev), os.minor(finfo.rdev))
    os.mknod(dst, finfo.mode, devnum)
def finfo_update_mdata(finfo, dst):
    "Updates a file's metadata (ownership, times and permissions) on dst."
    # lchown never follows symlinks, so it is safe for every file type
    os.lchown(dst, finfo.uid, finfo.gid)
    if finfo.type != 'l':
        # these don't really like symlinks: utime and chmod follow the
        # link and would modify the target instead, so skip them here
        os.utime(dst, (finfo.atime, finfo.mtime))
        # keep only the permission bits (including setuid/setgid/sticky)
        os.chmod(dst, finfo.mode & 07777)
def finfo_copy_file(finfo, dst):
    """Copies a file to dst according to its type.

    Regular files go through the configured finfo_copy_file_reg;
    symlinks, devices, fifos and directories are recreated.
    Raises RuntimeError for unknown file types.
    """
    # create the path to dst if it doesn't exist
    make_path(dst)
    # copy accordingly to the file type
    if finfo.type == 'r':
        finfo_copy_file_reg(finfo, dst)
    elif finfo.type == 'l':
        finfo_copy_file_link(finfo, dst)
    elif finfo.type == 'b' or finfo.type == 'c':
        finfo_copy_file_dev(finfo, dst)
    elif finfo.type == 'f':
        # we just create fifos, there is no content to copy
        os.mkfifo(dst, finfo.mode & 0o7777)
    elif finfo.type == 'd':
        # we just create directories
        try:
            os.makedirs(dst, finfo.mode & 0o7777)
        except OSError:
            # ignore if the dir already exists, it could
            # happen because the walker doesn't do it in
            # any kind of order, so a subdirectory might
            # be created before the parent.
            pass
    else:
        # raise a real exception: string exceptions were removed in
        # Python 2.6; also use %s for the name — it is a string, and
        # the original %d would itself have raised TypeError
        raise RuntimeError('Unk type: 0x%x %s' % (finfo.mode, finfo.name))
def finfo_hash_file_sha(finfo, psize=None):
    """Returns the sha1sum (hex digest) of the file's contents.

    psize: read block size; defaults to the module-level PSIZE.
    """
    # hashlib replaces the 'sha' module, which was deprecated in
    # Python 2.5 and removed in Python 3
    import hashlib
    if psize is None:
        psize = PSIZE
    digest = hashlib.sha1()
    # binary mode: hash the raw bytes, not decoded text
    f = open(finfo.fullname, 'rb')
    try:
        data = f.read(psize)
        while data:
            digest.update(data)
            data = f.read(psize)
    finally:
        f.close()
    return digest.hexdigest()
def finfo_hash_file_md5(finfo, psize=None):
    """Returns the md5sum (hex digest) of the file's contents.

    psize: read block size; defaults to the module-level PSIZE.
    """
    # hashlib replaces the 'md5' module, which was deprecated in
    # Python 2.5 and removed in Python 3
    import hashlib
    if psize is None:
        psize = PSIZE
    digest = hashlib.md5()
    # binary mode: hash the raw bytes, not decoded text
    f = open(finfo.fullname, 'rb')
    try:
        data = f.read(psize)
        while data:
            digest.update(data)
            data = f.read(psize)
    finally:
        f.close()
    return digest.hexdigest()
def finfo_hash_file_none(finfo):
    """Dummy hash used when content hashing is disabled; always '-'."""
    return '-'
# the hash function is modified by configuration (the --hash-mode option
# handling at the bottom of the file assigns the chosen implementation to
# file_info.hash_file).
# NOTE(review): this module-level alias looks unused — finfo_load() calls
# the hash_file *method* set on file_info, not this name; confirm before
# removing.
finfo_hash_file = finfo_hash_file_sha
class file_info:
    "Represents a single file's metadata within an index."
    def __init__(self, name, fullname):
        self.name = name
        self.fullname = fullname
        # numeric metadata defaults to 0 until finfo_load() fills it in
        for attr in ('mode', 'uid', 'gid', 'mtime', 'atime', 'size'):
            setattr(self, attr, 0)
        self.type = ''
        # type-specific fields stay None unless applicable
        for attr in ('linkto', 'rdev', 'hash', 'stat'):
            setattr(self, attr, None)

    def __repr__(self):
        return "<%s: %s %d>" % (self.name, self.type, self.size)

    def __eq__(self, other):
        "Two entries are equal when name, data and metadata all match."
        if self.name != other.name:
            return 0
        if not finfo_cmp_data(self, other):
            return 0
        if not finfo_cmp_mdata(self, other):
            return 0
        return 1

    def __ne__(self, other):
        return not self.__eq__(other)
class index_file:
    "Represents the index file: the on-disk database of file metadata."
    def __init__(self, name):
        # name: path of the index file on disk
        self.name = name
        # filename -> file_info
        self.db = {}
        # filenames, in insertion order
        self.names = []
        # filename -> full path (never saved to disk)
        self.pathdb = {}
    def load(self):
        "Loads data from the file."
        try:
            # binary mode: the index is a pickle, not text
            f = open(self.name, 'rb')
        except IOError:
            # probably file doesn't exist, ignore
            return
        (self.db, self.names) = cPickle.load(f)
        f.close()
    def save(self):
        "Saves the index to the disk."
        # void fullname so we don't store unnecessary information
        for f in self.db.keys():
            self.db[f].fullname = ''
        # binary mode is required: HIGHEST_PROTOCOL is a binary pickle
        # format, and the original text-mode 'w' could corrupt it
        f = open(self.name, 'wb')
        cPickle.dump((self.db, self.names), f, cPickle.HIGHEST_PROTOCOL)
        f.close()
    def put_file(self, filename, fullpath):
        "Incorporates a file into the index."
        self.db[filename] = file_info(filename, fullpath)
        finfo_load(self.db[filename])
        if self.db[filename].type == 'u':
            # ignore files of unknown types, like unix sockets
            del(self.db[filename])
            return
        self.names.append(filename)
        self.pathdb[filename] = fullpath
    def get_file(self, filename):
        "Get the file_info object for the given filename."
        return self.db[filename]
    def populate(self, root, exclude=None):
        """Populate the index from a root path.

        exclude: optional list of compiled regular expressions; matching
        relative names are skipped. Defaults to no exclusions (a None
        sentinel is used instead of a mutable default argument); the
        original required it, which made build_idx()'s call a TypeError.
        """
        if exclude is None:
            exclude = []
        def skip_file(relname):
            "Check if the file matches the exclude list"
            for r in exclude:
                if r.search(relname):
                    return 1
            return 0
        root = os.path.abspath(root)
        # index entries are stored relative to root's parent
        base, reduced = os.path.split(root)
        self.put_file(reduced, root)
        tree = os.walk(root, topdown = True)
        for path, childs, files in tree:
            for f in files:
                full = path + '/' + f
                name = relative_path(base, full)
                if skip_file(name):
                    continue
                self.put_file(name, full)
            for c in childs:
                full = path + '/' + c
                name = relative_path(base, full)
                if skip_file(name):
                    continue
                self.put_file(name, full)
def quiet_unlink(path):
    "Best-effort unlink: remove path if present, do nothing otherwise."
    try:
        os.unlink(path)
    except OSError:
        # missing (or not removable): nothing to do
        pass
def force_unlink(path, type):
    """Removes a file or directory, ignoring errors.

    type is the file_info type character; 'd' means directory.
    """
    if type != 'd':
        try:
            os.unlink(path)
        except OSError:
            # already gone (or not removable): best effort only
            pass
    else:
        try:
            # rmdir removes only this (empty) directory; the original
            # os.removedirs would also prune now-empty *parent*
            # directories, deleting ancestors that should survive
            os.rmdir(path)
        except OSError:
            # non-empty or missing: leave it alone
            pass
def make_path(f):
    "If f is 'a/b/c/f', make sure the directory 'a/b/c' exists."
    parent = os.path.dirname(f)
    try:
        os.makedirs(parent)
    except OSError:
        # it can fail if the directory already exists; ignore it
        pass
def relative_path(base, path):
    """If base = '/x/x' and path = '/x/x/b/c/d', returns 'b/c/d'. Both
    must be absolute for simplicity. (The original docstring's example
    used base = '/x/x/b', but the code strips len(base) characters, so
    the base must be the parent directory.)"""
    res = path[len(base):]
    # strip leading slashes; the emptiness guard makes path == base
    # return '' instead of raising IndexError
    while res and res[0] == '/':
        res = res[1:]
    return res
#
# main operations
#
def make_sync(sources, srcidx_path, dst_path, dstidx_path, exclude):
    """Sync the source directories into dst_path.

    sources: list of source directories to back up
    srcidx_path: where the freshly built source index is saved
    dst_path: destination directory
    dstidx_path: existing index describing the destination's contents
    exclude: list of regular expression strings; matching relative
        names are skipped
    """
    # destination and indexes are always a complete path
    srcidx_path = os.path.join(os.getcwd(), srcidx_path)
    dst_path = os.path.join(os.getcwd(), dst_path)
    dstidx_path = os.path.join(os.getcwd(), dstidx_path)
    # process regular expressions (compile once, used for every file)
    exclude_re = []
    for r in exclude:
        exclude_re.append(re.compile(r))
    # load destination index
    printv("* loading destination index")
    dstidx = index_file(dstidx_path)
    dstidx.load()
    # create source index by walking every source directory
    printv("* building source index")
    srcidx = index_file(srcidx_path)
    for src_path in sources:
        printv("\t* " + src_path)
        srcidx.populate(src_path, exclude_re)
    printv("* sync")
    # compare them: copy what is new or changed, remember what needs a
    # metadata refresh for the second pass below
    update_files = []
    for f in srcidx.names:
        if f not in dstidx.names or \
                not finfo_cmp_data(srcidx.db[f], dstidx.db[f]):
            # files missing in destination, or data changed
            #dst = os.path.join(dst_path, f)
            dst = dst_path + '/' + f
            printv('data\t', f, dst)
            # unlink first so the copy never writes through a stale
            # symlink or collides with an old file of another type
            quiet_unlink(dst)
            finfo_copy_file(srcidx.db[f], dst)
            update_files.append((f, dst))
        elif not finfo_cmp_mdata(srcidx.db[f], dstidx.db[f]):
            # metadata changed
            #dst = os.path.join(dst_path, f)
            dst = dst_path + '/' + f
            printv('mdata\t', f, dst)
            update_files.append((f, dst))
    # metadata gets changed later because otherwise we could leave
    # directory times wrong due to files being added to a directory after
    # their creation; this way we're sure there will be no more file
    # creation afterwards
    printv('* mdata')
    for f, dst in update_files:
        try:
            finfo_update_mdata(srcidx.db[f], dst)
        except:
            # it can fail if the destination doesn't have the
            # file, ignore for now; TODO: output some kind of
            # script so people can run it later when they get all
            # back together
            pass
    printv('* unlink')
    for f in dstidx.names:
        if f not in srcidx.names:
            # files in destination and not in source
            #dst = os.path.join(dst_path, f)
            dst = dst_path + '/' + f
            printv('unlink\t', f, dst)
            force_unlink(dst, dstidx.db[f].type)
    # we save the index at last because it voids file_info.fullname so we
    # don't save unnecessary information
    printv('* saving index')
    srcidx.save()
def show_idx(idx_path):
    "Loads the given index file and prints one line per entry."
    printv("* loading index")
    idx = index_file(idx_path)
    idx.load()
    for name in idx.names:
        entry = idx.db[name]
        line = "%s %d %f %s %s" % (entry.type, entry.size, entry.mtime,
            str(entry.hash), entry.name)
        printv(line)
def build_idx(idx_path, path):
    "Builds an index file (idx_path) for the given directory (path)."
    printv("* building index")
    # normalize: strip trailing slashes, but keep '/' itself
    # (see comments in make_sync())
    while path[-1] == '/' and path != '/':
        path = path[:-1]
    # the index path must survive the chdir below, so make it absolute
    idx_path = os.path.join(os.getcwd(), idx_path)
    if path != '/':
        # work from the parent so the index stores paths relative to it
        parent, src = os.path.split(path)
        if parent:
            os.chdir(parent)
        path = src
    # build the index; populate() takes the exclude list as its second
    # argument -- the original call omitted it and raised TypeError
    idx = index_file(idx_path)
    idx.populate(path, [])
    idx.save()
#
# helper functions
#
def printv(*params):
    "Equivalent to 'if verbose: print params'."
    # 'verbose' is the module-level flag set from the command line
    # options at the bottom of the file
    if not verbose:
        return
    # Python 2 print statement: the trailing comma separates the items
    # with spaces and holds back the newline until the final bare print
    for i in params:
        print i,
    print
def parse_options():
    """Commandline options parser.

    Returns (parser, opts, args) so the caller can use parser.error()
    for its own argument validation.
    """
    from optparse import OptionParser

    class AbkOptionParser(OptionParser):
        "Custom abk command line option parser."
        def format_help(self, formatter=None):
            "Displays the description before usage."
            if formatter is None:
                formatter = self.formatter
            result = []
            if self.description:
                # "prog - description" goes first, then a blank line
                # before the usage text
                result.append(self.get_prog_name() + ' - ' + \
                    self.format_description(formatter) + "\n\n")
            if self.usage:
                result.append(self.get_usage() + "\n")
            result.append(self.format_option_help(formatter))
            return "".join(result)

    usage = """%prog [options] command params
commands:
  show idx_file
    shows the given index file contents
  mkidx idx_file dir
    builds an index file for the given directory
  sync idx src1 [src2 ... srcN] dst
    synchronizes all sources with dst, using the given idx index file"""

    parser = AbkOptionParser(usage=usage, description="A backup script - "
        "Alberto Bertogli (albertogli@telpin.com.ar)",
        version="%prog " + VERSION, prog='abk')

    # -v and -q share the same dest: the last flag given wins, and
    # verbose output is on by default
    parser.add_option("-v", "--verbose",
        action="store_true", dest="verbose", default=True,
        help="print progress information [default]")
    parser.add_option("-q", "--quiet",
        action="store_false", dest="verbose",
        help="don't print progress information (just errors)")
    parser.add_option("-c", "--copy-mode", default='gzip', metavar="MODE",
        action="store", dest="copy_mode",
        help="select copy mode to use. Available modes: "
        "raw, gzip, bzip2 [default: gzip]")
    parser.add_option("-a", "--hash-mode", default='sha', metavar="MODE",
        action="store", dest="hash_mode",
        help="select the hash to use to check for file content change. "
        "Available modes: none, sha, md5 [default: sha]")
    # may be given several times; optparse accumulates them in a list
    parser.add_option("-e", "--exclude", metavar="REGEX",
        action="append", dest="exclude",
        help="excludes files that matches with the regular expression. "
        "This option accepts multiple instances")
    parser.add_option("-i", "--new-idx", metavar="FILE",
        action="store", dest="new_idx",
        help="select where to write the new generated index. "
        "This is useful for incremental backups. "
        "If not specified, the old index file (idx) is overwritten")

    (opts, args) = parser.parse_args()
    return (parser, opts, args)
#
# main
#

# command line options
(parser, opts, args) = parse_options()
verbose = opts.verbose

# configuration: pick the copy and hash implementations
if opts.copy_mode == 'raw':
    finfo_copy_file_reg = finfo_copy_file_reg_raw
elif opts.copy_mode == 'gzip':
    finfo_copy_file_reg = finfo_copy_file_reg_gzip
elif opts.copy_mode == 'bzip2':
    finfo_copy_file_reg = finfo_copy_file_reg_bzip2
else:
    parser.error("Invalid copy mode (%s)." % opts.copy_mode)

if opts.hash_mode == 'none':
    file_info.hash_file = finfo_hash_file_none
elif opts.hash_mode == 'md5':
    file_info.hash_file = finfo_hash_file_md5
elif opts.hash_mode == 'sha':
    file_info.hash_file = finfo_hash_file_sha
else:
    parser.error("Invalid hash mode (%s)." % opts.hash_mode)

# main command; catch only IndexError from the argument lookups: the
# original bare 'except:' also wrapped the command calls themselves, so
# any internal failure (e.g. a corrupt index) was misreported as a
# missing parameter
try:
    cmd = args[0]
except IndexError:
    parser.error("Command missing.")

if cmd == 'show':
    try:
        idx_path = args[1]
    except IndexError:
        parser.error("Missing idx_file parameter.")
    show_idx(idx_path)
elif cmd == 'mkidx':
    try:
        idx_path = args[1]
        path = args[2]
    except IndexError:
        parser.error("Missing parameter(s) for command mkidx.")
    build_idx(idx_path, path)
elif cmd == 'sync':
    # sync needs at least: idx, one source, dst. The original try/except
    # around the slicing could never trigger (args[2:-1] never raises and
    # args[-1] exists whenever cmd does), so check explicitly.
    if len(args) < 4:
        parser.error("Missing parameter(s) for command sync.")
    old_idx_path = args[1]
    new_idx_path = old_idx_path
    sources = args[2:-1]
    dst_path = args[-1]
    if opts.new_idx:
        new_idx_path = opts.new_idx
    exclude = []
    if opts.exclude:
        exclude = opts.exclude
    make_sync(sources, new_idx_path, dst_path, old_idx_path, exclude)
else:
    parser.error("Unknown command (%s)." % cmd)