#!/usr/bin/python
# A backup script
# Alberto Bertogli (albertogli@telpin.com.ar)
# Version 0.03
import sys
import os
import sha
import cPickle
import re
from stat import *
#
# constants
#
PSIZE = 4 * 1024
VERSION = "0.03"
#
# classes
#
# file_info functions are not included directly in the class to avoid memory
# waste, which is about 1k per file.
def finfo_load(finfo):
    "Loads data from the file: lstat metadata, type character, content hash."
    s = os.lstat(finfo.fullname)
    finfo.stat = s
    # classify the file by its mode bits
    if S_ISREG(s.st_mode):
        finfo.type = 'r'
    elif S_ISLNK(s.st_mode):
        finfo.type = 'l'
        # the link target is needed to compare and recreate symlinks;
        # the original code wrongly did this in the FIFO branch (where
        # readlink() always fails) and left linkto unset for symlinks
        finfo.linkto = os.readlink(finfo.fullname)
    elif S_ISCHR(s.st_mode):
        finfo.type = 'c'
        finfo.rdev = s.st_rdev
    elif S_ISBLK(s.st_mode):
        finfo.type = 'b'
        finfo.rdev = s.st_rdev
    elif S_ISFIFO(s.st_mode):
        finfo.type = 'f'
    elif S_ISDIR(s.st_mode):
        finfo.type = 'd'
    else:
        # unknown type (e.g. unix socket); put_file() drops these entries
        finfo.type = 'u'
    finfo.mtime = s.st_mtime
    finfo.atime = s.st_atime
    finfo.size = s.st_size
    finfo.mode = s.st_mode
    finfo.uid = s.st_uid
    finfo.gid = s.st_gid
    if finfo.type == 'r':
        # only regular files have contents worth hashing
        finfo.hash = finfo.hash_file()
def finfo_cmp_mdata(finfo, other):
    "Compares metadata to other; 1 when all fields match, 0 otherwise."
    same = (finfo.mtime == other.mtime
            and finfo.mode == other.mode
            and finfo.uid == other.uid
            and finfo.gid == other.gid)
    if same:
        return 1
    return 0
def finfo_cmp_data(finfo, other):
    "Compares data to other; 1 when contents match, 0 otherwise."
    # the common fields first
    for attr in ('size', 'hash', 'type'):
        if getattr(finfo, attr) != getattr(other, attr):
            return 0
    # then the type-specific ones
    if finfo.type in ('b', 'c') and finfo.rdev != other.rdev:
        return 0
    if finfo.type == 'l' and finfo.linkto != other.linkto:
        return 0
    return 1
def finfo_copy_file_reg_raw(finfo, dst, psize=None):
    """Copy a regular file, byte for byte.

    psize: read block size; defaults to the module-level PSIZE.
    """
    if psize is None:
        psize = PSIZE
    # binary mode on both ends: the data may not be text, and text mode
    # would corrupt it on platforms that translate line endings (and
    # fail to decode arbitrary bytes on Python 3)
    sfile = open(finfo.fullname, 'rb')
    try:
        dfile = open(dst, 'wb')
        try:
            # the data, one block at a time
            data = sfile.read(psize)
            while data:
                dfile.write(data)
                data = sfile.read(psize)
        finally:
            dfile.close()
    finally:
        # try/finally so a write error doesn't leak the descriptors
        sfile.close()
def finfo_copy_file_reg_bzip2(finfo, dst, psize=None):
    """Copy a regular file; the destination is written bz2 compressed.

    psize: read block size; defaults to the module-level PSIZE.
    """
    import bz2
    if psize is None:
        psize = PSIZE
    # binary mode on both ends: the source may hold arbitrary bytes and
    # the compressed output certainly does (the original text-mode opens
    # would corrupt it outside unix/Python 2)
    sfile = open(finfo.fullname, 'rb')
    try:
        dfile = open(dst, 'wb')
        try:
            bcomp = bz2.BZ2Compressor()
            data = sfile.read(psize)
            while data:
                dfile.write(bcomp.compress(data))
                data = sfile.read(psize)
            # flush whatever the compressor still buffers
            dfile.write(bcomp.flush())
        finally:
            dfile.close()
    finally:
        sfile.close()
def finfo_copy_file_reg_gzip(finfo, dst, psize=None):
    """Copy a regular file; the destination is written gzip compressed.

    psize: read block size; defaults to the module-level PSIZE.
    """
    import gzip
    if psize is None:
        psize = PSIZE
    # binary mode: gzip streams are binary, and the source may be too
    sfile = open(finfo.fullname, 'rb')
    try:
        dfile = gzip.open(dst, 'wb')
        try:
            data = sfile.read(psize)
            while data:
                dfile.write(data)
                data = sfile.read(psize)
        finally:
            dfile.close()
    finally:
        # try/finally so a write error doesn't leak the descriptors
        sfile.close()
# finfo_copy_file_reg is selected at runtime from the --copy-mode command
# line option (see the configuration section at the bottom of the file),
# e.g.:
#finfo_copy_file_reg = finfo_copy_file_reg_gzip
def finfo_copy_file_link(finfo, dst):
    "Recreate the symbolic link at dst, pointing at the same target."
    os.symlink(os.readlink(finfo.fullname), dst)
def finfo_copy_file_dev(finfo, dst):
    "Recreate a device node at dst with the stored mode and device number."
    # split the stored rdev into major/minor and rebuild it explicitly
    devnum = os.makedev(os.major(finfo.rdev), os.minor(finfo.rdev))
    os.mknod(dst, finfo.mode, devnum)
def finfo_update_mdata(finfo, dst):
    "Updates a file's metadata (ownership, times and permissions) on dst."
    # lchown never follows symlinks, so it is safe for every file type
    os.lchown(dst, finfo.uid, finfo.gid)
    if finfo.type != 'l':
        # these don't really like symlinks: utime and chmod follow the
        # link and would modify the target instead, so skip them here
        os.utime(dst, (finfo.atime, finfo.mtime))
        # keep only the permission bits (including setuid/setgid/sticky)
        os.chmod(dst, finfo.mode & 07777)
def finfo_copy_file(finfo, dst):
    """Copies a file to dst according to its type.

    Regular files go through the configured finfo_copy_file_reg;
    symlinks, devices, fifos and directories are recreated.
    Raises RuntimeError for unknown file types.
    """
    # create the path to dst if it doesn't exist
    make_path(dst)
    # copy accordingly to the file type
    if finfo.type == 'r':
        finfo_copy_file_reg(finfo, dst)
    elif finfo.type == 'l':
        finfo_copy_file_link(finfo, dst)
    elif finfo.type == 'b' or finfo.type == 'c':
        finfo_copy_file_dev(finfo, dst)
    elif finfo.type == 'f':
        # we just create fifos, there is no content to copy
        os.mkfifo(dst, finfo.mode & 0o7777)
    elif finfo.type == 'd':
        # we just create directories
        try:
            os.makedirs(dst, finfo.mode & 0o7777)
        except OSError:
            # ignore if the dir already exists, it could
            # happen because the walker doesn't do it in
            # any kind of order, so a subdirectory might
            # be created before the parent.
            pass
    else:
        # raise a real exception: string exceptions were removed in
        # Python 2.6; also use %s for the name — it is a string, and
        # the original %d would itself have raised TypeError
        raise RuntimeError('Unk type: 0x%x %s' % (finfo.mode, finfo.name))
def finfo_hash_file_sha(finfo, psize=None):
    """Returns the sha1sum (hex digest) of the file's contents.

    psize: read block size; defaults to the module-level PSIZE.
    """
    # hashlib replaces the 'sha' module, which was deprecated in
    # Python 2.5 and removed in Python 3
    import hashlib
    if psize is None:
        psize = PSIZE
    digest = hashlib.sha1()
    # binary mode: hash the raw bytes, not decoded text
    f = open(finfo.fullname, 'rb')
    try:
        data = f.read(psize)
        while data:
            digest.update(data)
            data = f.read(psize)
    finally:
        f.close()
    return digest.hexdigest()
def finfo_hash_file_md5(finfo, psize=None):
    """Returns the md5sum (hex digest) of the file's contents.

    psize: read block size; defaults to the module-level PSIZE.
    """
    # hashlib replaces the 'md5' module, which was deprecated in
    # Python 2.5 and removed in Python 3
    import hashlib
    if psize is None:
        psize = PSIZE
    digest = hashlib.md5()
    # binary mode: hash the raw bytes, not decoded text
    f = open(finfo.fullname, 'rb')
    try:
        data = f.read(psize)
        while data:
            digest.update(data)
            data = f.read(psize)
    finally:
        f.close()
    return digest.hexdigest()
def finfo_hash_file_none(finfo):
    """Dummy hash used when content hashing is disabled; always '-'."""
    return '-'
# the hash function is modified by configuration (the --hash-mode option
# handling at the bottom of the file assigns the chosen implementation to
# file_info.hash_file).
# NOTE(review): this module-level alias looks unused — finfo_load() calls
# the hash_file *method* set on file_info, not this name; confirm before
# removing.
finfo_hash_file = finfo_hash_file_sha
class file_info:
    "Represents a single file's metadata within an index."
    def __init__(self, name, fullname):
        self.name = name
        self.fullname = fullname
        # numeric metadata defaults to 0 until finfo_load() fills it in
        for attr in ('mode', 'uid', 'gid', 'mtime', 'atime', 'size'):
            setattr(self, attr, 0)
        self.type = ''
        # type-specific fields stay None unless applicable
        for attr in ('linkto', 'rdev', 'hash', 'stat'):
            setattr(self, attr, None)

    def __repr__(self):
        return "<%s: %s %d>" % (self.name, self.type, self.size)

    def __eq__(self, other):
        "Two entries are equal when name, data and metadata all match."
        if self.name != other.name:
            return 0
        if not finfo_cmp_data(self, other):
            return 0
        if not finfo_cmp_mdata(self, other):
            return 0
        return 1

    def __ne__(self, other):
        return not self.__eq__(other)
class index_file:
    "Represents the index file: the on-disk database of file metadata."
    def __init__(self, name):
        # name: path of the index file on disk
        self.name = name
        # filename -> file_info
        self.db = {}
        # filenames, in insertion order
        self.names = []
        # filename -> full path (never saved to disk)
        self.pathdb = {}
    def load(self):
        "Loads data from the file."
        try:
            # binary mode: the index is a pickle, not text
            f = open(self.name, 'rb')
        except IOError:
            # probably file doesn't exist, ignore
            return
        (self.db, self.names) = cPickle.load(f)
        f.close()
    def save(self):
        "Saves the index to the disk."
        # void fullname so we don't store unnecessary information
        for f in self.db.keys():
            self.db[f].fullname = ''
        # binary mode is required: HIGHEST_PROTOCOL is a binary pickle
        # format, and the original text-mode 'w' could corrupt it
        f = open(self.name, 'wb')
        cPickle.dump((self.db, self.names), f, cPickle.HIGHEST_PROTOCOL)
        f.close()
    def put_file(self, filename, fullpath):
        "Incorporates a file into the index."
        self.db[filename] = file_info(filename, fullpath)
        finfo_load(self.db[filename])
        if self.db[filename].type == 'u':
            # ignore files of unknown types, like unix sockets
            del(self.db[filename])
            return
        self.names.append(filename)
        self.pathdb[filename] = fullpath
    def get_file(self, filename):
        "Get the file_info object for the given filename."
        return self.db[filename]
    def populate(self, root, exclude=None):
        """Populate the index from a root path.

        exclude: optional list of compiled regular expressions; matching
        relative names are skipped. Defaults to no exclusions (a None
        sentinel is used instead of a mutable default argument); the
        original required it, which made build_idx()'s call a TypeError.
        """
        if exclude is None:
            exclude = []
        def skip_file(relname):
            "Check if the file matches the exclude list"
            for r in exclude:
                if r.search(relname):
                    return 1
            return 0
        root = os.path.abspath(root)
        # index entries are stored relative to root's parent
        base, reduced = os.path.split(root)
        self.put_file(reduced, root)
        tree = os.walk(root, topdown = True)
        for path, childs, files in tree:
            for f in files:
                full = path + '/' + f
                name = relative_path(base, full)
                if skip_file(name):
                    continue
                self.put_file(name, full)
            for c in childs:
                full = path + '/' + c
                name = relative_path(base, full)
                if skip_file(name):
                    continue
                self.put_file(name, full)
def quiet_unlink(path):
    "Best-effort unlink: remove path if present, do nothing otherwise."
    try:
        os.unlink(path)
    except OSError:
        # missing (or not removable): nothing to do
        pass
def force_unlink(path, type):
    """Removes a file or directory, ignoring errors.

    type is the file_info type character; 'd' means directory.
    """
    if type != 'd':
        try:
            os.unlink(path)
        except OSError:
            # already gone (or not removable): best effort only
            pass
    else:
        try:
            # rmdir removes only this (empty) directory; the original
            # os.removedirs would also prune now-empty *parent*
            # directories, deleting ancestors that should survive
            os.rmdir(path)
        except OSError:
            # non-empty or missing: leave it alone
            pass
def make_path(f):
    "If f is 'a/b/c/f', make sure the directory 'a/b/c' exists."
    parent = os.path.dirname(f)
    try:
        os.makedirs(parent)
    except OSError:
        # it can fail if the directory already exists; ignore it
        pass
def relative_path(base, path):
    """If base = '/x/x' and path = '/x/x/b/c/d', returns 'b/c/d'. Both
    must be absolute for simplicity. (The original docstring's example
    used base = '/x/x/b', but the code strips len(base) characters, so
    the base must be the parent directory.)"""
    res = path[len(base):]
    # strip leading slashes; the emptiness guard makes path == base
    # return '' instead of raising IndexError
    while res and res[0] == '/':
        res = res[1:]
    return res
#
# main operations
#
def make_sync(sources, srcidx_path, dst_path, dstidx_path, exclude):
    """Sync the source directories into dst_path.

    sources: list of source directories to back up
    srcidx_path: where the freshly built source index is saved
    dst_path: destination directory
    dstidx_path: existing index describing the destination's contents
    exclude: list of regular expression strings; matching relative
        names are skipped
    """
    # destination and indexes are always a complete path
    srcidx_path = os.path.join(os.getcwd(), srcidx_path)
    dst_path = os.path.join(os.getcwd(), dst_path)
    dstidx_path = os.path.join(os.getcwd(), dstidx_path)
    # process regular expressions (compile once, used for every file)
    exclude_re = []
    for r in exclude:
        exclude_re.append(re.compile(r))
    # load destination index
    printv("* loading destination index")
    dstidx = index_file(dstidx_path)
    dstidx.load()
    # create source index by walking every source directory
    printv("* building source index")
    srcidx = index_file(srcidx_path)
    for src_path in sources:
        printv("\t* " + src_path)
        srcidx.populate(src_path, exclude_re)
    printv("* sync")
    # compare them: copy what is new or changed, remember what needs a
    # metadata refresh for the second pass below
    update_files = []
    for f in srcidx.names:
        if f not in dstidx.names or \
                not finfo_cmp_data(srcidx.db[f], dstidx.db[f]):
            # files missing in destination, or data changed
            #dst = os.path.join(dst_path, f)
            dst = dst_path + '/' + f
            printv('data\t', f, dst)
            # unlink first so the copy never writes through a stale
            # symlink or collides with an old file of another type
            quiet_unlink(dst)
            finfo_copy_file(srcidx.db[f], dst)
            update_files.append((f, dst))
        elif not finfo_cmp_mdata(srcidx.db[f], dstidx.db[f]):
            # metadata changed
            #dst = os.path.join(dst_path, f)
            dst = dst_path + '/' + f
            printv('mdata\t', f, dst)
            update_files.append((f, dst))
    # metadata gets changed later because otherwise we could leave
    # directory times wrong due to files being added to a directory after
    # their creation; this way we're sure there will be no more file
    # creation afterwards
    printv('* mdata')
    for f, dst in update_files:
        try:
            finfo_update_mdata(srcidx.db[f], dst)
        except:
            # it can fail if the destination doesn't have the
            # file, ignore for now; TODO: output some kind of
            # script so people can run it later when they get all
            # back together
            pass
    printv('* unlink')
    for f in dstidx.names:
        if f not in srcidx.names:
            # files in destination and not in source
            #dst = os.path.join(dst_path, f)
            dst = dst_path + '/' + f
            printv('unlink\t', f, dst)
            force_unlink(dst, dstidx.db[f].type)
    # we save the index at last because it voids file_info.fullname so we
    # don't save unnecessary information
    printv('* saving index')
    srcidx.save()
def show_idx(idx_path):
    "Loads the given index file and prints one line per entry."
    printv("* loading index")
    idx = index_file(idx_path)
    idx.load()
    for name in idx.names:
        entry = idx.db[name]
        line = "%s %d %f %s %s" % (entry.type, entry.size, entry.mtime,
            str(entry.hash), entry.name)
        printv(line)
def build_idx(idx_path, path):
    "Builds an index file (idx_path) for the given directory (path)."
    printv("* building index")
    # normalize: strip trailing slashes, but keep '/' itself
    # (see comments in make_sync())
    while path[-1] == '/' and path != '/':
        path = path[:-1]
    # the index path must survive the chdir below, so make it absolute
    idx_path = os.path.join(os.getcwd(), idx_path)
    if path != '/':
        # work from the parent so the index stores paths relative to it
        parent, src = os.path.split(path)
        if parent:
            os.chdir(parent)
        path = src
    # build the index; populate() takes the exclude list as its second
    # argument -- the original call omitted it and raised TypeError
    idx = index_file(idx_path)
    idx.populate(path, [])
    idx.save()
#
# helper functions
#
def printv(*params):
    "Equivalent to 'if verbose: print params'."
    # 'verbose' is the module-level flag set from the command line
    # options at the bottom of the file
    if not verbose:
        return
    # Python 2 print statement: the trailing comma separates the items
    # with spaces and holds back the newline until the final bare print
    for i in params:
        print i,
    print
def parse_options():
    """Commandline options parser.

    Returns (parser, opts, args) so the caller can use parser.error()
    for its own argument validation.
    """
    from optparse import OptionParser

    class AbkOptionParser(OptionParser):
        "Custom abk command line option parser."
        def format_help(self, formatter=None):
            "Displays the description before usage."
            if formatter is None:
                formatter = self.formatter
            result = []
            if self.description:
                # "prog - description" goes first, then a blank line
                # before the usage text
                result.append(self.get_prog_name() + ' - ' + \
                    self.format_description(formatter) + "\n\n")
            if self.usage:
                result.append(self.get_usage() + "\n")
            result.append(self.format_option_help(formatter))
            return "".join(result)

    usage = """%prog [options] command params
commands:
  show idx_file
    shows the given index file contents
  mkidx idx_file dir
    builds an index file for the given directory
  sync idx src1 [src2 ... srcN] dst
    synchronizes all sources with dst, using the given idx index file"""

    parser = AbkOptionParser(usage=usage, description="A backup script - "
        "Alberto Bertogli (albertogli@telpin.com.ar)",
        version="%prog " + VERSION, prog='abk')

    # -v and -q share the same dest: the last flag given wins, and
    # verbose output is on by default
    parser.add_option("-v", "--verbose",
        action="store_true", dest="verbose", default=True,
        help="print progress information [default]")
    parser.add_option("-q", "--quiet",
        action="store_false", dest="verbose",
        help="don't print progress information (just errors)")
    parser.add_option("-c", "--copy-mode", default='gzip', metavar="MODE",
        action="store", dest="copy_mode",
        help="select copy mode to use. Available modes: "
        "raw, gzip, bzip2 [default: gzip]")
    parser.add_option("-a", "--hash-mode", default='sha', metavar="MODE",
        action="store", dest="hash_mode",
        help="select the hash to use to check for file content change. "
        "Available modes: none, sha, md5 [default: sha]")
    # may be given several times; optparse accumulates them in a list
    parser.add_option("-e", "--exclude", metavar="REGEX",
        action="append", dest="exclude",
        help="excludes files that matches with the regular expression. "
        "This option accepts multiple instances")
    parser.add_option("-i", "--new-idx", metavar="FILE",
        action="store", dest="new_idx",
        help="select where to write the new generated index. "
        "This is useful for incremental backups. "
        "If not specified, the old index file (idx) is overwritten")

    (opts, args) = parser.parse_args()
    return (parser, opts, args)
#
# main
#

# command line options
(parser, opts, args) = parse_options()
verbose = opts.verbose

# configuration: pick the copy and hash implementations
if opts.copy_mode == 'raw':
    finfo_copy_file_reg = finfo_copy_file_reg_raw
elif opts.copy_mode == 'gzip':
    finfo_copy_file_reg = finfo_copy_file_reg_gzip
elif opts.copy_mode == 'bzip2':
    finfo_copy_file_reg = finfo_copy_file_reg_bzip2
else:
    parser.error("Invalid copy mode (%s)." % opts.copy_mode)

if opts.hash_mode == 'none':
    file_info.hash_file = finfo_hash_file_none
elif opts.hash_mode == 'md5':
    file_info.hash_file = finfo_hash_file_md5
elif opts.hash_mode == 'sha':
    file_info.hash_file = finfo_hash_file_sha
else:
    parser.error("Invalid hash mode (%s)." % opts.hash_mode)

# main command; catch only IndexError from the argument lookups: the
# original bare 'except:' also wrapped the command calls themselves, so
# any internal failure (e.g. a corrupt index) was misreported as a
# missing parameter
try:
    cmd = args[0]
except IndexError:
    parser.error("Command missing.")

if cmd == 'show':
    try:
        idx_path = args[1]
    except IndexError:
        parser.error("Missing idx_file parameter.")
    show_idx(idx_path)
elif cmd == 'mkidx':
    try:
        idx_path = args[1]
        path = args[2]
    except IndexError:
        parser.error("Missing parameter(s) for command mkidx.")
    build_idx(idx_path, path)
elif cmd == 'sync':
    # sync needs at least: idx, one source, dst. The original try/except
    # around the slicing could never trigger (args[2:-1] never raises and
    # args[-1] exists whenever cmd does), so check explicitly.
    if len(args) < 4:
        parser.error("Missing parameter(s) for command sync.")
    old_idx_path = args[1]
    new_idx_path = old_idx_path
    sources = args[2:-1]
    dst_path = args[-1]
    if opts.new_idx:
        new_idx_path = opts.new_idx
    exclude = []
    if opts.exclude:
        exclude = opts.exclude
    make_sync(sources, new_idx_path, dst_path, old_idx_path, exclude)
else:
    parser.error("Unknown command (%s)." % cmd)