git » git-arr » commit bb18564

Use xattr to avoid rewriting all blobs on each commit

author Alberto Bertogli
2025-07-13 22:56:45 UTC
committer Alberto Bertogli
2025-07-14 00:46:14 UTC
parent 736d8be9cfeffcf2448cd14a46704115006943e3

Use xattr to avoid rewriting all blobs on each commit

Today, on each commit we regenerate the entire tree for the relevant
branch, which takes a significant amount of time.

This patch records the object id of each blob or tree in an extended
attribute (xattr) on the generated file when it is written, and then
skips any later write if the recorded object id is still the same.

The result is a significant performance improvement on incremental
regeneration.

If the python3-xattr package is not available, or the xattr cannot be
read from the file, we fall back to the previous behaviour of using the
mtime of the branch.

git-arr +14 -2
git.py +4 -4
utils.py +29 -0
views/tree-list.html +1 -1

diff --git a/git-arr b/git-arr
index f89aa74..a145d6b 100755
--- a/git-arr
+++ b/git-arr
@@ -341,9 +341,17 @@ def generate(output: str, only=None):
     """Generate static html to the output directory."""
 
     @utils.log_timing("path")
-    def write_to(path: str, func_or_str, args=(), mtime=None):
+    def write_to(path: str, func_or_str, args=(), mtime=None, oid=""):
         path = output + "/" + path
 
+        if oid:
+            # If we were given an oid, try to use xattrs to check if the file
+            # we wrote is still the same, in which case we can skip writing
+            # it again.
+            path_oid = utils.get_xattr_oid(path)
+            if path_oid and path_oid == oid:
+                return
+
         if mtime:
             path_mtime: Union[float, int] = 0
             if os.path.exists(path):
@@ -381,6 +389,8 @@ def generate(output: str, only=None):
         open(path, "w").write(s)
         if mtime:
             os.utime(path, (mtime, mtime))
+        if oid:
+            utils.set_xattr_oid(path, oid)
 
     def link(from_path, to_path):
         from_path = output + "/" + from_path
@@ -395,7 +405,7 @@ def generate(output: str, only=None):
 
         write_to("r/%s/b/%s/t/index.html" % (r.name, bn), tree, (r, bn), mtime)
 
-        for otype, oname, _ in t.ls("", recursive=True):
+        for otype, oname, oid, _ in t.ls("", recursive=True):
             # FIXME: bottle cannot route paths with '\n' so those are sadly
             # expected to fail for now; we skip them.
             if "\n" in oname.raw:
@@ -417,6 +427,7 @@ def generate(output: str, only=None):
                     blob,
                     (r, bn, fname.url, dirname.url),
                     mtime,
+                    oid,
                 )
             else:
                 write_to(
@@ -425,6 +436,7 @@ def generate(output: str, only=None):
                     tree,
                     (r, bn, oname.url),
                     mtime,
+                    oid,
                 )
 
     # Always generate the index, to keep the "last updated" time fresh.
diff --git a/git.py b/git.py
index a2904c3..0869bdd 100644
--- a/git.py
+++ b/git.py
@@ -571,8 +571,8 @@ class Tree:
     @functools.lru_cache
     def ls(
         self, path, recursive=False
-    ) -> Iterable[Tuple[str, smstr, Optional[int]]]:
-        """Generates (type, name, size) for each file in path."""
+    ) -> Iterable[Tuple[str, smstr, str, Optional[int]]]:
+        """Generates (type, name, oid, size) for each file in path."""
         cmd = self.repo.cmd("ls-tree")
         cmd.long = None
         if recursive:
@@ -587,7 +587,7 @@ class Tree:
 
         files = []
         for l in cmd.run():
-            _mode, otype, _oid, size, name = l.split(None, 4)
+            _mode, otype, oid, size, name = l.split(None, 4)
             if size == "-":
                 size = None
             else:
@@ -602,7 +602,7 @@ class Tree:
 
             # We use a smart string for the name, as it's often tricky to
             # manipulate otherwise.
-            files.append((otype, smstr(name), size))
+            files.append((otype, smstr(name), oid, size))
 
         return files
 
diff --git a/utils.py b/utils.py
index e12c477..49fbb98 100644
--- a/utils.py
+++ b/utils.py
@@ -26,6 +26,7 @@ try:
 except ImportError:
     markdown = None
 
+
 import base64
 import functools
 import mimetypes
@@ -224,3 +225,31 @@ def log_timing(*log_args):
         return wrapper
 
     return log_timing_decorator
+
+
+try:
+    import xattr
+
+    def set_xattr_oid(path: str, oid: str):
+        """Set the xattr 'user.git-arr.oid' on the given path."""
+        try:
+            xattr.setxattr(path, "user.git-arr.oid", oid.encode("utf-8"))
+        except OSError as e:
+            print(f"{path}: error writing xattr: {e}")
+
+    def get_xattr_oid(path: str) -> str:
+        """Get the xattr 'user.git-arr.oid' from the given path."""
+        try:
+            return xattr.getxattr(path, "user.git-arr.oid").decode("utf-8")
+        except OSError as e:
+            return ""
+
+except ImportError:
+
+    def set_xattr_oid(path: str, oid: str):
+        """Set the xattr 'user.git-arr.oid' on the given path."""
+        pass
+
+    def get_xattr_oid(path: str) -> str:
+        """Get the xattr 'user.git-arr.oid' from the given path."""
+        return ""
diff --git a/views/tree-list.html b/views/tree-list.html
index 89549bd..3c66acc 100644
--- a/views/tree-list.html
+++ b/views/tree-list.html
@@ -1,6 +1,6 @@
 <table class="nice toggable ls" id="ls">
 % key_func = lambda x: (x[0] != 'tree', x[1].raw)
-% for type, name, size in sorted(tree.ls(dirname.raw), key = key_func):
+% for type, name, oid, size in sorted(tree.ls(dirname.raw), key = key_func):
     <tr class="{{type}}">
 %   if type == "blob":
         <td class="name"><a href="{{treeroot}}/f={{name.url}}.html">