git » git-arr » commit 1183d6f

Move to Python 3

author Alberto Bertogli
2020-05-24 01:36:43 UTC
committer Alberto Bertogli
2020-05-24 03:50:39 UTC
parent cbb36e087c1bcf1c81de53e920baf0c681abfd87

Move to Python 3

Python 3 was released more than 10 years ago, and support for Python 2
is going away, with many Linux distributions starting to phase it out.

This patch migrates git-arr to Python 3.

The generated output is almost exactly the same; there are some minor
differences, such as HTML characters being quoted more aggressively, and
handling of paths with non-utf8 values.

.gitignore +2 -1
git-arr +11 -13
git.py +38 -49
utils.py +4 -2
views/tree-list.html +1 -1

diff --git a/.gitignore b/.gitignore
index faf410c..94fab83 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 *.pyc
 __pycache__
-.*.swp
+.*
+!.gitignore
diff --git a/git-arr b/git-arr
index 5c4e7db..98a6bc7 100755
--- a/git-arr
+++ b/git-arr
@@ -1,21 +1,15 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 """
 git-arr: A git web html generator.
 """
 
-from __future__ import print_function
-
+import configparser
 import math
 import optparse
 import os
 import re
 import sys
 
-try:
-    import configparser
-except ImportError:
-    import ConfigParser as configparser
-
 import bottle
 
 import git
@@ -64,7 +58,7 @@ def load_config(path):
         'generate_patch': 'yes',
     }
 
-    config = configparser.SafeConfigParser(defaults)
+    config = configparser.ConfigParser(defaults)
     config.read(path)
 
     # Do a first pass for general sanity checking and recursive expansion.
@@ -118,7 +112,7 @@ def load_config(path):
         r.info.commits_per_page = config.getint(s, 'commits_per_page')
         r.info.max_pages = config.getint(s, 'max_pages')
         if r.info.max_pages <= 0:
-            r.info.max_pages = sys.maxint
+            r.info.max_pages = sys.maxsize
         r.info.generate_tree = config.getboolean(s, 'tree')
         r.info.root_diff = config.getboolean(s, 'rootdiff')
         r.info.generate_patch = config.getboolean(s, 'generate_patch')
@@ -263,6 +257,10 @@ def blob(repo, bname, fname, dirname = ''):
     fname = git.smstr.from_url(fname)
     path = dirname.raw + fname.raw
 
+    # Handle backslash-escaped characters, which are not utf8.
+    # This matches the generated links from git.unquote().
+    path = path.encode("utf8").decode("unicode-escape").encode("latin1")
+
     content = repo.blob(path, bname)
     if content is None:
         bottle.abort(404, "File %r not found in branch %s" % (path, bname))
@@ -339,7 +337,7 @@ def generate(output, only = None):
         else:
             # Otherwise, be lazy if we were given a function to run, or write
             # always if they gave us a string.
-            if isinstance(func_or_str, (str, unicode)):
+            if isinstance(func_or_str, str):
                 print(path)
                 s = func_or_str
             else:
@@ -348,7 +346,7 @@ def generate(output, only = None):
                 print(path)
                 s = func_or_str(*args)
 
-        open(path, 'w').write(s.encode('utf8', errors = 'xmlcharrefreplace'))
+        open(path, 'w').write(s)
         if mtime:
             os.utime(path, (mtime, mtime))
 
@@ -398,7 +396,7 @@ def generate(output, only = None):
     write_to('static/syntax.css', read_f, [static_path + '/syntax.css'],
             os.stat(static_path + '/syntax.css').st_mtime)
 
-    rs = sorted(repos.values(), key = lambda r: r.name)
+    rs = sorted(list(repos.values()), key = lambda r: r.name)
     if only:
         rs = [r for r in rs if r.name in only]
 
diff --git a/git.py b/git.py
index 2240175..09ccd37 100644
--- a/git.py
+++ b/git.py
@@ -12,35 +12,13 @@ import subprocess
 from collections import defaultdict
 import email.utils
 import datetime
-import urllib
-from cgi import escape
+import urllib.request, urllib.parse, urllib.error
+from html import escape
 
 
 # Path to the git binary.
 GIT_BIN = "git"
 
-class EncodeWrapper:
-    """File-like wrapper that returns data utf8 encoded."""
-    def __init__(self, fd, encoding = 'utf8', errors = 'replace'):
-        self.fd = fd
-        self.encoding = encoding
-        self.errors = errors
-
-    def __iter__(self):
-        for line in self.fd:
-            yield line.decode(self.encoding, errors = self.errors)
-
-    def read(self):
-        """Returns the whole content."""
-        s = self.fd.read()
-        return s.decode(self.encoding, errors = self.errors)
-
-    def readline(self):
-        """Returns a single line."""
-        s = self.fd.readline()
-        return s.decode(self.encoding, errors = self.errors)
-
-
 def run_git(repo_path, params, stdin = None, silent_stderr = False, raw = False):
     """Invokes git with the given parameters.
 
@@ -66,13 +44,8 @@ def run_git(repo_path, params, stdin = None, silent_stderr = False, raw = False)
     if raw:
         return p.stdout
 
-    # We need to wrap stdout if we want to decode it as utf8, subprocess
-    # doesn't support us telling it the encoding.
-    if sys.version_info.major == 3:
-        return io.TextIOWrapper(p.stdout, encoding = 'utf8',
-                errors = 'replace')
-    else:
-        return EncodeWrapper(p.stdout)
+    return io.TextIOWrapper(p.stdout, encoding = 'utf8',
+            errors = 'backslashreplace')
 
 
 class GitCommand (object):
@@ -109,6 +82,8 @@ class GitCommand (object):
     def stdin(self, s):
         """Sets the contents we will send in stdin."""
         self._override = True
+        if isinstance(s, str):
+            s = s.encode("utf8")
         self._stdin_buf = s
         self._override = False
 
@@ -116,7 +91,7 @@ class GitCommand (object):
         """Runs the git command."""
         params = [self._cmd]
 
-        for k, v in self._kwargs.items():
+        for k, v in list(self._kwargs.items()):
             dash = '--' if len(k) > 1 else '-'
             if v is None:
                 params.append('%s%s' % (dash, k))
@@ -146,11 +121,16 @@ class smstr:
         .html    -> an HTML-embeddable representation.
     """
     def __init__(self, raw):
-        if not isinstance(raw, str):
-            raise TypeError("The raw string must be instance of 'str'")
+        if not isinstance(raw, (str, bytes)):
+            raise TypeError(
+                    "The raw string must be instance of 'str', not %s" %
+                    type(raw))
         self.raw = raw
-        self.unicode = raw.decode('utf8', errors = 'replace')
-        self.url = urllib.pathname2url(raw)
+        if isinstance(raw, bytes):
+            self.unicode = raw.decode('utf8', errors = 'backslashreplace')
+        else:
+            self.unicode = raw
+        self.url = urllib.request.pathname2url(raw)
         self.html = self._to_html()
 
     def __cmp__(self, other):
@@ -163,7 +143,7 @@ class smstr:
     @staticmethod
     def from_url(url):
         """Returns an smstr() instance from an url-encoded string."""
-        return smstr(urllib.url2pathname(url))
+        return smstr(urllib.request.url2pathname(url))
 
     def split(self, sep):
         """Like str.split()."""
@@ -176,10 +156,10 @@ class smstr:
 
     def _to_html(self):
         """Returns an html representation of the unicode string."""
-        html = u''
+        html = ''
         for c in escape(self.unicode):
             if c in '\t\r\n\r\f\a\b\v\0':
-                esc_c = c.encode('ascii').encode('string_escape')
+                esc_c = c.encode("unicode-escape").decode("utf8")
                 html += '<span class="ctrlchr">%s</span>' % esc_c
             else:
                 html += c
@@ -190,14 +170,23 @@ class smstr:
 def unquote(s):
     """Git can return quoted file names, unquote them. Always return a str."""
     if not (s[0] == '"' and s[-1] == '"'):
-        # Unquoted strings are always safe, no need to mess with them; just
-        # make sure we return str.
-        s = s.encode('ascii')
+        # Unquoted strings are always safe, no need to mess with them
         return s
 
-    # Get rid of the quotes, we never want them in the output, and convert to
-    # a raw string, un-escaping the backslashes.
-    s = s[1:-1].decode('string-escape')
+    # The string will be of the form `"<escaped>"`, where <escaped> is a
+    # backslash-escaped representation of the name of the file.
+    # Examples:  "with\ttwo\ttabs" , "\303\261aca-utf8", "\361aca-latin1"
+
+    # Get rid of the quotes, we never want them in the output.
+    s = s[1:-1]
+
+    # Un-escape the backslashes.
+    # latin1 is ok to use here because in Python it just maps the code points
+    # 0-255 to the bytes 0x00-0xff, which is what we expect.
+    s = s.encode("latin1").decode("unicode-escape")
+
+    # Convert to utf8.
+    s = s.encode("latin1").decode("utf8", errors='backslashreplace')
 
     return s
 
@@ -337,13 +326,13 @@ class Repo:
         cmd.raw(True)
         cmd.batch = '%(objectsize)'
 
-        if isinstance(ref, unicode):
-            ref = ref.encode('utf8')
-        cmd.stdin('%s:%s' % (ref, path))
+        # Format: <ref>:<path>
+        # Construct it in binary since the path might not be utf8.
+        cmd.stdin(ref.encode("utf8") + b":" + path)
 
         out = cmd.run()
         head = out.readline()
-        if not head or head.strip().endswith('missing'):
+        if not head or head.strip().endswith(b'missing'):
             return None
 
         return Blob(out.read()[:int(head)])
diff --git a/utils.py b/utils.py
index ada9c7e..4e12b0d 100644
--- a/utils.py
+++ b/utils.py
@@ -108,15 +108,17 @@ def markdown_blob(s):
 
 def embed_image_blob(fname, image_data):
     mimetype = mimetypes.guess_type(fname)[0]
+    b64img = base64.b64encode(image_data).decode("ascii")
     return '<img style="max-width:100%;" src="data:{0};base64,{1}" />'.format( \
-                                    mimetype, base64.b64encode(image_data))
+                                    mimetype, b64img)
 
 def is_binary(s):
     # Git considers a blob binary if NUL in first ~8KB, so do the same.
-    return '\0' in s[:8192]
+    return b'\0' in s[:8192]
 
 def hexdump(s):
     graph = string.ascii_letters + string.digits + string.punctuation + ' '
+    s = s.decode("latin1")
     offset = 0
     while s:
         t = s[:16]
diff --git a/views/tree-list.html b/views/tree-list.html
index 70f032a..ce5b0d6 100644
--- a/views/tree-list.html
+++ b/views/tree-list.html
@@ -1,5 +1,5 @@
 <table class="nice toggable ls" id="ls">
-% key_func = lambda (t, n, s): (t != 'tree', n.raw)
+% key_func = lambda x: (x[0] != 'tree', x[1].raw)
 % for type, name, size in sorted(tree.ls(dirname.raw), key = key_func):
     <tr class="{{type}}">
 %   if type == "blob":