git » chasquid » commit 41bb7b6

normalize: Improve ToCRLF/StringToCRLF performance

author Alberto Bertogli
2024-10-31 13:05:57 UTC
committer Alberto Bertogli
2024-10-31 13:10:14 UTC
parent 723c47d352f4dc7c1c30e0c91937c193218e29f2

normalize: Improve ToCRLF/StringToCRLF performance

The ToCRLF/StringToCRLF functions are not very performance critical, but
we call it for each mail, and the current implementation is very
inefficient (mainly because it goes one byte at a time).

This patch replaces it with a better implementation that goes line by line.

The new implementation of ToCRLF is ~40% faster, and StringToCRLF is ~60%
faster.

```
$ benchstat old.txt new.txt
goos: linux
goarch: amd64
pkg: blitiri.com.ar/go/chasquid/internal/normalize
cpu: 13th Gen Intel(R) Core(TM) i9-13900T
                │    old.txt    │               new.txt                │
                │    sec/op     │    sec/op     vs base                │
ToCRLF-32         162.96µ ±  6%   95.42µ ± 12%  -41.44% (p=0.000 n=10)
StringToCRLF-32   190.70µ ± 14%   76.51µ ±  6%  -59.88% (p=0.000 n=10)
geomean            176.3µ         85.44µ        -51.53%
```

internal/normalize/normalize.go +51 -11
internal/normalize/normalize_test.go +44 -1

diff --git a/internal/normalize/normalize.go b/internal/normalize/normalize.go
index 17768a5..6fd6f6a 100644
--- a/internal/normalize/normalize.go
+++ b/internal/normalize/normalize.go
@@ -78,23 +78,63 @@ func DomainToUnicode(addr string) (string, error) {
 // preexisting CRLF, it leaves it be. It assumes that CR is never used on its
 // own.
 func ToCRLF(in []byte) []byte {
-	b := bytes.NewBuffer(nil)
+	b := bytes.Buffer{}
 	b.Grow(len(in))
-	for _, c := range in {
-		switch c {
-		case '\r':
-			// Ignore CR, we'll add it back later. It should never appear
-			// alone in the contexts where this function is used.
-		case '\n':
-			b.Write([]byte("\r\n"))
-		default:
-			b.WriteByte(c)
+
+	// We go line by line, but beware:
+	//   Split("a\nb", "\n") -> ["a", "b"]
+	//   Split("a\nb\n", "\n") -> ["a", "b", ""]
+	// So we handle the last line separately.
+	lines := bytes.Split(in, []byte("\n"))
+	for i, line := range lines {
+		b.Write(line)
+		if i == len(lines)-1 {
+			// Do not add newline to the last line:
+			//  - If the string ends with a newline, we already added it in
+			//    the previous-to-last line, and this line is "".
+			//  - If the string does NOT end with a newline, this preserves
+			//    that property.
+			break
+		}
+		if !bytes.HasSuffix(line, []byte("\r")) {
+			// Missing the CR.
+			b.WriteByte('\r')
 		}
+		b.WriteByte('\n')
 	}
+
 	return b.Bytes()
 }
 
 // StringToCRLF is like ToCRLF, but operates on strings.
 func StringToCRLF(in string) string {
-	return string(ToCRLF([]byte(in)))
+	// We implement it the same way as ToCRLF, but with string versions.
+	// This is significantly faster than converting the string to a byte
+	// slice, calling ToCRLF, and converting it back.
+	b := strings.Builder{}
+	b.Grow(len(in))
+
+	// We go line by line, but beware:
+	//   Split("a\nb", "\n") -> ["a", "b"]
+	//   Split("a\nb\n", "\n") -> ["a", "b", ""]
+	// So we handle the last line separately.
+	lines := strings.Split(in, "\n")
+	for i, line := range lines {
+		b.WriteString(line)
+		if i == len(lines)-1 {
+			// Do not add newline to the last line:
+			//  - If the string ends with a newline, we already added it in
+			//    the previous-to-last line, and this line is "".
+			//  - If the string does NOT end with a newline, this preserves
+			//    that property.
+			break
+		}
+		if !strings.HasSuffix(line, "\r") {
+			// Missing the CR.
+			b.WriteByte('\r')
+		}
+		b.WriteByte('\n')
+	}
+
+	return b.String()
 }
diff --git a/internal/normalize/normalize_test.go b/internal/normalize/normalize_test.go
index e5c7829..66bd324 100644
--- a/internal/normalize/normalize_test.go
+++ b/internal/normalize/normalize_test.go
@@ -1,6 +1,10 @@
 package normalize
 
-import "testing"
+import (
+	"bytes"
+	"strings"
+	"testing"
+)
 
 func TestUser(t *testing.T) {
 	valid := []struct{ user, norm string }{
@@ -134,8 +138,19 @@ func TestToCRLF(t *testing.T) {
 		in, out string
 	}{
 		{"", ""},
+		{"a", "a"},
+
+		// Does not end in newline.
+		{"a\n", "a\r\n"},
 		{"a\nb", "a\r\nb"},
 		{"a\r\nb", "a\r\nb"},
+
+		// Ends in newline.
+		{"a\nb\n", "a\r\nb\r\n"},
+		{"a\r\nb\n", "a\r\nb\r\n"},
+		{"a\r\nb\r\n", "a\r\nb\r\n"},
+		{"a\r\nb\n\n", "a\r\nb\r\n\r\n"},
+		{"a\r\nb\r\n\r\n", "a\r\nb\r\n\r\n"},
 	}
 	for _, c := range cases {
 		got := string(ToCRLF([]byte(c.in)))
@@ -173,3 +188,31 @@ func FuzzDomainToUnicode(f *testing.F) {
 		DomainToUnicode(addr)
 	})
 }
+
+func BenchmarkToCRLF(b *testing.B) {
+	// Generate a 1000-line message.
+	bb := bytes.Buffer{}
+	for i := 0; i < 1000; i++ {
+		bb.WriteString("this is a very pretty line 🐅\n")
+	}
+	buf := bb.Bytes()
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		ToCRLF(buf)
+	}
+}
+
+func BenchmarkStringToCRLF(b *testing.B) {
+	// Generate a 1000-line message.
+	sb := strings.Builder{}
+	for i := 0; i < 1000; i++ {
+		sb.WriteString("this is a very pretty line 🐅\n")
+	}
+	s := sb.String()
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		StringToCRLF(s)
+	}
+}