author | Alberto Bertogli
<albertito@blitiri.com.ar> 2024-10-31 13:05:57 UTC |
committer | Alberto Bertogli
<albertito@blitiri.com.ar> 2024-10-31 13:10:14 UTC |
parent | 723c47d352f4dc7c1c30e0c91937c193218e29f2 |
internal/normalize/normalize.go | +51 | -11 |
internal/normalize/normalize_test.go | +44 | -1 |
diff --git a/internal/normalize/normalize.go b/internal/normalize/normalize.go index 17768a5..6fd6f6a 100644 --- a/internal/normalize/normalize.go +++ b/internal/normalize/normalize.go @@ -78,23 +78,63 @@ func DomainToUnicode(addr string) (string, error) { // preexisting CRLF, it leaves it be. It assumes that CR is never used on its // own. func ToCRLF(in []byte) []byte { - b := bytes.NewBuffer(nil) + b := bytes.Buffer{} b.Grow(len(in)) - for _, c := range in { - switch c { - case '\r': - // Ignore CR, we'll add it back later. It should never appear - // alone in the contexts where this function is used. - case '\n': - b.Write([]byte("\r\n")) - default: - b.WriteByte(c) + + // We go line by line, but beware: + // Split("a\nb", "\n") -> ["a", "b"] + // Split("a\nb\n", "\n") -> ["a", "b", ""] + // So we handle the last line separately. + lines := bytes.Split(in, []byte("\n")) + for i, line := range lines { + b.Write(line) + if i == len(lines)-1 { + // Do not add newline to the last line: + // - If the string ends with a newline, we already added it in + // the previous-to-last line, and this line is "". + // - If the string does NOT end with a newline, this preserves + // that property. + break + } + if !bytes.HasSuffix(line, []byte("\r")) { + // Missing the CR. + b.WriteByte('\r') } + b.WriteByte('\n') } + return b.Bytes() } // StringToCRLF is like ToCRLF, but operates on strings. func StringToCRLF(in string) string { - return string(ToCRLF([]byte(in))) + // We implement it the same way as ToCRLF, but with string versions. + // This is significantly faster than converting the string to a byte + // slice, calling ToCRLF, and converting it back. + b := strings.Builder{} + b.Grow(len(in)) + + // We go line by line, but beware: + // Split("a\nb", "\n") -> ["a", "b"] + // Split("a\nb\n", "\n") -> ["a", "b", ""] + // So we handle the last line separately. + lines := strings.Split(in, "\n") + for i, line := range lines { + b.WriteString(line) + if i == len(lines)-1 { + // Do not add newline to the last line: + // - If the string ends with a newline, we already added it in + // the previous-to-last line, and this line is "". + // - If the string does NOT end with a newline, this preserves + // that property. + break + } + if !strings.HasSuffix(line, "\r") { + // Missing the CR. + b.WriteByte('\r') + } + b.WriteByte('\n') + } + + return b.String() } diff --git a/internal/normalize/normalize_test.go b/internal/normalize/normalize_test.go index e5c7829..66bd324 100644 --- a/internal/normalize/normalize_test.go +++ b/internal/normalize/normalize_test.go @@ -1,6 +1,10 @@ package normalize -import "testing" +import ( + "bytes" + "strings" + "testing" +) func TestUser(t *testing.T) { valid := []struct{ user, norm string }{ @@ -134,8 +138,19 @@ func TestToCRLF(t *testing.T) { in, out string }{ {"", ""}, + {"a", "a"}, + + // Does not end in newline. + {"a\n", "a\r\n"}, {"a\nb", "a\r\nb"}, {"a\r\nb", "a\r\nb"}, + + // Ends in newline. + {"a\nb\n", "a\r\nb\r\n"}, + {"a\r\nb\n", "a\r\nb\r\n"}, + {"a\r\nb\r\n", "a\r\nb\r\n"}, + {"a\r\nb\n\n", "a\r\nb\r\n\r\n"}, + {"a\r\nb\r\n\r\n", "a\r\nb\r\n\r\n"}, } for _, c := range cases { got := string(ToCRLF([]byte(c.in))) @@ -173,3 +188,31 @@ func FuzzDomainToUnicode(f *testing.F) { DomainToUnicode(addr) }) } + +func BenchmarkToCRLF(b *testing.B) { + // Generate a 1000-line message. + bb := bytes.Buffer{} + for i := 0; i < 1000; i++ { + bb.WriteString("this is a very pretty line 🐅\n") + } + buf := bb.Bytes() + + b.ResetTimer() + for i := 0; i < b.N; i++ { + ToCRLF(buf) + } +} + +func BenchmarkStringToCRLF(b *testing.B) { + // Generate a 1000-line message. + sb := strings.Builder{} + for i := 0; i < 1000; i++ { + sb.WriteString("this is a very pretty line 🐅\n") + } + s := sb.String() + + b.ResetTimer() + for i := 0; i < b.N; i++ { + StringToCRLF(s) + } +}