author | Alberto Bertogli
<albertito@blitiri.com.ar> 2022-03-07 01:13:34 UTC |
committer | Alberto Bertogli
<albertito@blitiri.com.ar> 2022-11-13 11:09:19 UTC |
parent | 3ebe5c5173c0a160e9260b59291a1169117c4e2d |
internal/maillog/maillog.go | +1 | -1 |
internal/nettrace/bench_test.go | +57 | -0 |
internal/nettrace/context.go | +40 | -0 |
internal/nettrace/context_test.go | +66 | -0 |
internal/nettrace/evtring.go | +42 | -0 |
internal/nettrace/histogram.go | +71 | -0 |
internal/nettrace/histogram_test.go | +29 | -0 |
internal/nettrace/http.go | +259 | -0 |
internal/nettrace/http_test.go | +255 | -0 |
internal/nettrace/templates/_latency.html.tmpl | +25 | -0 |
internal/nettrace/templates/_recursive.html.tmpl | +51 | -0 |
internal/nettrace/templates/_single.html.tmpl | +35 | -0 |
internal/nettrace/templates/index.html.tmpl | +129 | -0 |
internal/nettrace/templates/style.css | +187 | -0 |
internal/nettrace/test_server.go | +243 | -0 |
internal/nettrace/trace.go | +558 | -0 |
internal/nettrace/trace_test.go | +215 | -0 |
internal/trace/trace.go | +13 | -53 |
diff --git a/internal/maillog/maillog.go b/internal/maillog/maillog.go index ce01119..9e95790 100644 --- a/internal/maillog/maillog.go +++ b/internal/maillog/maillog.go @@ -15,7 +15,7 @@ import ( // Global event logs. var ( - authLog = trace.NewEventLog("Authentication", "Incoming SMTP") + authLog = trace.New("Authentication", "Incoming SMTP") ) // Logger contains a backend used to log data to, such as a file or syslog. diff --git a/internal/nettrace/bench_test.go b/internal/nettrace/bench_test.go new file mode 100644 index 0000000..a58fdbb --- /dev/null +++ b/internal/nettrace/bench_test.go @@ -0,0 +1,57 @@ +package nettrace + +import ( + "testing" +) + +// Our benchmark loop is similar to the one from golang.org/x/net/trace, so we +// can compare results. +func runBench(b *testing.B, events int) { + nTraces := (b.N + events + 1) / events + + for i := 0; i < nTraces; i++ { + tr := New("bench", "test") + for j := 0; j < events; j++ { + tr.Printf("%d", j) + } + tr.Finish() + } +} + +func BenchmarkTrace_2(b *testing.B) { + runBench(b, 2) +} + +func BenchmarkTrace_10(b *testing.B) { + runBench(b, 10) +} + +func BenchmarkTrace_100(b *testing.B) { + runBench(b, 100) +} + +func BenchmarkTrace_1000(b *testing.B) { + runBench(b, 1000) +} + +func BenchmarkTrace_10000(b *testing.B) { + runBench(b, 10000) +} + +func BenchmarkNewAndFinish(b *testing.B) { + for i := 0; i < b.N; i++ { + tr := New("bench", "test") + tr.Finish() + } +} + +func BenchmarkPrintf(b *testing.B) { + tr := New("bench", "test") + defer tr.Finish() + b.ResetTimer() + for i := 0; i < b.N; i++ { + // Keep this without any formatting, so we measure our code instead of + // the performance of fmt.Sprintf. + tr.Printf("this is printf") + } +} diff --git a/internal/nettrace/context.go b/internal/nettrace/context.go new file mode 100644 index 0000000..78663d5 --- /dev/null +++ b/internal/nettrace/context.go @@ -0,0 +1,40 @@ +package nettrace + +import "context" + +type ctxKeyT string + +const ctxKey ctxKeyT = "blitiri.com.ar/go/srv/nettrace" + +// NewContext returns a new context with the given trace attached. +func NewContext(ctx context.Context, tr Trace) context.Context { + return context.WithValue(ctx, ctxKey, tr) +} + +// FromContext returns the trace attached to the given context (if any). +func FromContext(ctx context.Context) (Trace, bool) { + tr, ok := ctx.Value(ctxKey).(Trace) + return tr, ok +} + +// FromContextOrNew returns the trace attached to the given context, or a new +// trace if there is none. +func FromContextOrNew(ctx context.Context, family, title string) (Trace, context.Context) { + tr, ok := FromContext(ctx) + if ok { + return tr, ctx + } + + tr = New(family, title) + return tr, NewContext(ctx, tr) +} + +// ChildFromContext returns a new trace that is a child of the one attached to +// the context (if any). +func ChildFromContext(ctx context.Context, family, title string) Trace { + parent, ok := FromContext(ctx) + if ok { + return parent.NewChild(family, title) + } + return New(family, title) +} diff --git a/internal/nettrace/context_test.go b/internal/nettrace/context_test.go new file mode 100644 index 0000000..11eb401 --- /dev/null +++ b/internal/nettrace/context_test.go @@ -0,0 +1,66 @@ +package nettrace + +import ( + "context" + "testing" +) + +func TestContext(t *testing.T) { + tr := New("TestContext", "trace") + defer tr.Finish() + + // Attach the trace to a new context. + ctx := NewContext(context.Background(), tr) + + // Get the trace back from the context. + { + tr2, ok := FromContext(ctx) + if !ok { + t.Errorf("Context with trace returned not found") + } + if tr != tr2 { + t.Errorf("Trace from context is different: %v != %v", tr, tr2) + } + } + + // Create a child trace from the context. + { + tr3 := ChildFromContext(ctx, "TestContext", "child") + if p := tr3.(*trace).Parent; p != tr { + t.Errorf("Child doesn't have the right parent: %v != %v", p, tr) + } + tr3.Finish() + } + + // FromContextOrNew returns the one from the context. + { + tr4, ctx4 := FromContextOrNew(ctx, "TestContext", "from-ctx") + if ctx4 != ctx { + t.Errorf("Got new context: %v != %v", ctx4, ctx) + } + if tr4 != tr { + t.Errorf("Context with trace returned new trace: %v != %v", tr4, tr) + } + } + + // FromContextOrNew needs to create a new one. + { + tr5, ctx5 := FromContextOrNew( + context.Background(), "TestContext", "tr5") + if tr, _ := FromContext(ctx5); tr != tr5 { + t.Errorf("Context with trace returned the wrong trace: %v != %v", + tr, tr5) + } + tr5.Finish() + } + + // Child from a context that has no trace attached. + { + tr6 := ChildFromContext( + context.Background(), "TestContext", "child") + tr6.Finish() + if p := tr6.(*trace).Parent; p != nil { + t.Errorf("Expected orphan trace, it has a parent: %v", p) + } + } +} diff --git a/internal/nettrace/evtring.go b/internal/nettrace/evtring.go new file mode 100644 index 0000000..026e8e3 --- /dev/null +++ b/internal/nettrace/evtring.go @@ -0,0 +1,42 @@ +package nettrace + +import "time" + +type evtRing struct { + evts []event + max int + pos int // Points to the latest element. + firstDrop time.Time +} + +func newEvtRing(n int) *evtRing { + return &evtRing{ + max: n, + pos: -1, + } +} + +func (r *evtRing) Add(e *event) { + if len(r.evts) < r.max { + r.evts = append(r.evts, *e) + r.pos++ + return + } + + r.pos = (r.pos + 1) % r.max + + // Record the first drop as the time of the first dropped message. + if r.firstDrop.IsZero() { + r.firstDrop = r.evts[r.pos].When + } + + r.evts[r.pos] = *e +} + +func (r *evtRing) Do(f func(e *event)) { + for i := 0; i < len(r.evts); i++ { + // Go from older to newer by starting at (r.pos+1). + pos := (r.pos + 1 + i) % len(r.evts) + f(&r.evts[pos]) + } +} diff --git a/internal/nettrace/histogram.go b/internal/nettrace/histogram.go new file mode 100644 index 0000000..662a887 --- /dev/null +++ b/internal/nettrace/histogram.go @@ -0,0 +1,71 @@ +package nettrace + +import ( + "time" +) + +type histogram struct { + count [nBuckets]uint64 + + totalQ uint64 + totalT time.Duration + min time.Duration + max time.Duration +} + +func (h *histogram) Add(bucket int, latency time.Duration) { + if h.totalQ == 0 || h.min > latency { + h.min = latency + } + if h.max < latency { + h.max = latency + } + + h.count[bucket]++ + h.totalQ++ + h.totalT += latency +} + +type histSnapshot struct { + Counts map[time.Duration]line + Count uint64 + Avg, Min, Max time.Duration +} + +type line struct { + Start time.Duration + BucketIdx int + Count uint64 + Percent float32 + CumPct float32 +} + +func (h *histogram) Snapshot() *histSnapshot { + s := &histSnapshot{ + Counts: map[time.Duration]line{}, + Count: h.totalQ, + Min: h.min, + Max: h.max, + } + + if h.totalQ > 0 { + s.Avg = time.Duration(uint64(h.totalT) / h.totalQ) + } + + var cumCount uint64 + for i := 0; i < nBuckets; i++ { + cumCount += h.count[i] + l := line{ + Start: buckets[i], + BucketIdx: i, + Count: h.count[i], + } + if h.totalQ > 0 { + l.Percent = float32(h.count[i]) / float32(h.totalQ) * 100 + l.CumPct = float32(cumCount) / float32(h.totalQ) * 100 + } + s.Counts[buckets[i]] = l + } + + return s +} diff --git a/internal/nettrace/histogram_test.go b/internal/nettrace/histogram_test.go new file mode 100644 index 0000000..c6c5ab4 --- /dev/null +++ b/internal/nettrace/histogram_test.go @@ -0,0 +1,29 @@ +package nettrace + +import ( + "testing" + "time" +) + +func TestHistogramBasic(t *testing.T) { + h := histogram{} + h.Add(1, 1*time.Millisecond) + + snap := h.Snapshot() + if snap.Count != 1 || + snap.Min != 1*time.Millisecond || + snap.Max != 1*time.Millisecond || + snap.Avg != 1*time.Millisecond { + t.Errorf("expected snapshot with only 1 sample, got %v", snap) + } +} + +func TestHistogramEmpty(t *testing.T) { + h := histogram{} + snap := h.Snapshot() + + if len(snap.Counts) != nBuckets || snap.Count != 0 || + snap.Avg != 0 || snap.Min != 0 || snap.Max != 0 { + t.Errorf("expected zero snapshot, got %v", snap) + } +} diff --git a/internal/nettrace/http.go b/internal/nettrace/http.go new file mode 100644 index 0000000..2f51acc --- /dev/null +++ b/internal/nettrace/http.go @@ -0,0 +1,259 @@ +package nettrace + +import ( + "bytes" + "embed" + "fmt" + "hash/crc32" + "html/template" + "math" + "net/http" + "sort" + "strconv" + "time" +) + +//go:embed "templates/*.tmpl" "templates/*.css" +var templatesFS embed.FS +var top *template.Template + +func init() { + top = template.Must( + template.New("_top").Funcs(template.FuncMap{ + "stripZeros": stripZeros, + "roundSeconds": roundSeconds, + "roundDuration": roundDuration, + "colorize": colorize, + "depthspan": depthspan, + "shorttitle": shorttitle, + "traceemoji": traceemoji, + }).ParseFS(templatesFS, "templates/*")) +} + +// RegisterHandler registers a the trace handler in the given ServeMux, on +// `/debug/traces`. +func RegisterHandler(mux *http.ServeMux) { + mux.HandleFunc("/debug/traces", RenderTraces) +} + +// RenderTraces is an http.Handler that renders the tracing information. +func RenderTraces(w http.ResponseWriter, req *http.Request) { + data := &struct { + Buckets *[]time.Duration + FamTraces map[string]*familyTraces + + // When displaying traces for a specific family. + Family string + Bucket int + BucketStr string + AllGT bool + Traces []*trace + + // When displaying latencies for a specific family. + Latencies *histSnapshot + + // When displaying a specific trace. + Trace *trace + AllEvents []traceAndEvent + + // Error to show to the user. + Error string + }{} + + // Reference the common buckets, no need to copy them. + data.Buckets = &buckets + + // Copy the family traces map, so we don't have to keep it locked for too + // long. We'll still need to lock individual entries. + data.FamTraces = copyFamilies() + + // Default to showing greater-than. + data.AllGT = true + if all := req.FormValue("all"); all != "" { + data.AllGT, _ = strconv.ParseBool(all) + } + + // Fill in the family related parameters. + if fam := req.FormValue("fam"); fam != "" { + if _, ok := data.FamTraces[fam]; !ok { + data.Family = "" + data.Error = "Unknown family" + w.WriteHeader(http.StatusNotFound) + goto render + } + data.Family = fam + + if bs := req.FormValue("b"); bs != "" { + i, err := strconv.Atoi(bs) + if err != nil { + data.Error = "Invalid bucket (not a number)" + w.WriteHeader(http.StatusBadRequest) + goto render + } else if i < -2 || i >= nBuckets { + data.Error = "Invalid bucket number" + w.WriteHeader(http.StatusBadRequest) + goto render + } + data.Bucket = i + data.Traces = data.FamTraces[data.Family].TracesFor(i, data.AllGT) + + switch i { + case -2: + data.BucketStr = "errors" + case -1: + data.BucketStr = "active" + default: + data.BucketStr = buckets[i].String() + } + } + } + + if lat := req.FormValue("lat"); data.Family != "" && lat != "" { + data.Latencies = data.FamTraces[data.Family].Latencies() + } + + if traceID := req.FormValue("trace"); traceID != "" { + refID := req.FormValue("ref") + tr := findInFamilies(id(traceID), id(refID)) + if tr == nil { + data.Error = "Trace not found" + w.WriteHeader(http.StatusNotFound) + goto render + } + data.Trace = tr + data.Family = tr.Family + data.AllEvents = allEvents(tr) + } + +render: + + // Write into a buffer, to avoid accidentally holding a lock on http + // writes. It shouldn't happen, but just to be extra safe. + bw := &bytes.Buffer{} + bw.Grow(16 * 1024) + err := top.ExecuteTemplate(bw, "index.html.tmpl", data) + if err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + panic(err) + } + + w.Write(bw.Bytes()) +} + +type traceAndEvent struct { + Trace *trace + Event event + Depth uint +} + +// allEvents gets all the events for the trace and its children/linked traces; +// and returns them sorted by timestamp. +func allEvents(tr *trace) []traceAndEvent { + // Map tracking all traces we've seen, to avoid loops. + seen := map[id]bool{} + + // Recursively gather all events. + evts := appendAllEvents(tr, []traceAndEvent{}, seen, 0) + + // Sort them by time. + sort.Slice(evts, func(i, j int) bool { + return evts[i].Event.When.Before(evts[j].Event.When) + }) + + return evts +} + +func appendAllEvents(tr *trace, evts []traceAndEvent, seen map[id]bool, depth uint) []traceAndEvent { + if seen[tr.ID] { + return evts + } + seen[tr.ID] = true + + subTraces := []*trace{} + + // Append all events of this trace. + trevts := tr.Events() + for _, e := range trevts { + evts = append(evts, traceAndEvent{tr, e, depth}) + if e.Ref != nil { + subTraces = append(subTraces, e.Ref) + } + } + + for _, t := range subTraces { + evts = appendAllEvents(t, evts, seen, depth+1) + } + + return evts +} + +func stripZeros(d time.Duration) string { + if d < time.Second { + _, frac := math.Modf(d.Seconds()) + return fmt.Sprintf(" .%6d", int(frac*1000000)) + } + return fmt.Sprintf("%.6f", d.Seconds()) +} + +func roundSeconds(d time.Duration) string { + return fmt.Sprintf("%.6f", d.Seconds()) +} + +func roundDuration(d time.Duration) time.Duration { + return d.Round(time.Millisecond) +} + +func colorize(depth uint, id id) template.CSS { + if depth == 0 { + return template.CSS("rgba(var(--text-color))") + } + + if depth > 3 { + depth = 3 + } + + // Must match the number of nested color variables in the CSS. + colori := crc32.ChecksumIEEE([]byte(id)) % 6 + return template.CSS( + fmt.Sprintf("var(--nested-d%02d-c%02d)", depth, colori)) +} + +func depthspan(depth uint) template.HTML { + s := `<span class="depth">` + switch depth { + case 0: + case 1: + s += "· " + case 2: + s += "· · " + case 3: + s += "· · · " + case 4: + s += "· · · · " + default: + s += fmt.Sprintf("· (%d) · ", depth) + } + + s += `</span>` + return template.HTML(s) +} + +// Hand-picked emojis that have enough visual differences in most common +// renderings, and are common enough to be able to easily describe them. +var emojids = []rune(`😀🤣😇🥰🤧😈🤡👻👽🤖👋✊🦴👅` + + `🐒🐕🦊🐱🐯🐎🐄🐷🐑🐐🐪🦒🐘🐀🦇🐓🦆🦚🦜🐢🐍🦖🐋🐟🦈🐙` + + `🦋🐜🐝🪲🌻🌲🍉🍌🍍🍎🍑🥕🍄` + + `🧀🍦🍰🧉🚂🚗🚜🛵🚲🛼🪂🚀🌞🌈🌊⚽`) + +func shorttitle(tr *trace) string { + all := tr.Family + " - " + tr.Title + if len(all) > 20 { + all = "..." + all[len(all)-17:] + } + return all +} + +func traceemoji(id id) string { + i := crc32.ChecksumIEEE([]byte(id)) % uint32(len(emojids)) + return string(emojids[i]) +} diff --git a/internal/nettrace/http_test.go b/internal/nettrace/http_test.go new file mode 100644 index 0000000..24c82f6 --- /dev/null +++ b/internal/nettrace/http_test.go @@ -0,0 +1,255 @@ +package nettrace + +import ( + "fmt" + "io" + "net/http" + "net/http/httptest" + "net/url" + "strings" + "testing" + "time" +) + +func getValues(t *testing.T, vs url.Values, code int) string { + t.Helper() + + req := httptest.NewRequest("GET", "/debug/traces?"+vs.Encode(), nil) + w := httptest.NewRecorder() + RenderTraces(w, req) + + resp := w.Result() + body, _ := io.ReadAll(resp.Body) + + if resp.StatusCode != code { + t.Errorf("expected %d, got %v", code, resp) + } + + return string(body) +} + +type v struct { + fam, b, lat, trace, ref, all string +} + +func getCode(t *testing.T, vs v, code int) string { + t.Helper() + + u := url.Values{} + if vs.fam != "" { + u.Add("fam", vs.fam) + } + if vs.b != "" { + u.Add("b", vs.b) + } + if vs.lat != "" { + u.Add("lat", vs.lat) + } + if vs.trace != "" { + u.Add("trace", vs.trace) + } + if vs.ref != "" { + u.Add("ref", vs.ref) + } + if vs.all != "" { + u.Add("all", vs.all) + } + + return getValues(t, u, code) +} + +func get(t *testing.T, fam, b, lat, trace, ref, all string) string { + t.Helper() + return getCode(t, v{fam, b, lat, trace, ref, all}, 200) +} + +func getErr(t *testing.T, fam, b, lat, trace, ref, all string, code int, err string) string { + t.Helper() + body := getCode(t, v{fam, b, lat, trace, ref, all}, code) + if !strings.Contains(body, err) { + t.Errorf("Body does not contain error message %q", err) + t.Logf("Body: %v", body) + } + + return body +} + +func checkContains(t *testing.T, body, s string) { + t.Helper() + if !strings.Contains(body, s) { + t.Errorf("Body does not contain %q", s) + t.Logf("Body: %v", body) + } +} + +func TestHTTP(t *testing.T) { + tr := New("TestHTTP", "http") + tr.Printf("entry #1") + tr.Finish() + + tr = New("TestHTTP", "http") + tr.Printf("entry #2") + tr.Finish() + + tr = New("TestHTTP", "http") + tr.Errorf("entry #3 (error)") + tr.Finish() + + tr = New("TestHTTP", "http") + tr.Printf("hola marola") + tr.Printf("entry #4") + // This one is active until the end. + defer tr.Finish() + + // Get the plain index. + body := get(t, "", "", "", "", "", "") + checkContains(t, body, "TestHTTP") + + // Get a specific family, but no bucket. + body = get(t, "TestHTTP", "", "", "", "", "") + checkContains(t, body, "TestHTTP") + + // Get a family and active bucket. + body = get(t, "TestHTTP", "-1", "", "", "", "") + checkContains(t, body, "hola marola") + + // Get a family and error bucket. + body = get(t, "TestHTTP", "-2", "", "", "", "") + checkContains(t, body, "entry #3 (error)") + + // Get a family and first bucket. + body = get(t, "TestHTTP", "0", "", "", "", "") + checkContains(t, body, "entry #2") + + // Latency view. There are 3 events because the 4th is active. + body = get(t, "TestHTTP", "", "lat", "", "", "") + checkContains(t, body, "Count: 3") + + // Get a specific trace. No family given, since it shouldn't be needed (we + // take it from the id). + body = get(t, "", "", "", string(tr.(*trace).ID), "", "") + checkContains(t, body, "hola marola") + + // Check the "all=true" views. + body = get(t, "TestHTTP", "0", "", "", "", "true") + checkContains(t, body, "entry #2") + checkContains(t, body, "?fam=TestHTTP&b=-2&all=true") + + tr.Finish() +} + +func TestHTTPLong(t *testing.T) { + // Test a long trace. + tr := New("TestHTTPLong", "verbose") + for i := 0; i < 1000; i++ { + tr.Printf("entry #%d", i) + } + tr.Finish() + get(t, "TestHTTPLong", "", "", string(tr.(*trace).ID), "", "") +} + +func TestHTTPErrors(t *testing.T) { + tr := New("TestHTTPErrors", "http") + tr.Printf("entry #1") + tr.Finish() + + // Unknown family. + getErr(t, "unkfamily", "", "", "", "", "", + 404, "Unknown family") + + // Invalid bucket. + getErr(t, "TestHTTPErrors", "abc", "", "", "", "", + 400, "Invalid bucket") + getErr(t, "TestHTTPErrors", "-3", "", "", "", "", + 400, "Invalid bucket") + getErr(t, "TestHTTPErrors", "9", "", "", "", "", + 400, "Invalid bucket") + + // Unknown trace id (malformed). + getErr(t, "TestHTTPErrors", "", "", "unktrace", "", "", + 404, "Trace not found") + + // Unknown trace id. + getErr(t, "TestHTTPErrors", "", "", string(tr.(*trace).ID)+"xxx", "", "", + 404, "Trace not found") + + // Check that the trace is actually there. + get(t, "", "", "", string(tr.(*trace).ID), "", "") +} + +func TestHTTPUroboro(t *testing.T) { + trA := New("TestHTTPUroboro", "trA") + defer trA.Finish() + trA.Printf("this is trace A") + + trB := New("TestHTTPUroboro", "trB") + defer trB.Finish() + trB.Printf("this is trace B") + + trA.Link(trB, "B is my friend") + trB.Link(trA, "A is my friend") + + // Check that we handle cross-linked events well. + get(t, "TestHTTPUroboro", "", "", "", "", "") + get(t, "TestHTTPUroboro", "-1", "", "", "", "") + get(t, "", "", "", string(trA.(*trace).ID), "", "") + get(t, "", "", "", string(trB.(*trace).ID), "", "") +} + +func TestHTTPDeep(t *testing.T) { + tr := New("TestHTTPDeep", "level-0") + defer tr.Finish() + ts := []Trace{tr} + for i := 1; i <= 9; i++ { + tr = tr.NewChild("TestHTTPDeep", fmt.Sprintf("level-%d", i)) + defer tr.Finish() + ts = append(ts, tr) + } + + // Active view. + body := get(t, "TestHTTPDeep", "-1", "", "", "", "") + checkContains(t, body, "level-9") + + // Recursive view. + body = get(t, "TestHTTPDeep", "", "", string(ts[0].(*trace).ID), "", "") + checkContains(t, body, "level-9") +} + +func TestStripZeros(t *testing.T) { + cases := []struct { + d time.Duration + exp string + }{ + {0 * time.Second, " . 0"}, + {1 * time.Millisecond, " . 1000"}, + {5 * time.Millisecond, " . 5000"}, + {1 * time.Second, "1.000000"}, + {1*time.Second + 8*time.Millisecond, "1.008000"}, + } + for _, c := range cases { + if got := stripZeros(c.d); got != c.exp { + t.Errorf("stripZeros(%s) got %q, expected %q", + c.d, got, c.exp) + } + } +} + +func TestRegisterHandler(t *testing.T) { + mux := http.NewServeMux() + RegisterHandler(mux) + + req := httptest.NewRequest("GET", "/debug/traces", nil) + w := httptest.NewRecorder() + mux.ServeHTTP(w, req) + + resp := w.Result() + + if resp.StatusCode != 200 { + t.Errorf("expected 200, got %v", resp) + } + + body, _ := io.ReadAll(resp.Body) + if !strings.Contains(string(body), "<h1>Traces</h1>") { + t.Errorf("unexpected body: %s", body) + } +} diff --git a/internal/nettrace/templates/_latency.html.tmpl b/internal/nettrace/templates/_latency.html.tmpl new file mode 100644 index 0000000..07c026a --- /dev/null +++ b/internal/nettrace/templates/_latency.html.tmpl @@ -0,0 +1,25 @@ +<table class="latencies"><tr> +<td>Count: {{.Latencies.Count}}</td> +<td>Avg: {{.Latencies.Avg | roundDuration}}</td> +<td>Min: {{.Latencies.Min | roundDuration}}</td> +<td>Max: {{.Latencies.Max | roundDuration}}</td> +</tr></table> +<p> + +<table class="latencies"> +<tr><th>Bucket</th><th>Count</th><th>%</th><th></th><th>Cumulative</th></tr> +{{range .Latencies.Counts}} +<tr> + <td> + <a href="?fam={{$.Family}}&b={{.BucketIdx}}" + {{if eq .Count 0}}class="muted"{{end}}> + ≥{{.Start}}</a> + </td> + <td>{{.Count}}</td> + <td>{{.Percent | printf "%5.2f"}}%</td> + <td><meter max="100" value="{{.Percent}}"> + {{.Percent | printf "%.2f"}}%</meter> + <td>{{.CumPct | printf "%5.2f"}}%</td> +</tr> +{{end}} +</table> diff --git a/internal/nettrace/templates/_recursive.html.tmpl b/internal/nettrace/templates/_recursive.html.tmpl new file mode 100644 index 0000000..178624d --- /dev/null +++ b/internal/nettrace/templates/_recursive.html.tmpl @@ -0,0 +1,51 @@ +{{if .Trace.Parent}} +<a href="?trace={{.Trace.Parent.ID}}&ref={{.Trace.ID}}"> + Parent: {{.Trace.Parent.Family}} - {{.Trace.Parent.Title}}</a> +<p> +{{end}} + +<table class="trace"> + +<thead> +<tr> + <th>Trace</th> + <th>Timestamp</th> + <th>Elapsed (s)</th> + <th></th> + <th>Message</th> +</tr> +</thead> + +<tbody> +{{$prev := .Trace.Start}} +{{range .AllEvents}} +<tr style="background: {{colorize .Depth .Trace.ID}};"> +<td title='{{.Trace.Family}} - {{.Trace.Title}} +@ {{.Trace.Start.Format "2006-01-02 15:04:05.999999" }}'> + {{shorttitle .Trace}}</td> + +<td class="when">{{.Event.When.Format "15:04:05.000000"}}</td> +<td class="duration">{{(.Event.When.Sub $prev) | stripZeros}}</td> +<td class="emoji" title='{{.Trace.Family}} - {{.Trace.Title}} +@ {{.Trace.Start.Format "2006-01-02 15:04:05.999999" }}'> + <div class="emoji">{{traceemoji .Trace.ID}}</div></td> +<td class="msg"> + {{- depthspan .Depth -}} + {{- if .Event.Type.IsLog -}} + {{.Event.Msg}} + {{- else if .Event.Type.IsChild -}} + new child: <a href="?trace={{.Event.Ref.ID}}&ref={{.Trace.ID}}">{{.Event.Ref.Family}} - {{.Event.Ref.Title}}</a> + {{- else if .Event.Type.IsLink -}} + <a href="?trace={{.Event.Ref.ID}}&ref={{.Trace.ID}}">{{.Event.Msg}}</a> + {{- else if .Event.Type.IsDrop -}} + <b><i>[ events dropped ]</i></b> + {{- else -}} + <i>[ unknown event type ]</i> + {{- end -}} +</td> +</tr> +{{$prev = .Event.When}} +{{end}} + +</tbody> +</table> diff --git a/internal/nettrace/templates/_single.html.tmpl b/internal/nettrace/templates/_single.html.tmpl new file mode 100644 index 0000000..2e75464 --- /dev/null +++ b/internal/nettrace/templates/_single.html.tmpl @@ -0,0 +1,35 @@ +<tr class="title"> +<td class="when">{{.Start.Format "2006-01-02 15:04:05.000000"}}</td> +<td class="duration">{{.Duration | roundSeconds}}</td> +<td><a href="?trace={{.ID}}">{{.Title}}</a> + {{if .Parent}}(parent: <a href="?trace={{.Parent.ID}}&ref={{.ID}}"> + {{.Parent.Family}} - {{.Parent.Title}}</a>) + {{end}} +</td> +<tr> + +{{$prev := .Start}} +{{range .Events}} +<tr> +<td class="when">{{.When.Format "15:04:05.000000"}}</td> +<td class="duration">{{(.When.Sub $prev) | stripZeros}}</td> +<td class="msg"> + {{- if .Type.IsLog -}} + {{.Msg}} + {{- else if .Type.IsChild -}} + new child <a href="?trace={{.Ref.ID}}&ref={{$.ID}}">{{.Ref.Family}} {{.Ref.Title}}</a> + {{- else if .Type.IsLink -}} + <a href="?trace={{.Ref.ID}}&ref={{$.ID}}">{{.Msg}}</a> + {{- else if .Type.IsDrop -}} + <i>[ events dropped ]</i> + {{- else -}} + <i>[ unknown event type ]</i> + {{- end -}} +</td> +</tr> +{{$prev = .When}} +{{end}} + +<tr> +<td> </td> +</tr> diff --git a/internal/nettrace/templates/index.html.tmpl b/internal/nettrace/templates/index.html.tmpl new file mode 100644 index 0000000..8909c34 --- /dev/null +++ b/internal/nettrace/templates/index.html.tmpl @@ -0,0 +1,129 @@ +<!DOCTYPE html> +<html lang="en"> +<head> +<style> +{{template "style.css"}} +</style> + +<title> +{{if .Trace}}{{.Trace.Family}} - {{.Trace.Title}} +{{else if .BucketStr}}{{.Family}} - {{.BucketStr}} +{{else if .Latencies}}{{.Family}} - latency +{{else}}Traces +{{end}} +</title> + +</head> + +<body> +<h1>Traces</h1> + +<table class="index"> +{{range $name, $ftr := .FamTraces}} +<tr> + <td class="family"> + <a href="?fam={{$name}}&b=0&all=true"> + {{if eq $name $.Family}}<u>{{end}} + {{$name}} + {{if eq $name $.Family}}</u>{{end}} + </a> + </td> + + <td class="bucket active"> + {{$n := $ftr.LenActive}} + {{if and (eq $name $.Family) (eq "active" $.BucketStr)}}<u>{{end}} + + <a href="?fam={{$name}}&b=-1&all={{$.AllGT}}" + {{if eq $n 0}}class="muted"{{end}}> + {{$n}} active</a> + + {{if and (eq $name $.Family) (eq "active" $.BucketStr)}}</u>{{end}} + </td> + + {{range $i, $b := $.Buckets}} + <td class="bucket"> + {{$n := $ftr.LenBucket $i}} + {{if and (eq $name $.Family) (eq $b.String $.BucketStr)}}<u>{{end}} + + <a href="?fam={{$name}}&b={{$i}}&all={{$.AllGT}}" + {{if eq $n 0}}class="muted"{{end}}> + ≥{{$b}}</a> + + {{if and (eq $name $.Family) (eq $b.String $.BucketStr)}}</u>{{end}} + </td> + {{end}} + + <td class="bucket"> + {{$n := $ftr.LenErrors}} + {{if and (eq $name $.Family) (eq "errors" $.BucketStr)}}<u>{{end}} + + <a href="?fam={{$name}}&b=-2&all={{$.AllGT}}" + {{if eq $n 0}}class="muted"{{end}}> + errors</a> + + {{if and (eq $name $.Family) (eq "errors" $.BucketStr)}}</u>{{end}} + </td> + + <td class="bucket"> + <a href="?fam={{$name}}&lat=true&all={{$.AllGT}}">[latency]</a> + </td> +</tr> +{{end}} +</table> +<br> +Show: <a href="?fam={{.Family}}&b={{.Bucket}}&all=false"> + {{if not .AllGT}}<u>{{end}} + Only in bucket</a> + {{if not .AllGT}}</u>{{end}} +/ +<a href="?fam={{.Family}}&b={{.Bucket}}&all=true"> + {{if .AllGT}}<u>{{end}} + All ≥ bucket</a> + {{if .AllGT}}</u>{{end}} +<p> + +<!---------------------------------------------> +{{if .Error}} +<p class="error">Error: {{.Error}}</p> +{{end}} + +<!---------------------------------------------> +{{if .BucketStr}} +<h2>{{.Family}} - {{.BucketStr}}</h2> + +<table class="trace"> +<thead> +<tr> + <th>Timestamp</th> + <th>Elapsed (s)</th> + <th>Message</th> +</tr> +</thead> +<tbody> +<tr> +<td> </td> +</tr> +{{range .Traces}} +{{template "_single.html.tmpl" .}}<p> +{{end}} +</tbody> +</table> + +<p> +{{end}} + +<!---------------------------------------------> +{{if .Latencies}} +<h2>{{.Family}} - latency</h2> +{{template "_latency.html.tmpl" .}}<p> +{{end}} + +<!---------------------------------------------> +{{if .Trace}} +<h2>{{.Trace.Family}} - <i>{{.Trace.Title}}</i></h2> +{{template "_recursive.html.tmpl" .}}<p> +{{end}} + +</body> + +</html> diff --git a/internal/nettrace/templates/style.css b/internal/nettrace/templates/style.css new file mode 100644 index 0000000..077db1e --- /dev/null +++ b/internal/nettrace/templates/style.css @@ -0,0 +1,187 @@ +:root { + --text-color: black; + --bg-color: #fffff7; + --zebra-bg-color: #eeeee7; + --muted-color: #444; + --xmuted-color: #a1a1a1; + --link-color: #39c; + --link-hover: #069; + --underline-color: grey; + --error-color: red; + + /* Colors for the nested zebras. */ + --nested-d01-c00: #ffebee; + --nested-d01-c01: #ede7f6; + --nested-d01-c02: #e3f2fd; + --nested-d01-c03: #e8f5e9; + --nested-d01-c04: #fff8e1; + --nested-d01-c05: #efebe9; + --nested-d02-c00: #f0dcdf; + --nested-d02-c01: #ded8e7; + --nested-d02-c02: #d4e3ee; + --nested-d02-c03: #d9e6da; + --nested-d02-c04: #f0e9d2; + --nested-d02-c05: #e0dcda; + --nested-d03-c00: #e1cdd0; + --nested-d03-c01: #cfc9d8; + --nested-d03-c02: #c5d4df; + --nested-d03-c03: #cad7cb; + --nested-d03-c04: #e1dac3; + --nested-d03-c05: #d1cdcb; +} + +@media (prefers-color-scheme: dark) { + :root { + --text-color: rgba(255, 255, 255, 0.90); + --bg-color: #121212; + --zebra-bg-color: #222222; + --muted-color: #aaaaaa; + --xmuted-color: #a1a1a1; + --link-color: #44b4ec; + --link-hover: #7fc9ee; + --underline-color: grey; + --error-color: #dd4747; + + /* Colors for the nested zebras. */ + --nested-d01-c00: #220212; + --nested-d01-c01: #1c1c22; + --nested-d01-c02: #001e20; + --nested-d01-c03: #0f0301; + --nested-d01-c04: #201d06; + --nested-d01-c05: #00192b; + --nested-d02-c00: #311121; + --nested-d02-c01: #2b2b31; + --nested-d02-c02: #0f2d2f; + --nested-d02-c03: #1e1210; + --nested-d02-c04: #2f2c15; + --nested-d02-c05: #0f283a; + --nested-d03-c00: #402030; + --nested-d03-c01: #3a3a40; + --nested-d03-c02: #1e3c3e; + --nested-d03-c03: #2d211f; + --nested-d03-c04: #3e3b24; + --nested-d03-c05: #1e3749; + } +} + +body { + font-family: sans-serif; + color: var(--text-color); + background: var(--bg-color); +} + +p.error { + color: var(--error-color); + font-size: large; +} + +a { + color: var(--link-color); + text-decoration: none; +} + +a:hover { + color: var(--link-hover); +} + +.family a { + color: var(--text-color); +} + +u { + text-decoration-color: var(--underline-color); +} + +table.index { + border-collapse: collapse; +} + +table.index tr:nth-child(odd) { + + background-color: var(--zebra-bg-color); +} + +table.index td { + padding: 0.25em 0.5em; +} + +table.index td.bucket { + min-width: 2em; + text-align: center; +} + +table.index td.active { + /* Make the "active" column wider so there's less jumping on refresh. */ + min-width: 5em; + text-align: right; +} + +table.index a { + text-decoration: none; +} + +a.muted { + color: var(--muted-color); +} + +table.trace { + font-family: monospace; + border-collapse: collapse; +} + +table.trace thead { + border-bottom: 1px solid var(--text-color); +} + +table.trace th { + text-align: left; + padding: 0.1em 1em; +} + +table.trace tr.title { + font-weight: bold; +} + +table.trace td { + padding: 0.1em 1em; +} + +table.trace td.when { + text-align: right; +} + +table.trace td.duration { + text-align: right; + white-space: pre; +} + +table.trace td.msg { + white-space: pre-wrap; +} + +span.depth { + color: var(--xmuted-color); +} + +div.emoji { + /* Emojis sometimes are rendered excessively tall. */ + /* This ensures they're sized appropriately. */ + max-height: 1.3em; + overflow: hidden; +} + +table.latencies { + text-align: right; +} + +table.latencies td { + padding: 0 0.3em; +} + +table.latencies th { + text-align: center; +} + +meter { + width: 15em; +} diff --git a/internal/nettrace/test_server.go b/internal/nettrace/test_server.go new file mode 100644 index 0000000..75f1d96 --- /dev/null +++ b/internal/nettrace/test_server.go @@ -0,0 +1,243 @@ +//go:build ignore + +package main + +import ( + "flag" + "fmt" + "math/rand" + "net/http" + "time" + + _ "net/http/pprof" + + "blitiri.com.ar/go/srv/nettrace" +) + +func main() { + addr := flag.String("addr", ":8080", "listening address") + flag.Parse() + + go RandomEvents("random-one") + go RandomEvents("random-two") + go RandomEvents("random-three") + go RandomLongEvent("random-long", "long-one") + go RandomLongEvent("random-long", "long-two") + go RandomLongEvent("random-long", "long-three") + go RandomBunny("random-bunny", "first 🐇") + go RandomNested("random-nested") + go RandomLazy("random-lazy") + + http.DefaultServeMux.Handle("/", + WithLogging(http.HandlerFunc(HandleRoot))) + + http.DefaultServeMux.Handle("/debug/traces", + WithLogging(http.HandlerFunc(nettrace.RenderTraces))) + + fmt.Printf("listening on %s\n", *addr) + http.ListenAndServe(*addr, nil) +} + +func RandomEvents(family string) { + for i := 0; ; i++ { + tr := nettrace.New(family, fmt.Sprintf("evt-%d", i)) + randomTrace(family, tr) + } +} + +func randomTrace(family string, tr nettrace.Trace) { + tr.Printf("this is a random event") + tr.Printf("and it has a random delay") + delay := time.Duration(rand.Intn(1000)) * time.Millisecond + tr.Printf("sleeping %v", delay) + time.Sleep(delay) + + if rand.Intn(100) < 1 { + tr.Printf("this unlucky one is an error") + tr.SetError() + } + + if rand.Intn(100) < 10 { + tr.Printf("this one got super verbose!") + for j := 0; j < 100; j++ { + tr.Printf("message %d", j) + } + } + + if rand.Intn(100) < 30 { + tr.Printf("this one had a child") + c := tr.NewChild(family, "achild") + go randomTrace(family, c) + } + + tr.Printf("all done") + tr.Finish() +} + +func RandomLongEvent(family, title string) { + tr := nettrace.New(family, title) + tr.Printf("this is a random long event") + for i := 0; ; i++ { + delay := time.Duration(rand.Intn(100)) * time.Millisecond + time.Sleep(delay) + tr.Printf("message %d, slept %v", i, delay) + } + tr.Finish() +} + +func RandomBunny(family, title string) { + tr := nettrace.New(family, title) + tr.SetMaxEvents(100) + tr.Printf("this is the top 🐇") + for i := 0; ; i++ { + delay := time.Duration(rand.Intn(100)) * time.Millisecond + time.Sleep(delay) + tr.Printf("message %d, slept %v", i, delay) + + if rand.Intn(100) < 40 { + c := tr.NewChild(family, fmt.Sprintf("child-%d", i)) + go randomTrace(family, c) + } + + if rand.Intn(100) < 40 { + n := nettrace.New(family, fmt.Sprintf("linked-%d", i)) + go randomTrace(family, n) + tr.Link(n, "linking with this guy") + } + } + tr.Finish() +} + +func randomNested(family string, depth int, parent nettrace.Trace) { + tr := parent.NewChild(family, fmt.Sprintf("nested-%d", depth)) + defer tr.Finish() + + tr.Printf("I am a spoiled child") + + delay := time.Duration(rand.Intn(100)) * time.Millisecond + time.Sleep(delay) + tr.Printf("slept %v", delay) + + if depth > 10 { + tr.Printf("I went too far") + return + } + + // If we make this < 50, then it grows forever. + if rand.Intn(100) < 75 { + tr.Printf("I sang really well") + return + } + + max := rand.Intn(5) + for i := 0; i < max; i++ { + tr.Printf("spawning %d", i) + go randomNested(family, depth+1, tr) + } + +} +func RandomNested(family string) { + tr := nettrace.New(family, "nested-0") + for i := 0; ; i++ { + randomNested(family, 1, tr) + } +} + +func RandomLazy(family string) { + for i := 0; ; i++ { + tr := nettrace.New(family, fmt.Sprintf("evt-%d", i)) + tr.Printf("I am very lazy and do little work") + tr.Finish() + time.Sleep(500 * time.Millisecond) + } +} + +func HandleRoot(w http.ResponseWriter, r *http.Request) { + if delayS := r.FormValue("delay"); delayS != "" { + delay, err := time.ParseDuration(delayS) + if err != nil { + http.Error(w, err.Error(), http.StatusBadRequest) + return + } + time.Sleep(delay) + } + + if withError := r.FormValue("error"); withError != "" { + tr, _ := nettrace.FromContext(r.Context()) + tr.SetError() + } + + w.Write([]byte(rootHTML)) +} + +func WithLogging(parent http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + tr := nettrace.New("http", r.URL.String()) + defer tr.Finish() + + // Associate the trace with this request. + r = r.WithContext(nettrace.NewContext(r.Context(), tr)) + + tr.Printf("%s %s %s %s", + r.RemoteAddr, r.Proto, r.Method, r.URL.String()) + start := time.Now() + parent.ServeHTTP(w, r) + latency := time.Since(start) + tr.Printf("handler took %s", latency) + + // 1.2.3.4:34575 HTTP/2.0 GET /path = 1.2ms + fmt.Printf("%s %s %s %s = %s\n", + r.RemoteAddr, r.Proto, r.Method, r.URL.String(), + latency) + }) +} + +const rootHTML = ` +<html> +<head> +<style> + td { + min-width: 2em; + text-align: center; + } + input#delay { + width: 5em; + } +</style> +</head> +<body> + +<a href="/debug/traces">Traces</a><p> + +<table> + <tr> + <td>Delay:</td> + <td><a href="/?delay=0s">0s</a></td> + <td><a href="/?delay=0.1s">0.1s</a></td> + <td><a href="/?delay=0.2s">0.2s</a></td> + <td><a href="/?delay=0.5s">0.5s</a></td> + <td><a href="/?delay=1s">1s</a></td> + </tr> + + <tr> + <td>+ error:</td> + <td><a href="/?delay=0&error=on">0s</a></td> + <td><a href="/?delay=0.1s&error=on">0.1s</a></td> + <td><a href="/?delay=0.2s&error=on">0.2s</a></td> + <td><a href="/?delay=0.5s&error=on">0.5s</a></td> + <td><a href="/?delay=1s&error=on">1s</a></td> + </tr> +</table> + +<form action="/" method="get"> + <label for="delay">Custom delay:</label> + <input type="text" id="delay" name="delay" placeholder="250ms" + autofocus required> + <input type="checkbox" id="error" name="error"> + <label for="error">is error</label> + <input type="submit"> +</form> + +</body> +</html> +` diff --git a/internal/nettrace/trace.go b/internal/nettrace/trace.go new file mode 100644 index 0000000..7013e48 --- /dev/null +++ b/internal/nettrace/trace.go @@ -0,0 +1,558 @@ +// Package nettrace implements tracing of requests. Traces are created by +// nettrace.New, and can then be viewed over HTTP on /debug/traces. +package nettrace + +import ( + "container/ring" + "fmt" + "math/rand" + "sort" + "strconv" + "strings" + "sync" + "time" +) + +// IDs are of the form "family!timestamp!unique identifier", which allows for +// sorting them by time much easily, and also some convenient optimizations +// when looking up an id across all the known ones. +// Family is not escaped. It should not contain the separator. +// It is not expected to be stable, for internal use only. +type id string + +func newID(family string, ts int64) id { + return id( + family + "!" + + strconv.FormatInt(ts, 10) + "!" + + strconv.FormatUint(rand.Uint64(), 10)) +} + +func (id id) Family() string { + sp := strings.SplitN(string(id), "!", 2) + if len(sp) != 2 { + return string(id) + } + return sp[0] +} + +// Trace represents a single request trace. +type Trace interface { + // NewChild creates a new trace, that is a child of this one. + NewChild(family, title string) Trace + + // Link to another trace with the given message. + Link(other Trace, msg string) + + // SetMaxEvents sets the maximum number of events that will be stored in + // the trace. It must be called right after initialization. + SetMaxEvents(n int) + + // SetError marks that the trace was for an error event. + SetError() + + // Printf adds a message to the trace. + Printf(format string, a ...interface{}) + + // Errorf adds a message to the trace, marks it as an error, and returns + // an error for it. + Errorf(format string, a ...interface{}) error + + // Finish marks the trace as complete. + // The trace should not be used after calling this method. + Finish() +} + +// A single trace. Can be active or inactive. +// Exported fields are allowed to be accessed directly, e.g. by the HTTP +// handler. Private ones are mutex protected. +type trace struct { + ID id + + Family string + Title string + + Parent *trace + + Start time.Time + + // Fields below are mu-protected. + // We keep them unexported so they're not accidentally accessed in html + // templates. + mu sync.Mutex + + end time.Time + + isError bool + maxEvents int + + // We keep two separate groups: the first ~1/3rd events in a simple slice, + // and the last 2/3rd in a ring so we can drop events without losing the + // first ones. + cutoff int + firstEvents []event + lastEvents *evtRing +} + +type evtType uint8 + +const ( + evtLOG = evtType(1 + iota) + evtCHILD + evtLINK + evtDROP +) + +func (t evtType) IsLog() bool { return t == evtLOG } +func (t evtType) IsChild() bool { return t == evtCHILD } +func (t evtType) IsLink() bool { return t == evtLINK } +func (t evtType) IsDrop() bool { return t == evtDROP } + +type event struct { + When time.Time + Type evtType + + Ref *trace + Msg string +} + +const defaultMaxEvents = 30 + +func newTrace(family, title string) *trace { + start := time.Now() + tr := &trace{ + ID: newID(family, start.UnixNano()), + Family: family, + Title: title, + Start: start, + + maxEvents: defaultMaxEvents, + cutoff: defaultMaxEvents / 3, + } + + // Pre-allocate a couple of events to speed things up. + // Don't allocate lastEvents, that can be expensive and it is not always + // needed. No need to slow down trace creation just for it. + tr.firstEvents = make([]event, 0, 4) + + familiesMu.Lock() + ft, ok := families[family] + if !ok { + ft = newFamilyTraces() + families[family] = ft + } + familiesMu.Unlock() + + ft.mu.Lock() + ft.active[tr.ID] = tr + ft.mu.Unlock() + + return tr +} + +// New creates a new trace with the given family and title. +func New(family, title string) Trace { + return newTrace(family, title) +} + +func (tr *trace) append(evt *event) { + tr.mu.Lock() + defer tr.mu.Unlock() + + if len(tr.firstEvents) < tr.cutoff { + tr.firstEvents = append(tr.firstEvents, *evt) + return + } + + if tr.lastEvents == nil { + // The ring holds the last 2/3rds of the events. + tr.lastEvents = newEvtRing(tr.maxEvents - tr.cutoff) + } + + tr.lastEvents.Add(evt) +} + +// String is for debugging only. +func (tr *trace) String() string { + return fmt.Sprintf("trace{%s, %s, %q, %d}", + tr.Family, tr.Title, tr.ID, len(tr.Events())) +} + +func (tr *trace) NewChild(family, title string) Trace { + c := newTrace(family, title) + c.Parent = tr + + // Add the event to the parent. + evt := &event{ + When: time.Now(), + Type: evtCHILD, + Ref: c, + } + tr.append(evt) + + return c +} + +func (tr *trace) Link(other Trace, msg string) { + evt := &event{ + When: time.Now(), + Type: evtLINK, + Ref: other.(*trace), + Msg: msg, + } + tr.append(evt) +} + +func (tr *trace) SetMaxEvents(n int) { + // Set a minimum of 6, so the truncation works without running into + // issues. + if n < 6 { + n = 6 + } + tr.mu.Lock() + tr.maxEvents = n + tr.cutoff = n / 3 + tr.mu.Unlock() +} + +func (tr *trace) SetError() { + tr.mu.Lock() + tr.isError = true + tr.mu.Unlock() +} + +func (tr *trace) Printf(format string, a ...interface{}) { + evt := &event{ + When: time.Now(), + Type: evtLOG, + Msg: fmt.Sprintf(format, a...), + } + + tr.append(evt) +} + +func (tr *trace) Errorf(format string, a ...interface{}) error { + tr.SetError() + err := fmt.Errorf(format, a...) + tr.Printf(err.Error()) + return err +} + +func (tr *trace) Finish() { + tr.mu.Lock() + tr.end = time.Now() + tr.mu.Unlock() + + familiesMu.Lock() + ft := families[tr.Family] + familiesMu.Unlock() + ft.finalize(tr) +} + +// Duration of this trace. +func (tr *trace) Duration() time.Duration { + tr.mu.Lock() + start, end := tr.Start, tr.end + tr.mu.Unlock() + + if end.IsZero() { + return time.Since(start) + } + return end.Sub(start) +} + +// Events returns a copy of the trace events. +func (tr *trace) Events() []event { + tr.mu.Lock() + defer tr.mu.Unlock() + + evts := make([]event, len(tr.firstEvents)) + copy(evts, tr.firstEvents) + + if tr.lastEvents == nil { + return evts + } + + if !tr.lastEvents.firstDrop.IsZero() { + evts = append(evts, event{ + When: tr.lastEvents.firstDrop, + Type: evtDROP, + }) + } + + tr.lastEvents.Do(func(e *event) { + evts = append(evts, *e) + }) + + return evts +} + +func (tr *trace) IsError() bool { + tr.mu.Lock() + defer tr.mu.Unlock() + return tr.isError +} + +// +// Trace hierarchy +// +// Each trace belongs to a family. For each family, we have all active traces, +// and then N traces that finished <1s, N that finished <2s, etc. + +// We keep this many buckets of finished traces. +const nBuckets = 8 + +// Buckets to use. Lenght must match nBuckets. +// "Traces with a latency >= $duration". +var buckets = []time.Duration{ + time.Duration(0), + 5 * time.Millisecond, + 10 * time.Millisecond, + 50 * time.Millisecond, + 100 * time.Millisecond, + 300 * time.Millisecond, + 1 * time.Second, + 10 * time.Second, +} + +func findBucket(latency time.Duration) int { + for i, d := range buckets { + if latency >= d { + continue + } + return i - 1 + } + + return nBuckets - 1 +} + +// How many traces we keep per bucket. +const tracesInBucket = 10 + +type traceRing struct { + ring *ring.Ring + max int + l int +} + +func newTraceRing(n int) *traceRing { + return &traceRing{ + ring: ring.New(n), + max: n, + } +} + +func (r *traceRing) Add(tr *trace) { + r.ring.Value = tr + r.ring = r.ring.Next() + if r.l < r.max { + r.l++ + } +} + +func (r *traceRing) Len() int { + return r.l +} + +func (r *traceRing) Do(f func(tr *trace)) { + r.ring.Do(func(x interface{}) { + if x == nil { + return + } + f(x.(*trace)) + }) +} + +type familyTraces struct { + mu sync.Mutex + + // All active ones. + active map[id]*trace + + // The ones we decided to keep. + // Each bucket is a ring-buffer, finishedHead keeps the head pointer. + finished [nBuckets]*traceRing + + // The ones that errored have their own bucket. + errors *traceRing + + // Histogram of latencies. + latencies histogram +} + +func newFamilyTraces() *familyTraces { + ft := &familyTraces{} + ft.active = map[id]*trace{} + for i := 0; i < nBuckets; i++ { + ft.finished[i] = newTraceRing(tracesInBucket) + } + ft.errors = newTraceRing(tracesInBucket) + return ft +} + +func (ft *familyTraces) LenActive() int { + ft.mu.Lock() + defer ft.mu.Unlock() + return len(ft.active) +} + +func (ft *familyTraces) LenErrors() int { + ft.mu.Lock() + defer ft.mu.Unlock() + return ft.errors.Len() +} + +func (ft *familyTraces) LenBucket(b int) int { + ft.mu.Lock() + defer ft.mu.Unlock() + return ft.finished[b].Len() +} + +func (ft *familyTraces) TracesFor(b int, allgt bool) []*trace { + ft.mu.Lock() + defer ft.mu.Unlock() + + trs := []*trace{} + appendTrace := func(tr *trace) { + trs = append(trs, tr) + } + if b == -2 { + ft.errors.Do(appendTrace) + } else if b == -1 { + for _, tr := range ft.active { + appendTrace(tr) + } + } else if b < nBuckets { + ft.finished[b].Do(appendTrace) + if allgt { + for i := b + 1; i < nBuckets; i++ { + ft.finished[i].Do(appendTrace) + } + } + } + + // Sort them by start, newer first. This is the order that will be used + // when displaying them. + sort.Slice(trs, func(i, j int) bool { + return trs[i].Start.After(trs[j].Start) + }) + return trs +} + +func (ft *familyTraces) find(id id) *trace { + ft.mu.Lock() + defer ft.mu.Unlock() + + if tr, ok := ft.active[id]; ok { + return tr + } + + var found *trace + for _, bs := range ft.finished { + bs.Do(func(tr *trace) { + if tr.ID == id { + found = tr + } + }) + if found != nil { + return found + } + } + + ft.errors.Do(func(tr *trace) { + if tr.ID == id { + found = tr + } + }) + if found != nil { + return found + } + + return nil +} + +func (ft *familyTraces) finalize(tr *trace) { + latency := tr.end.Sub(tr.Start) + b := findBucket(latency) + + ft.mu.Lock() + + // Delete from the active list. + delete(ft.active, tr.ID) + + // Add it to the corresponding finished bucket, based on the trace + // latency. + ft.finished[b].Add(tr) + + // Errors go on their own list, in addition to the above. + if tr.isError { + ft.errors.Add(tr) + } + + ft.latencies.Add(b, latency) + + ft.mu.Unlock() +} + +func (ft *familyTraces) Latencies() *histSnapshot { + ft.mu.Lock() + defer ft.mu.Unlock() + return ft.latencies.Snapshot() +} + +// +// Global state +// + +var ( + familiesMu sync.Mutex + families = map[string]*familyTraces{} +) + +func copyFamilies() map[string]*familyTraces { + n := map[string]*familyTraces{} + + familiesMu.Lock() + for f, trs := range families { + n[f] = trs + } + familiesMu.Unlock() + + return n +} + +func findInFamilies(traceID id, refID id) *trace { + // First, try to find it via the family. + family := traceID.Family() + familiesMu.Lock() + fts, ok := families[family] + familiesMu.Unlock() + + if ok { + tr := fts.find(traceID) + if tr != nil { + return tr + } + } + + // If that fail and we have a reference, try finding via it. + // The reference can be a parent or a child. + if refID != id("") { + ref := findInFamilies(refID, "") + if ref == nil { + return nil + } + + // Is the reference's parent the one we're looking for? + if ref.Parent != nil && ref.Parent.ID == traceID { + return ref.Parent + } + + // Try to find it in the ref's events. + for _, e := range ref.Events() { + if e.Ref != nil && e.Ref.ID == traceID { + return e.Ref + } + } + } + return nil +} diff --git a/internal/nettrace/trace_test.go b/internal/nettrace/trace_test.go new file mode 100644 index 0000000..ebdb5c5 --- /dev/null +++ b/internal/nettrace/trace_test.go @@ -0,0 +1,215 @@ +package nettrace + +import ( + "fmt" + "strings" + "testing" + "time" +) + +func expectEvents(t *testing.T, tr Trace, n int) { + t.Helper() + if evts := tr.(*trace).Events(); len(evts) != n { + t.Errorf("expected %d events, got %d", n, len(evts)) + t.Logf("%v", evts) + } +} + +func TestBasic(t *testing.T) { + var tr Trace = New("TestBasic", "basic") + defer tr.Finish() + tr.Printf("hola marola") + tr.Printf("hola marola 2") + + c1 := tr.NewChild("TestBasic", "basic child") + c1.Printf("hijo de la noche") + + expectEvents(t, tr, 3) + + if s := tr.(*trace).String(); !strings.Contains(s, "TestBasic, basic") { + t.Errorf("tr.String does not contain family and title: %q", s) + } +} + +func TestLong(t *testing.T) { + tr := New("TestLong", "long") + defer tr.Finish() + tr.SetMaxEvents(100) + + // First 90 events, no drop. + for i := 0; i < 90; i++ { + tr.Printf("evt %d", i) + } + expectEvents(t, tr, 90) + + // Up to 99, still no drop. + for i := 0; i < 9; i++ { + tr.Printf("evt %d", i) + } + expectEvents(t, tr, 99) + + // Note that we go up to 101 due to rounding errors, we're ok with it. + tr.Printf("evt 100") + expectEvents(t, tr, 100) + tr.Printf("evt 101") + expectEvents(t, tr, 101) + tr.Printf("evt 102") + expectEvents(t, tr, 101) + + // Add more events, expect none of them to exceed 101. + for i := 0; i < 9; i++ { + tr.Printf("evt %d", i) + expectEvents(t, tr, 101) + } +} + +func TestIsError(t *testing.T) { + tr := New("TestIsError", "long") + defer tr.Finish() + if tr.(*trace).IsError() != false { + tr.Errorf("new trace is error") + } + + tr.Errorf("this is an error") + if tr.(*trace).IsError() != true { + tr.Errorf("error not recorded properly") + } +} + +func TestFindViaRef(t *testing.T) { + parent := New("TestFindViaRef", "parent") + parentID := parent.(*trace).ID + defer parent.Finish() + child := parent.NewChild("TestFindViaRef", "child") + childID := child.(*trace).ID + defer child.Finish() + + // Basic check that both can be directly found. + if f := findInFamilies(parentID, id("")); f != parent { + t.Errorf("didn't find parent directly, found: %v", f) + } + if f := findInFamilies(childID, id("")); f != child { + t.Errorf("didn't find child directly, found: %v", f) + } + + // Hackily remove child from the active traces, to force a reference + // lookup when needed. + familiesMu.Lock() + delete(families["TestFindViaRef"].active, child.(*trace).ID) + familiesMu.Unlock() + + // Now the child should not be findable directly anymore. + if f := findInFamilies(childID, id("")); f != nil { + t.Errorf("child should not be findable directly, found: %v", f) + } + + // But we still should be able to get to it via the parent. + if f := findInFamilies(childID, parentID); f != child { + t.Errorf("didn't find child via parent, found: %v", f) + } +} + +func TestMaxEvents(t *testing.T) { + tr := trace{} + + // Test that we keep a minimum, and that the cutoff behaves as expected. + cases := []struct{ me, exp, cutoffExp int }{ + {0, 6, 2}, + {5, 6, 2}, + {6, 6, 2}, + {7, 7, 2}, + {8, 8, 2}, + {9, 9, 3}, + {12, 12, 4}, + } + for _, c := range cases { + tr.SetMaxEvents(c.me) + if got := tr.maxEvents; got != c.exp { + t.Errorf("set max events to %d, expected %d, got %d", + c.me, c.exp, got) + } + if got := tr.cutoff; got != c.cutoffExp { + t.Errorf("set max events to %d, expected cutoff %d, got %d", + c.me, c.cutoffExp, got) + } + } +} + +func TestFind(t *testing.T) { + // Make sure we find the right bucket, including for latencies above the + // last one. + for i, d := range buckets { + found := findBucket(d + 1*time.Millisecond) + if found != i { + t.Errorf("find bucket [%s + 1ms] got %d, expected %d", + d, found, i) + } + } + + // Create a family, populate it with traces in all buckets. + finished := [nBuckets]*trace{} + for i, d := range buckets { + lat := d + 1*time.Millisecond + tr := newTrace("TestFind", fmt.Sprintf("evt-%s", lat)) + tr.end = tr.Start.Add(lat) + families[tr.Family].finalize(tr) + finished[i] = tr + } + + // Also have an active trace. + activeTr := newTrace("TestFind", "active") + + // And add an error trace, which isn't on any of the other buckets (to + // simulate that they've been rotated out of the latency buckets, but are + // still around in errors) + errTr := newTrace("TestFind", "evt-err") + errTr.end = errTr.Start.Add(666 * time.Millisecond) + errTr.SetError() + delete(families[errTr.Family].active, errTr.ID) + families[errTr.Family].errors.Add(errTr) + + // Find all of them. + for i := range buckets { + found := findInFamilies(finished[i].ID, "") + if found != finished[i] { + t.Errorf("finding trace %d on bucket %s, expected %v, got %v", + i, buckets[i], finished[i], found) + } + } + if found := findInFamilies(activeTr.ID, ""); found != activeTr { + t.Errorf("finding active trace, expected %v, got %v", + activeTr, found) + } + if found := findInFamilies(errTr.ID, ""); found != errTr { + t.Errorf("finding error trace, expected %v, got %v", + errTr, found) + } + + // Non-existent traces. + if found := findInFamilies("does!notexist", ""); found != nil { + t.Errorf("finding non-existent trace, expected nil, got %v", found) + } + if found := findInFamilies("does!notexist", "does!notexist"); found != nil { + t.Errorf("finding non-existent trace w/ref, expected nil, got %v", found) + } +} + +func TestFindParent(t *testing.T) { + // Direct parent finding. + // If the ref is the parent, we should find it even if the target trace + // isn't known to the family (e.g. the child is there, but the parent has + // been rotated and is no longer referenced). + + parent := newTrace("TestFindParent", "parent") + child := parent.NewChild("TestFindParent", "child").(*trace) + + // Remove the parent from the active list. + delete(families[parent.Family].active, parent.ID) + + if found := findInFamilies(parent.ID, ""); found != nil { + t.Errorf("finding parent without ref, expected nil, got %v", found) + } + if found := findInFamilies(parent.ID, child.ID); found != parent { + t.Errorf("finding parent with ref, expected %v, got %v", parent, found) + } +} diff --git a/internal/trace/trace.go b/internal/trace/trace.go index 8dc9411..27aeba4 100644 --- a/internal/trace/trace.go +++ b/internal/trace/trace.go @@ -1,23 +1,15 @@ -// Package trace extends golang.org/x/net/trace. +// Package trace extends nettrace with logging. package trace import ( "fmt" - "net/http" "strconv" + "blitiri.com.ar/go/chasquid/internal/nettrace" "blitiri.com.ar/go/log" - - nettrace "golang.org/x/net/trace" ) func init() { - // golang.org/x/net/trace has its own authorization which by default only - // allows localhost. This can be confusing and limiting in environments - // which access the monitoring server remotely. - nettrace.AuthRequest = func(req *http.Request) (any, sensitive bool) { - return true, true - } } // A Trace represents an active request. @@ -38,9 +30,16 @@ func New(family, title string) *Trace { return t } +// NewChild creates a new child trace. +func (t *Trace) NewChild(family, title string) *Trace { + n := &Trace{family, title, t.t.NewChild(family, title)} + n.t.SetMaxEvents(30) + return n +} + // Printf adds this message to the trace's log. func (t *Trace) Printf(format string, a ...interface{}) { - t.t.LazyPrintf(format, a...) + t.t.Printf(format, a...) log.Log(log.Info, 1, "%s %s: %s", t.family, t.title, quote(fmt.Sprintf(format, a...))) @@ -48,7 +47,7 @@ func (t *Trace) Printf(format string, a ...interface{}) { // Debugf adds this message to the trace's log, with a debugging level. func (t *Trace) Debugf(format string, a ...interface{}) { - t.t.LazyPrintf(format, a...) + t.t.Printf(format, a...) log.Log(log.Debug, 1, "%s %s: %s", t.family, t.title, quote(fmt.Sprintf(format, a...))) @@ -59,7 +58,7 @@ func (t *Trace) Errorf(format string, a ...interface{}) error { // Note we can't just call t.Error here, as it breaks caller logging. err := fmt.Errorf(format, a...) t.t.SetError() - t.t.LazyPrintf("error: %v", err) + t.t.Printf("error: %v", err) log.Log(log.Info, 1, "%s %s: error: %s", t.family, t.title, quote(err.Error())) @@ -70,7 +69,7 @@ func (t *Trace) Errorf(format string, a ...interface{}) error { // trace's log. func (t *Trace) Error(err error) error { t.t.SetError() - t.t.LazyPrintf("error: %v", err) + t.t.Printf("error: %v", err) log.Log(log.Info, 1, "%s %s: error: %s", t.family, t.title, quote(err.Error())) @@ -83,45 +82,6 @@ func (t *Trace) Finish() { t.t.Finish() } -// EventLog is used for tracing long-lived objects. -type EventLog struct { - family string - title string - e nettrace.EventLog -} - -// NewEventLog returns a new EventLog. -func NewEventLog(family, title string) *EventLog { - return &EventLog{family, title, nettrace.NewEventLog(family, title)} -} - -// Printf adds the message to the EventLog. -func (e *EventLog) Printf(format string, a ...interface{}) { - e.e.Printf(format, a...) - - log.Log(log.Info, 1, "%s %s: %s", e.family, e.title, - quote(fmt.Sprintf(format, a...))) -} - -// Debugf adds the message to the EventLog, with a debugging level. -func (e *EventLog) Debugf(format string, a ...interface{}) { - e.e.Printf(format, a...) - - log.Log(log.Debug, 1, "%s %s: %s", e.family, e.title, - quote(fmt.Sprintf(format, a...))) -} - -// Errorf adds the message to the EventLog, with an error level. -func (e *EventLog) Errorf(format string, a ...interface{}) error { - err := fmt.Errorf(format, a...) - e.e.Errorf("error: %v", err) - - log.Log(log.Info, 1, "%s %s: error: %s", - e.family, e.title, quote(err.Error())) - - return err -} - func quote(s string) string { qs := strconv.Quote(s) return qs[1 : len(qs)-1]