git » chasquid » commit 3e6dd12

Improve debugging and tracing information

author Alberto Bertogli
2016-10-08 00:27:13 UTC
committer Alberto Bertogli
2016-10-09 23:51:05 UTC
parent 2b801a84d19adbed3007d4f8c19f9df43f06dea6

Improve debugging and tracing information

This patch reviews various debug and informational messages, making more
uniform use of tracing, and extends the monitoring http server with
useful information like an index and a queue dump.

chasquid.go +99 -25
internal/courier/procmail.go +3 -2
internal/courier/smtp.go +8 -7
internal/queue/queue.go +44 -28
internal/trace/trace.go +23 -8

diff --git a/chasquid.go b/chasquid.go
index 2bc34ff..a7c80c1 100644
--- a/chasquid.go
+++ b/chasquid.go
@@ -9,7 +9,6 @@ import (
 	"io/ioutil"
 	"math/rand"
 	"net"
-	"net/http"
 	"net/mail"
 	"net/textproto"
 	"os"
@@ -32,6 +31,7 @@ import (
 	"blitiri.com.ar/go/chasquid/internal/trace"
 	"blitiri.com.ar/go/chasquid/internal/userdb"
 
+	"net/http"
 	_ "net/http/pprof"
 
 	"github.com/golang/glog"
@@ -67,9 +67,7 @@ func main() {
 	os.Chdir(*configDir)
 
 	if conf.MonitoringAddress != "" {
-		glog.Infof("Monitoring HTTP server listening on %s",
-			conf.MonitoringAddress)
-		go http.ListenAndServe(conf.MonitoringAddress, nil)
+		launchMonitoringServer(conf.MonitoringAddress)
 	}
 
 	s := NewServer()
@@ -301,6 +299,11 @@ func (s *Server) InitQueue(path string, localC, remoteC courier.Courier) {
 		glog.Fatalf("Error loading queue: %v", err)
 	}
 	s.queue = q
+
+	http.HandleFunc("/debug/queue",
+		func(w http.ResponseWriter, r *http.Request) {
+			w.Write([]byte(q.DumpString()))
+		})
 }
 
 // periodicallyReload some of the server's information, such as aliases and
@@ -395,6 +398,9 @@ type Conn struct {
 	mode         SocketMode
 	tlsConnState *tls.ConnectionState
 
+	// Tracer to use.
+	tr *trace.Trace
+
 	// System configuration.
 	config *config.Config
 
@@ -442,9 +448,8 @@ type Conn struct {
 func (c *Conn) Handle() {
 	defer c.netconn.Close()
 
-	tr := trace.New("SMTP", "Connection")
-	defer tr.Finish()
-	tr.LazyPrintf("RemoteAddr: %s", c.netconn.RemoteAddr())
+	c.tr = trace.New("SMTP.Conn", c.netconn.RemoteAddr().String())
+	defer c.tr.Finish()
 
 	c.tc.PrintfLine("220 %s ESMTP chasquid", c.hostname)
 
@@ -454,8 +459,8 @@ func (c *Conn) Handle() {
 loop:
 	for {
 		if time.Since(c.deadline) > 0 {
-			tr.LazyPrintf("connection deadline exceeded")
 			err = fmt.Errorf("connection deadline exceeded")
+			c.tr.Error(err)
 			break
 		}
 
@@ -467,7 +472,7 @@ loop:
 			break
 		}
 
-		tr.LazyPrintf("-> %s %s", cmd, params)
+		c.tr.Debugf("-> %s %s", cmd, params)
 
 		var code int
 		var msg string
@@ -493,11 +498,11 @@ loop:
 			code, msg = c.RCPT(params)
 		case "DATA":
 			// DATA handles the whole sequence.
-			code, msg = c.DATA(params, tr)
+			code, msg = c.DATA(params)
 		case "STARTTLS":
-			code, msg = c.STARTTLS(params, tr)
+			code, msg = c.STARTTLS(params)
 		case "AUTH":
-			code, msg = c.AUTH(params, tr)
+			code, msg = c.AUTH(params)
 		case "QUIT":
 			c.writeResponse(221, "Be seeing you...")
 			break loop
@@ -507,7 +512,12 @@ loop:
 		}
 
 		if code > 0 {
-			tr.LazyPrintf("<- %d  %s", code, msg)
+			c.tr.Debugf("<- %d  %s", code, msg)
+
+			// Be verbose about errors, to help troubleshooting.
+			if code >= 400 {
+				c.tr.Errorf("%s failed: %d  %s", cmd, code, msg)
+			}
 
 			err = c.writeResponse(code, msg)
 			if err != nil {
@@ -517,8 +527,7 @@ loop:
 	}
 
 	if err != nil {
-		tr.LazyPrintf("exiting with error: %v", err)
-		tr.SetError()
+		c.tr.Errorf("exiting with error: %v", err)
 	}
 }
 
@@ -624,6 +633,7 @@ func (c *Conn) MAIL(params string) (code int, msg string) {
 		if tcp, ok := c.netconn.RemoteAddr().(*net.TCPAddr); ok {
 			c.spfResult, c.spfError = spf.CheckHost(
 				tcp.IP, envelope.DomainOf(e.Address))
+			c.tr.Debugf("SPF %v (%v)", c.spfResult, c.spfError)
 			// https://tools.ietf.org/html/rfc7208#section-8
 			// We opt not to fail on errors, to avoid accidents to prevent
 			// delivery.
@@ -688,7 +698,7 @@ func (c *Conn) RCPT(params string) (code int, msg string) {
 	return 250, "You have an eerie feeling..."
 }
 
-func (c *Conn) DATA(params string, tr *trace.Trace) (code int, msg string) {
+func (c *Conn) DATA(params string) (code int, msg string) {
 	if c.mailFrom == "" {
 		return 503, "sender not yet given"
 	}
@@ -703,7 +713,7 @@ func (c *Conn) DATA(params string, tr *trace.Trace) (code int, msg string) {
 		return 554, fmt.Sprintf("error writing DATA response: %v", err)
 	}
 
-	tr.LazyPrintf("<- 354  You experience a strange sense of peace")
+	c.tr.Debugf("<- 354  You experience a strange sense of peace")
 
 	// Increase the deadline for the data transfer to the connection-level
 	// one, we don't want the command timeout to interfere.
@@ -715,7 +725,7 @@ func (c *Conn) DATA(params string, tr *trace.Trace) (code int, msg string) {
 		return 554, fmt.Sprintf("error reading DATA: %v", err)
 	}
 
-	tr.LazyPrintf("-> ... %d bytes of data", len(c.data))
+	c.tr.Debugf("-> ... %d bytes of data", len(c.data))
 
 	c.addReceivedHeader()
 
@@ -723,12 +733,10 @@ func (c *Conn) DATA(params string, tr *trace.Trace) (code int, msg string) {
 	// individual deliveries fail, we report via email.
 	msgID, err := c.queue.Put(c.hostname, c.mailFrom, c.rcptTo, c.data)
 	if err != nil {
-		tr.LazyPrintf("   error queueing: %v", err)
-		tr.SetError()
 		return 554, fmt.Sprintf("Failed to enqueue message: %v", err)
 	}
 
-	tr.LazyPrintf("   ... queued: %q", msgID)
+	c.tr.Printf("Queued from %s to %s - %s", c.mailFrom, c.rcptTo, msgID)
 
 	// It is very important that we reset the envelope before returning,
 	// so clients can send other emails right away without needing to RSET.
@@ -779,7 +787,7 @@ func (c *Conn) addReceivedHeader() {
 	}
 }
 
-func (c *Conn) STARTTLS(params string, tr *trace.Trace) (code int, msg string) {
+func (c *Conn) STARTTLS(params string) (code int, msg string) {
 	if c.onTLS {
 		return 503, "You are already wearing that!"
 	}
@@ -789,7 +797,7 @@ func (c *Conn) STARTTLS(params string, tr *trace.Trace) (code int, msg string) {
 		return 554, fmt.Sprintf("error writing STARTTLS response: %v", err)
 	}
 
-	tr.LazyPrintf("<- 220  You experience a strange sense of peace")
+	c.tr.Debugf("<- 220  You experience a strange sense of peace")
 
 	server := tls.Server(c.netconn, c.tlsConfig)
 	err = server.Handshake()
@@ -797,7 +805,7 @@ func (c *Conn) STARTTLS(params string, tr *trace.Trace) (code int, msg string) {
 		return 554, fmt.Sprintf("error in TLS handshake: %v", err)
 	}
 
-	tr.LazyPrintf("<> ...  jump to TLS was successful")
+	c.tr.Debugf("<> ...  jump to TLS was successful")
 
 	// Override the connections. We don't need the older ones anymore.
 	c.netconn = server
@@ -823,7 +831,7 @@ func (c *Conn) STARTTLS(params string, tr *trace.Trace) (code int, msg string) {
 	return 0, ""
 }
 
-func (c *Conn) AUTH(params string, tr *trace.Trace) (code int, msg string) {
+func (c *Conn) AUTH(params string) (code int, msg string) {
 	if !c.onTLS {
 		return 503, "You feel vulnerable"
 	}
@@ -966,3 +974,69 @@ func writeResponse(w io.Writer, code int, msg string) error {
 
 	return nil
 }
+
+func launchMonitoringServer(addr string) {
+	glog.Infof("Monitoring HTTP server listening on %s", addr)
+
+	http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
+		if r.URL.Path != "/" {
+			http.NotFound(w, r)
+			return
+		}
+		w.Write([]byte(monitoringHTMLIndex))
+	})
+
+	flags := dumpFlags()
+	http.HandleFunc("/debug/flags", func(w http.ResponseWriter, r *http.Request) {
+		w.Write([]byte(flags))
+	})
+
+	go http.ListenAndServe(addr, nil)
+}
+
+// Static index for the monitoring website.
+const monitoringHTMLIndex = `<!DOCTYPE html>
+<html>
+  <head>
+    <title>chasquid monitoring</title>
+  </head>
+  <body>
+    <h1>chasquid monitoring</h1>
+    <ul>
+      <li><a href="/debug/queue">queue</a>
+      <li><a href="/debug/requests">requests</a>
+          <small><a href="https://godoc.org/golang.org/x/net/trace">
+            (ref)</a></small>
+      <li><a href="/debug/flags">flags</a>
+      <li><a href="/debug/vars">public variables</a>
+      <li><a href="/debug/pprof">pprof</a>
+          <small><a href="https://golang.org/pkg/net/http/pprof/">
+            (ref)</a></small>
+        <ul>
+          <li><a href="/debug/pprof/goroutine?debug=1">goroutines</a>
+        </ul>
+    </ul>
+  </body>
+</html>
+`
+
+// dumpFlags to a string, for troubleshooting purposes.
+func dumpFlags() string {
+	s := ""
+	visited := make(map[string]bool)
+
+	// Print set flags first, then the rest.
+	flag.Visit(func(f *flag.Flag) {
+		s += fmt.Sprintf("-%s=%s\n", f.Name, f.Value.String())
+		visited[f.Name] = true
+	})
+
+	s += "\n"
+	flag.VisitAll(func(f *flag.Flag) {
+		if !visited[f.Name] {
+			s += fmt.Sprintf("-%s=%s\n", f.Name, f.Value.String())
+		}
+	})
+
+	return s
+}
diff --git a/internal/courier/procmail.go b/internal/courier/procmail.go
index c043165..a0d9188 100644
--- a/internal/courier/procmail.go
+++ b/internal/courier/procmail.go
@@ -26,14 +26,14 @@ type Procmail struct {
 }
 
 func (p *Procmail) Deliver(from string, to string, data []byte) (error, bool) {
-	tr := trace.New("Procmail", "Deliver")
+	tr := trace.New("Procmail.Courier", to)
 	defer tr.Finish()
 
 	// Sanitize, just in case.
 	from = sanitizeForProcmail(from)
 	to = sanitizeForProcmail(to)
 
-	tr.LazyPrintf("%s -> %s", from, to)
+	tr.Debugf("%s -> %s", from, to)
 
 	// Prepare the command, replacing the necessary arguments.
 	replacer := strings.NewReplacer(
@@ -97,6 +97,7 @@ func (p *Procmail) Deliver(from string, to string, data []byte) (error, bool) {
 		return err, permanent
 	}
 
+	tr.Debugf("delivered")
 	return nil, false
 }
 
diff --git a/internal/courier/smtp.go b/internal/courier/smtp.go
index 43361c6..71fcb11 100644
--- a/internal/courier/smtp.go
+++ b/internal/courier/smtp.go
@@ -34,9 +34,9 @@ type SMTP struct {
 }
 
 func (s *SMTP) Deliver(from string, to string, data []byte) (error, bool) {
-	tr := trace.New("SMTP", "Deliver")
+	tr := trace.New("SMTP.Courier", to)
 	defer tr.Finish()
-	tr.LazyPrintf("%s  ->  %s", from, to)
+	tr.Debugf("%s  ->  %s", from, to)
 
 	mx, err := lookupMX(envelope.DomainOf(to))
 	if err != nil {
@@ -46,7 +46,7 @@ func (s *SMTP) Deliver(from string, to string, data []byte) (error, bool) {
 		// have to make sure we try hard enough on the lookup above.
 		return tr.Errorf("Could not find mail server: %v", err), true
 	}
-	tr.LazyPrintf("MX: %s", mx)
+	tr.Debugf("MX: %s", mx)
 
 	// Do we use insecure TLS?
 	// Set as fallback when retrying.
@@ -94,17 +94,17 @@ retry:
 			}
 
 			insecure = true
-			tr.LazyPrintf("TLS error, retrying insecurely")
+			tr.Debugf("TLS error, retrying insecurely")
 			goto retry
 		}
 
 		if config.InsecureSkipVerify {
-			tr.LazyPrintf("Insecure - self-signed certificate")
+			tr.Debugf("Insecure - using TLS, but cert does not match %s", mx)
 		} else {
-			tr.LazyPrintf("Secure - using TLS")
+			tr.Debugf("Secure - using TLS")
 		}
 	} else {
-		tr.LazyPrintf("Insecure - not using TLS")
+		tr.Debugf("Insecure - NOT using TLS")
 	}
 
 	// c.Mail will add the <> for us when the address is empty.
@@ -130,6 +130,7 @@ retry:
 	}
 
 	c.Quit()
+	tr.Debugf("done")
 
 	return nil, false
 }
diff --git a/internal/queue/queue.go b/internal/queue/queue.go
index 1e531c6..363ff78 100644
--- a/internal/queue/queue.go
+++ b/internal/queue/queue.go
@@ -24,12 +24,12 @@ import (
 	"blitiri.com.ar/go/chasquid/internal/envelope"
 	"blitiri.com.ar/go/chasquid/internal/protoio"
 	"blitiri.com.ar/go/chasquid/internal/set"
+	"blitiri.com.ar/go/chasquid/internal/trace"
 
 	"github.com/golang/glog"
 	"github.com/golang/protobuf/ptypes"
 	"github.com/golang/protobuf/ptypes/timestamp"
 	"golang.org/x/net/idna"
-	"golang.org/x/net/trace"
 )
 
 const (
@@ -198,8 +198,6 @@ func (q *Queue) Put(hostname, from string, to []string, data []byte) (string, er
 	q.q[item.ID] = item
 	q.mu.Unlock()
 
-	glog.Infof("%s accepted from %q  to  %v", item.ID, from, to)
-
 	// Begin to send it right away.
 	go item.SendLoop(q)
 
@@ -219,8 +217,32 @@ func (q *Queue) Remove(id string) {
 	q.mu.Unlock()
 }
 
-// TODO: http handler for dumping the queue.
-// Register it in main().
+// DumpString returns a human-readable string with the current queue.
+// Useful for debugging purposes.
+func (q *Queue) DumpString() string {
+	q.mu.RLock()
+	defer q.mu.RUnlock()
+	s := fmt.Sprintf("# Queue status\n\n")
+	s += fmt.Sprintf("date: %v\n", time.Now())
+	s += fmt.Sprintf("length: %d\n\n", len(q.q))
+
+	for id, item := range q.q {
+		s += fmt.Sprintf("## Item %s\n", id)
+		item.Lock()
+		s += fmt.Sprintf("created at: %s\n", item.CreatedAt)
+		s += fmt.Sprintf("from: %s\n", item.From)
+		s += fmt.Sprintf("to: %s\n", item.To)
+		for _, rcpt := range item.Rcpt {
+			s += fmt.Sprintf("%s %s (%s)\n", rcpt.Status, rcpt.Address, rcpt.Type)
+			s += fmt.Sprintf("  original address: %s\n", rcpt.OriginalAddress)
+			s += fmt.Sprintf("  last failure: %q\n", rcpt.LastFailureMessage)
+		}
+		item.Unlock()
+		s += fmt.Sprintf("\n")
+	}
+
+	return s
+}
 
 // An item in the queue.
 type Item struct {
@@ -263,9 +285,9 @@ func (item *Item) WriteTo(dir string) error {
 }
 
 func (item *Item) SendLoop(q *Queue) {
-	tr := trace.New("Queue", item.ID)
+	tr := trace.New("Queue.SendLoop", item.ID)
 	defer tr.Finish()
-	tr.LazyPrintf("from: %s", item.From)
+	tr.Printf("from %s", item.From)
 
 	var delay time.Duration
 	for time.Since(item.CreatedAt) < giveUpAfter {
@@ -284,23 +306,19 @@ func (item *Item) SendLoop(q *Queue) {
 			go func(rcpt *Recipient, oldStatus Recipient_Status) {
 				defer wg.Done()
 				to := rcpt.Address
-				tr.LazyPrintf("%s sending", to)
+				tr.Debugf("%s sending", to)
 
 				err, permanent := item.deliver(q, rcpt)
 
 				if err != nil {
 					if permanent {
-						tr.LazyPrintf("permanent error: %v", err)
-						glog.Infof("%s  -> %q permanent fail: %v", item.ID, to, err)
+						tr.Errorf("%s permanent error: %v", to, err)
 						status = Recipient_FAILED
 					} else {
-						tr.LazyPrintf("error: %v", err)
-						glog.Infof("%s  -> %q fail: %v", item.ID, to, err)
+						tr.Printf("%s temporary error: %v", to, err)
 					}
 				} else {
-					tr.LazyPrintf("%s successful", to)
-					glog.Infof("%s  -> %q sent", item.ID, to)
-
+					tr.Printf("%s sent", to)
 					status = Recipient_SENT
 				}
 
@@ -315,8 +333,7 @@ func (item *Item) SendLoop(q *Queue) {
 
 					err = item.WriteTo(q.path)
 					if err != nil {
-						tr.LazyPrintf("failed to write: %v", err)
-						glog.Errorf("%s failed to write: %v", item.ID, err)
+						tr.Errorf("failed to write: %v", err)
 					}
 				}
 			}(rcpt, status)
@@ -338,14 +355,11 @@ func (item *Item) SendLoop(q *Queue) {
 		// that some of the messages have been delayed.
 
 		delay = nextDelay(delay)
-		tr.LazyPrintf("waiting for %v", delay)
-		glog.Infof("%s waiting for %v", item.ID, delay)
+		tr.Printf("waiting for %v", delay)
 		time.Sleep(delay)
 	}
 
 	// Completed to all recipients (some may not have succeeded).
-	tr.LazyPrintf("all done")
-	glog.Infof("%s all done", item.ID)
 
 	failed := 0
 	for _, rcpt := range item.Rcpt {
@@ -358,26 +372,28 @@ func (item *Item) SendLoop(q *Queue) {
 		sendDSN(tr, q, item)
 	}
 
+	tr.Printf("all done")
 	q.Remove(item.ID)
 
 	return
 }
 
-func sendDSN(tr trace.Trace, q *Queue, item *Item) {
-	tr.LazyPrintf("sending DSN")
+func sendDSN(tr *trace.Trace, q *Queue, item *Item) {
+	tr.Debugf("sending DSN")
 
 	msg, err := deliveryStatusNotification(item)
 	if err != nil {
-		tr.LazyPrintf("failed to build DSN: %v", err)
-		glog.Infof("%s: failed to build DSN: %v", item.ID, err)
+		tr.Errorf("failed to build DSN: %v", err)
 		return
 	}
 
-	_, err = q.Put(item.Hostname, "<>", []string{item.From}, msg)
+	id, err := q.Put(item.Hostname, "<>", []string{item.From}, msg)
 	if err != nil {
-		tr.LazyPrintf("failed to queue DSN: %v", err)
-		glog.Infof("%s: failed to queue DSN: %v", item.ID, err)
+		tr.Errorf("failed to queue DSN: %v", err)
+		return
 	}
+
+	tr.Printf("queued DSN: %s", id)
 }
 
 // deliver the item to the given recipient, using the couriers from the queue.
diff --git a/internal/trace/trace.go b/internal/trace/trace.go
index 4314079..fa4130f 100644
--- a/internal/trace/trace.go
+++ b/internal/trace/trace.go
@@ -3,6 +3,7 @@ package trace
 
 import (
 	"fmt"
+	"strconv"
 
 	"github.com/golang/glog"
 	nettrace "golang.org/x/net/trace"
@@ -18,16 +19,30 @@ func New(family, title string) *Trace {
 	return &Trace{family, title, nettrace.New(family, title)}
 }
 
-func (t *Trace) LazyPrintf(format string, a ...interface{}) {
+func (t *Trace) Printf(format string, a ...interface{}) {
 	t.t.LazyPrintf(format, a...)
 
+	if glog.V(0) {
+		msg := fmt.Sprintf("%s %s: %s", t.family, t.title,
+			quote(fmt.Sprintf(format, a...)))
+		glog.InfoDepth(2, msg)
+	}
+}
+
+func (t *Trace) Debugf(format string, a ...interface{}) {
 	if glog.V(2) {
-		msg := fmt.Sprintf("%p %s %s: %+q", t, t.family, t.title,
-			fmt.Sprintf(format, a...))
-		glog.InfoDepth(1, msg)
+		t.t.LazyPrintf(format, a...)
+		msg := fmt.Sprintf("%s %s: %s", t.family, t.title,
+			quote(fmt.Sprintf(format, a...)))
+		glog.InfoDepth(2, msg)
 	}
 }
 
+func quote(s string) string {
+	qs := strconv.Quote(s)
+	return qs[1 : len(qs)-1]
+}
+
 func (t *Trace) SetError() {
 	t.t.SetError()
 }
@@ -37,8 +52,8 @@ func (t *Trace) Errorf(format string, a ...interface{}) error {
 	t.t.SetError()
 	t.t.LazyPrintf("error: %v", err)
 
-	if glog.V(2) {
-		msg := fmt.Sprintf("%p %s %s: error: %v", t, t.family, t.title, err)
+	if glog.V(0) {
+		msg := fmt.Sprintf("%s %s: error: %v", t.family, t.title, err)
 		glog.InfoDepth(1, msg)
 	}
 	return err
@@ -48,8 +63,8 @@ func (t *Trace) Error(err error) error {
 	t.t.SetError()
 	t.t.LazyPrintf("error: %v", err)
 
-	if glog.V(2) {
-		msg := fmt.Sprintf("%p %s %s: error: %v", t, t.family, t.title, err)
+	if glog.V(0) {
+		msg := fmt.Sprintf("%s %s: error: %v", t, t.family, t.title, err)
 		glog.InfoDepth(1, msg)
 	}
 	return err