git » chasquid » commit b855172

smtp: Try all entries in MX, not just the first one

author Alberto Bertogli
2017-02-26 03:26:07 UTC
committer Alberto Bertogli
2017-02-28 22:27:15 UTC
parent c2ea8a8ef002163e42a211fed9f42f92c56ea8d7

smtp: Try all entries in MX, not just the first one

Currently, we pick the first host in the MX list, and attempt delivery
there. If it fails, we just report the failure to the queue, which will
wait for some time and then try again.

This is not ideal: we should fall back to the other MXs in the list, as
the first host could be having issues for a long time, and not
attempting with the rest just delays delivery.

This patch implements the fallback, so we try all MXs before deciding to
report a failed delivery (unless, of course, an MX returned a permanent
failure).

internal/courier/smtp.go +107 -57
internal/courier/smtp_test.go +4 -2

diff --git a/internal/courier/smtp.go b/internal/courier/smtp.go
index 876cd1f..159eee4 100644
--- a/internal/courier/smtp.go
+++ b/internal/courier/smtp.go
@@ -27,7 +27,7 @@ var (
 		"port to use for outgoing SMTP connections, ONLY FOR TESTING")
 
 	// Fake MX records, used for testing only.
-	fakeMX = map[string]string{}
+	fakeMX = map[string][]string{}
 )
 
 // Exported variables.
@@ -42,31 +42,72 @@ type SMTP struct {
 }
 
 func (s *SMTP) Deliver(from string, to string, data []byte) (error, bool) {
-	tr := trace.New("Courier.SMTP", to)
-	defer tr.Finish()
-	tr.Debugf("%s  ->  %s", from, to)
+	a := &attempt{
+		courier: s,
+		from:    from,
+		to:      to,
+		data:    data,
+		tr:      trace.New("Courier.SMTP", to),
+	}
+	defer a.tr.Finish()
+	a.tr.Debugf("%s  ->  %s", from, to)
+
+	// smtp.Client.Mail will add the <> for us when the address is empty.
+	if a.from == "<>" {
+		a.from = ""
+	}
 
 	toDomain := envelope.DomainOf(to)
-	mx, err := lookupMX(tr, toDomain)
+	mxs, err := lookupMXs(a.tr, toDomain)
 	if err != nil {
 		// Note this is considered a permanent error.
 		// This is in line with what other servers (Exim) do. However, the
 		// downside is that temporary DNS issues can affect delivery, so we
 		// have to make sure we try hard enough on the lookup above.
-		return tr.Errorf("Could not find mail server: %v", err), true
+		return a.tr.Errorf("Could not find mail server: %v", err), true
 	}
 
 	// Issue an EHLO with a valid domain; otherwise, some servers like postfix
 	// will complain.
-	helloDomain, err := idna.ToASCII(envelope.DomainOf(from))
+	a.helloDomain, err = idna.ToASCII(envelope.DomainOf(from))
 	if err != nil {
-		return tr.Errorf("Sender domain not IDNA compliant: %v", err), true
+		return a.tr.Errorf("Sender domain not IDNA compliant: %v", err), true
 	}
-	if helloDomain == "" {
+	if a.helloDomain == "" {
 		// This can happen when sending bounces. Last resort.
-		helloDomain, _ = os.Hostname()
+		a.helloDomain, _ = os.Hostname()
+	}
+
+	for _, mx := range mxs {
+		var permanent bool
+		err, permanent = a.deliver(mx)
+		if err == nil {
+			return nil, false
+		}
+		if permanent {
+			return err, true
+		}
+		a.tr.Errorf("%q returned transient error: %v", mx, err)
 	}
 
+	// We exhausted all MXs failed to deliver, try again later.
+	return a.tr.Errorf("all MXs returned transient failures (last: %v)", err), false
+}
+
+type attempt struct {
+	courier *SMTP
+
+	from string
+	to   string
+	data []byte
+
+	toDomain    string
+	helloDomain string
+
+	tr *trace.Trace
+}
+
+func (a *attempt) deliver(mx string) (error, bool) {
 	// Do we use insecure TLS?
 	// Set as fallback when retrying.
 	insecure := false
@@ -75,18 +116,18 @@ func (s *SMTP) Deliver(from string, to string, data []byte) (error, bool) {
 retry:
 	conn, err := net.DialTimeout("tcp", mx+":"+*smtpPort, smtpDialTimeout)
 	if err != nil {
-		return tr.Errorf("Could not dial: %v", err), false
+		return a.tr.Errorf("Could not dial: %v", err), false
 	}
 	defer conn.Close()
 	conn.SetDeadline(time.Now().Add(smtpTotalTimeout))
 
 	c, err := smtp.NewClient(conn, mx)
 	if err != nil {
-		return tr.Errorf("Error creating client: %v", err), false
+		return a.tr.Errorf("Error creating client: %v", err), false
 	}
 
-	if err = c.Hello(helloDomain); err != nil {
-		return tr.Errorf("Error saying hello: %v", err), false
+	if err = c.Hello(a.helloDomain); err != nil {
+		return a.tr.Errorf("Error saying hello: %v", err), false
 	}
 
 	if ok, _ := c.Extension("STARTTLS"); ok {
@@ -100,101 +141,110 @@ retry:
 			// fail verification we just try again without validating.
 			if insecure {
 				tlsCount.Add("tls:failed", 1)
-				return tr.Errorf("TLS error: %v", err), false
+				return a.tr.Errorf("TLS error: %v", err), false
 			}
 
 			insecure = true
-			tr.Debugf("TLS error, retrying insecurely")
+			a.tr.Debugf("TLS error, retrying insecurely")
 			goto retry
 		}
 
 		if config.InsecureSkipVerify {
-			tr.Debugf("Insecure - using TLS, but cert does not match %s", mx)
+			a.tr.Debugf("Insecure - using TLS, but cert does not match %s", mx)
 			tlsCount.Add("tls:insecure", 1)
 			secLevel = domaininfo.SecLevel_TLS_INSECURE
 		} else {
 			tlsCount.Add("tls:secure", 1)
-			tr.Debugf("Secure - using TLS")
+			a.tr.Debugf("Secure - using TLS")
 			secLevel = domaininfo.SecLevel_TLS_SECURE
 		}
 	} else {
 		tlsCount.Add("plain", 1)
-		tr.Debugf("Insecure - NOT using TLS")
+		a.tr.Debugf("Insecure - NOT using TLS")
 	}
 
-	if toDomain != "" && !s.Dinfo.OutgoingSecLevel(toDomain, secLevel) {
+	if a.toDomain != "" && !a.courier.Dinfo.OutgoingSecLevel(a.toDomain, secLevel) {
 		// We consider the failure transient, so transient misconfigurations
 		// do not affect deliveries.
 		slcResults.Add("fail", 1)
-		return tr.Errorf("Security level check failed (level:%s)", secLevel), false
+		return a.tr.Errorf("Security level check failed (level:%s)", secLevel), false
 	}
 	slcResults.Add("pass", 1)
 
-	// c.Mail will add the <> for us when the address is empty.
-	if from == "<>" {
-		from = ""
-	}
-	if err = c.MailAndRcpt(from, to); err != nil {
-		return tr.Errorf("MAIL+RCPT %v", err), smtp.IsPermanent(err)
+	if err = c.MailAndRcpt(a.from, a.to); err != nil {
+		return a.tr.Errorf("MAIL+RCPT %v", err), smtp.IsPermanent(err)
 	}
 
 	w, err := c.Data()
 	if err != nil {
-		return tr.Errorf("DATA %v", err), smtp.IsPermanent(err)
+		return a.tr.Errorf("DATA %v", err), smtp.IsPermanent(err)
 	}
-	_, err = w.Write(data)
+	_, err = w.Write(a.data)
 	if err != nil {
-		return tr.Errorf("DATA writing: %v", err), smtp.IsPermanent(err)
+		return a.tr.Errorf("DATA writing: %v", err), smtp.IsPermanent(err)
 	}
 
 	err = w.Close()
 	if err != nil {
-		return tr.Errorf("DATA closing %v", err), smtp.IsPermanent(err)
+		return a.tr.Errorf("DATA closing %v", err), smtp.IsPermanent(err)
 	}
 
 	c.Quit()
-	tr.Debugf("done")
+	a.tr.Debugf("done")
 
 	return nil, false
 }
 
-func lookupMX(tr *trace.Trace, domain string) (string, error) {
+func lookupMXs(tr *trace.Trace, domain string) ([]string, error) {
 	if v, ok := fakeMX[domain]; ok {
 		return v, nil
 	}
 
 	domain, err := idna.ToASCII(domain)
 	if err != nil {
-		return "", err
+		return nil, err
 	}
 
-	mxs, err := net.LookupMX(domain)
-	if err == nil {
-		if len(mxs) == 0 {
-			tr.Debugf("domain %q has no MX, falling back to A", domain)
-			return domain, nil
+	mxs := []string{}
+
+	mxRecords, err := net.LookupMX(domain)
+	if err != nil {
+		// There was an error. It could be that the domain has no MX, in which
+		// case we have to fall back to A, or a bigger problem.
+		// Unfortunately, go's API doesn't let us easily distinguish between
+		// them. For now, if the error is permanent, we assume it's because
+		// there was no MX and fall back, otherwise we return.
+		// TODO: Find a better way to do this.
+		dnsErr, ok := err.(*net.DNSError)
+		if !ok {
+			tr.Debugf("MX lookup error: %v", err)
+			return nil, err
+		} else if dnsErr.Temporary() {
+			tr.Debugf("temporary DNS error: %v", dnsErr)
+			return nil, err
 		}
 
-		tr.Debugf("MX %s", mxs[0].Host)
-		return mxs[0].Host, nil
+		// Permanent error, we assume MX does not exist and fall back to A.
+		tr.Debugf("failed to resolve MX for %s, falling back to A", domain)
+		mxs = []string{domain}
+	} else {
+		// Convert the DNS records to a plain string slice. They're already
+		// sorted by priority.
+		for _, r := range mxRecords {
+			mxs = append(mxs, r.Host)
+		}
 	}
 
-	// There was an error. It could be that the domain has no MX, in which
-	// case we have to fall back to A, or a bigger problem.
-	// Unfortunately, go's API doesn't let us easily distinguish between them.
-	// For now, if the error is permanent, we assume it's because there was no
-	// MX and fall back, otherwise we return.
-	// TODO: Find a better way to do this.
-	dnsErr, ok := err.(*net.DNSError)
-	if !ok {
-		tr.Debugf("MX lookup error: %v", err)
-		return "", err
-	} else if dnsErr.Temporary() {
-		tr.Debugf("temporary DNS error: %v", dnsErr)
-		return "", err
+	// Note that mxs could be empty; in that case we do NOT fall back to A.
+	// This case is explicitly covered by the SMTP RFC.
+	// https://tools.ietf.org/html/rfc5321#section-5.1
+
+	// Cap the list of MXs to 5 hosts, to keep delivery attempt times sane
+	// and prevent abuse.
+	if len(mxs) > 5 {
+		mxs = mxs[:5]
 	}
 
-	// Permanent error, we assume MX does not exist and fall back to A.
-	tr.Debugf("failed to resolve MX for %s, falling back to A", domain)
-	return domain, nil
+	tr.Debugf("MXs: %v", mxs)
+	return mxs, nil
 }
diff --git a/internal/courier/smtp_test.go b/internal/courier/smtp_test.go
index 1ba7ffd..703acc3 100644
--- a/internal/courier/smtp_test.go
+++ b/internal/courier/smtp_test.go
@@ -87,7 +87,9 @@ func TestSMTP(t *testing.T) {
 	addr := fakeServer(t, responses)
 	host, port, _ := net.SplitHostPort(addr)
 
-	fakeMX["to"] = host
+	// Put a non-existing host first, so we check that if the first host
+	// doesn't work, we try with the rest.
+	fakeMX["to"] = []string{"nonexistinghost", host}
 	*smtpPort = port
 
 	s, tmpDir := newSMTP(t)
@@ -148,7 +150,7 @@ func TestSMTPErrors(t *testing.T) {
 		addr := fakeServer(t, rs)
 		host, port, _ := net.SplitHostPort(addr)
 
-		fakeMX["to"] = host
+		fakeMX["to"] = []string{host}
 		*smtpPort = port
 
 		s, tmpDir := newSMTP(t)