Try to asynchronously connect to Fluentd before writing

Albin Kerouanton · Albin Kerouanton · commit f2bc148bb1a6 · 2020-04-22T04:01:14.000+02:00
PR fluent#77 introduced a new parameter named ForceStopAsyncSend. It can be used to tell the logger not to try to send all the log messages in its buffer before closing. Without this parameter, the logger hangs whenever the target Fluentd server is down. In addition, the logger currently initializes the connection lazily, when it receives its first log. This is a problem when the Fluentd server has never been available, because the connection initialization blocks the select that signals that the log channel should be drained.
diff --git a/fluent/fluent.go b/fluent/fluent.go
@@ -15,8 +15,9 @@ import (
 	"bytes"
 	"encoding/base64"
 	"encoding/binary"
-	"github.com/tinylib/msgp/msgp"
 	"math/rand"
+
+	"github.com/tinylib/msgp/msgp"
 )
 
 const (
@@ -340,6 +341,8 @@ func (f *Fluent) close(c net.Conn) {
 
 // connect establishes a new connection using the specified transport.
 func (f *Fluent) connect() (err error) {
+	f.muconn.Lock()
+	defer f.muconn.Unlock()
 
 	switch f.Config.FluentNetwork {
 	case "tcp":
@@ -355,6 +358,45 @@ func (f *Fluent) connect() (err error) {
 func (f *Fluent) run() {
 	drainEvents := false
 	var emitEventDrainMsg sync.Once
+
+	// First we need to wait for the connection to become ready to make sure
+	// it won't be initialized during the first for-select iteration. Otherwise
+	// this would block the select on f.pending without giving the select on
+	// f.stopRunning a chance to signal its end to this goroutine.
+	var wait <-chan time.Time
+	for i := 0; i < f.Config.MaxRetry; i++ {
+		select {
+		case stopRunning, ok := <-f.stopRunning:
+			if stopRunning || !ok {
+				drainEvents = true
+			}
+			break
+		case <-wait:
+		}
+
+		err := f.connect()
+		if err == nil {
+			break
+		}
+
+		if _, ok := err.(*ErrUnknownNetwork); ok {
+			// No need to retry on unknown network error. Thus ready channel
+			// is closed and received logs are discarded.
+			drainEvents = true
+			break
+		}
+
+		waitTime := f.Config.RetryWait * e(defaultReconnectWaitIncreRate, float64(i-1))
+		if waitTime > f.Config.MaxRetryWait {
+			waitTime = f.Config.MaxRetryWait
+		}
+
+		wait = time.After(time.Duration(waitTime) * time.Millisecond)
+	}
+
+	// At this point we can go ahead: even if a message is sent to
+	// f.stopRunning right after the connection becomes ready, the following
+	// for-select loop will get it and proceed accordingly.
 	for {
 		select {
 		case entry, ok := <-f.pending:
@@ -389,31 +431,6 @@ func (f *Fluent) write(msg *msgToSend) error {
 	var c net.Conn
 	for i := 0; i < f.Config.MaxRetry; i++ {
 		c = f.conn
-		// Connect if needed
-		if c == nil {
-			f.muconn.Lock()
-			if f.conn == nil {
-				err := f.connect()
-				if err != nil {
-					f.muconn.Unlock()
-
-					if _, ok := err.(*ErrUnknownNetwork); ok {
-						// do not retry on unknown network error
-						break
-					}
-					waitTime := f.Config.RetryWait * e(defaultReconnectWaitIncreRate, float64(i-1))
-					if waitTime > f.Config.MaxRetryWait {
-						waitTime = f.Config.MaxRetryWait
-					}
-					time.Sleep(time.Duration(waitTime) * time.Millisecond)
-					continue
-				}
-			}
-			c = f.conn
-			f.muconn.Unlock()
-		}
-
-		// We're connected, write msg
 		t := f.Config.WriteTimeout
 		if time.Duration(0) < t {
 			c.SetWriteDeadline(time.Now().Add(t))