Simplify error handling

Rely on Serf for liveness. In the event of a failure, simply cycle the failed server to the end of the list (a minimal sketch of this cycling follows the list below). If the server is unhealthy, Serf will reap the dead server.

Additional simplifications:

*) Only rebalance servers based on timers, not when a new server is re-added to the cluster.
*) Back out the failure count in server_details.ServerDetails
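
A minimal sketch of the cycling described above, using a plain string slice in place of the []*server_details.ServerDetails from the diff below:

```go
package main

import "fmt"

// cycleServer dequeues the head server and re-enqueues it at the end,
// mirroring serverConfig.cycleServer in the diff.
func cycleServer(servers []string) []string {
	if len(servers) < 2 {
		return servers // nothing to rotate
	}
	newServers := make([]string, 0, len(servers))
	newServers = append(newServers, servers[1:]...) // dequeue the head
	newServers = append(newServers, servers[0])     // re-enqueue it at the end
	return newServers
}

func main() {
	fmt.Println(cycleServer([]string{"s1", "s2", "s3"})) // [s2 s3 s1]
}
```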
pull/1873/head
Sean Chittenden 9 years ago
parent c2c73bfeab
commit fdbb142c3f

@@ -121,7 +121,7 @@ func NewClient(config *Config) (*Client, error) {
c.serverMgr = server_manager.NewServerManager(c.logger, c.shutdownCh, c.serf)
// Start consulServers maintenance
go c.serverMgr.StartServerManager()
go c.serverMgr.Start()
// Start the Serf listeners to prevent a deadlock
go c.lanEventHandler()

@@ -17,11 +17,6 @@ type ServerDetails struct {
Expect int
Version int
Addr net.Addr
// Disabled is a uint64 in order to support atomic integer
// operations. Zero means enabled, non-zero is the number of times
// this server has failed without being marked healthy.
Disabled uint64
}
func (s *ServerDetails) String() string {
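
For context on the field being backed out: Disabled was declared uint64 so the failure count could be read and bumped via sync/atomic without holding a lock. A standalone sketch of that pattern (serverState is a hypothetical stand-in for ServerDetails):

```go
package main

import (
	"fmt"
	"sync/atomic"
)

type serverState struct {
	Disabled uint64 // zero means enabled; non-zero counts failures
}

func main() {
	var s serverState
	atomic.AddUint64(&s.Disabled, 1)            // record one RPC failure
	fmt.Println(atomic.LoadUint64(&s.Disabled)) // lock-free read: 1
	atomic.StoreUint64(&s.Disabled, 0)          // mark healthy again
}
```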
@@ -37,11 +32,6 @@ func IsConsulServer(m serf.Member) (bool, *ServerDetails) {
datacenter := m.Tags["dc"]
_, bootstrap := m.Tags["bootstrap"]
var disabled uint64 = 0
_, disabledStr := m.Tags["disabled"]
if disabledStr {
disabled = 1
}
expect := 0
expect_str, ok := m.Tags["expect"]
@@ -75,7 +65,6 @@ func IsConsulServer(m serf.Member) (bool, *ServerDetails) {
Expect: expect,
Addr: addr,
Version: vsn,
Disabled: disabled,
}
return true, parts
}
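
A sketch of the tag-parsing pattern IsConsulServer relies on, with a map literal standing in for serf.Member.Tags (the real code also checks the strconv.Atoi error):

```go
package main

import (
	"fmt"
	"strconv"
)

func main() {
	tags := map[string]string{"dc": "dc1", "expect": "3"}

	datacenter := tags["dc"]
	_, bootstrap := tags["bootstrap"] // key presence alone is the flag

	expect := 0
	if expectStr, ok := tags["expect"]; ok {
		expect, _ = strconv.Atoi(expectStr) // real code handles this error
	}

	fmt.Println(datacenter, bootstrap, expect) // dc1 false 3
}
```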

@@ -29,9 +29,6 @@ func TestIsConsulServer(t *testing.T) {
if parts.Bootstrap {
t.Fatalf("unexpected bootstrap")
}
if parts.Disabled > 0 {
t.Fatalf("unexpected disabled")
}
if parts.Expect != 0 {
t.Fatalf("bad: %v", parts.Expect)
}
@@ -44,9 +41,6 @@ func TestIsConsulServer(t *testing.T) {
if !parts.Bootstrap {
t.Fatalf("expected bootstrap")
}
if parts.Disabled == 0 {
t.Fatalf("expected disabled")
}
if parts.Addr.String() != "127.0.0.1:10000" {
t.Fatalf("bad addr: %v", parts.Addr)
}

@@ -14,27 +14,6 @@ import (
type consulServerEventTypes int
const (
// consulServersNodeJoin is used to notify of a new consulServer.
// The primary effect of this is a reshuffling of consulServers and
// finding a new preferredServer.
consulServersNodeJoin = iota
// consulServersRebalance is used to signal we should rebalance our
// connection load across servers
consulServersRebalance
// consulServersRefreshRebalanceDuration is used to signal when we
// should reset the rebalance duration because the server list has
// changed and we don't need to proactively change our connection
consulServersRefreshRebalanceDuration
// consulServersRPCError is used to signal when a server has either
// timed out or returned an error and we would like to have the
// server manager find a new preferredServer.
consulServersRPCError
)
const (
// clientRPCJitterFraction determines the amount of jitter added to
// clientRPCMinReuseDuration before a connection is expired and a new
@@ -149,34 +128,9 @@ func (sm *ServerManager) AddServer(server *server_details.ServerDetails) {
copy(newServers, serverCfg.servers)
newServers = append(newServers, server)
serverCfg.servers = newServers
// Notify the server maintenance task of a new server
sm.consulServersCh <- consulServersNodeJoin
}
sm.saveServerConfig(serverCfg)
}
// CycleFailedServers takes out an internal write lock and dequeues all
// failed servers and re-enqueues them. This method does not reshuffle the
// server list; instead it requests that the rebalance duration be
// refreshed/reset further into the future.
func (sm *ServerManager) CycleFailedServers() {
sm.serverConfigLock.Lock()
defer sm.serverConfigLock.Unlock()
serverCfg := sm.getServerConfig()
for i := range serverCfg.servers {
failCount := atomic.LoadUint64(&(serverCfg.servers[i].Disabled))
if failCount == 0 {
break
} else if failCount > 0 {
serverCfg.servers = serverCfg.cycleServer()
}
}
sm.saveServerConfig(serverCfg)
sm.requestRefreshRebalanceDuration()
}
// cycleServers returns a new list of servers that has dequeued the first
@@ -197,27 +151,16 @@ func (sc *serverConfig) cycleServer() (servers []*server_details.ServerDetails)
// FindHealthyServer takes out an internal "read lock" and searches through
// the list of servers to find a healthy server.
func (sm *ServerManager) FindHealthyServer() (server *server_details.ServerDetails) {
func (sm *ServerManager) FindHealthyServer() *server_details.ServerDetails {
serverCfg := sm.getServerConfig()
numServers := len(serverCfg.servers)
if numServers == 0 {
sm.logger.Printf("[ERR] consul: No servers found in the server config")
return nil
} else {
// Return whatever is at the front of the list
return serverCfg.servers[0]
}
// Find the first non-failing server in the server list. If this is
// not the first server, a prior RPC call marked the first server as
// failed and we're waiting for the server management task to reorder
// a working server to the front of the list.
for i := range serverCfg.servers {
failCount := atomic.LoadUint64(&(serverCfg.servers[i].Disabled))
if failCount == 0 {
server = serverCfg.servers[i]
break
}
}
return server
}
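
The simplified FindHealthyServer pairs with NotifyFailedServer below. A hedged sketch of the intended call pattern, where doRPC, rpcCall, and the local types are hypothetical stand-ins for the client's RPC plumbing:

```go
package clientrpc

import "errors"

type server struct{ addr string }

type serverManager interface {
	FindHealthyServer() *server
	NotifyFailedServer(*server)
}

// doRPC grabs the head of the server list, tries the call, and on failure
// reports the server so the manager cycles it to the back of the list.
func doRPC(mgr serverManager, rpcCall func(addr string) error) error {
	s := mgr.FindHealthyServer() // whatever is at the front of the list
	if s == nil {
		return errors.New("no known consul servers")
	}
	if err := rpcCall(s.addr); err != nil {
		// The next doRPC call will see a different head server.
		mgr.NotifyFailedServer(s)
		return err
	}
	return nil
}
```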
// GetNumServers takes out an internal "read lock" and returns the number of
@@ -254,13 +197,25 @@ func NewServerManager(logger *log.Logger, shutdownCh chan struct{}, serf *serf.S
// NotifyFailedServer is an exported convenience function that allows callers
// to pass in a server that has failed an RPC request and mark it as failed.
// This will initiate a background task that will optimize the failed server
// to the end of the server list. No locks are required here because we are
// bypassing the serverConfig and sending a message to ServerManager's
// channel.
// If the server being failed is not the first server on the list, this is a
// noop. If, however, the server is failed and first on the list, acquire
// the lock, retest, and take the penalty of moving the server to the end of
// the list.
func (sm *ServerManager) NotifyFailedServer(server *server_details.ServerDetails) {
atomic.AddUint64(&server.Disabled, 1)
sm.consulServersCh <- consulServersRPCError
serverCfg := sm.getServerConfig()
if len(serverCfg.servers) > 0 && serverCfg.servers[0] == server {
// Grab a lock, retest, and take the hit of cycling the first
// server to the end.
sm.serverConfigLock.Lock() // FIXME(sean@): wtb TryLock
defer sm.serverConfigLock.Unlock()
serverCfg = sm.getServerConfig()
if len(serverCfg.servers) > 0 && serverCfg.servers[0] == server {
serverCfg.cycleServer()
sm.saveServerConfig(serverCfg)
}
}
}
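
The retest after taking the lock is double-checked locking: the lock-free read of the atomically stored config filters out the common case (the failed server is not the head), and the check is repeated under the lock because another goroutine may have cycled the list in between. A self-contained sketch of the pattern with hypothetical names; like the real code, the lock-free read goes through an atomically stored copy-on-write snapshot:

```go
package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

type manager struct {
	mu       sync.Mutex   // serializes writers
	snapshot atomic.Value // holds []string, replaced wholesale on update
}

func (m *manager) demoteIfFirst(target string) {
	servers, _ := m.snapshot.Load().([]string)
	// Cheap check outside the lock: most failed servers are not the head.
	if len(servers) == 0 || servers[0] != target {
		return
	}
	m.mu.Lock()
	defer m.mu.Unlock()
	// Recheck under the lock: another goroutine may have already cycled.
	servers, _ = m.snapshot.Load().([]string)
	if len(servers) > 0 && servers[0] == target {
		rotated := append(append([]string{}, servers[1:]...), servers[0])
		m.snapshot.Store(rotated) // publish a fresh copy
	}
}

func main() {
	m := &manager{}
	m.snapshot.Store([]string{"s1", "s2", "s3"})
	m.demoteIfFirst("s1")
	fmt.Println(m.snapshot.Load()) // [s2 s3 s1]
}
```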
// RebalanceServers takes out an internal write lock and shuffles the list of
@@ -287,7 +242,6 @@ func (sm *ServerManager) RebalanceServers() {
serverCfg.servers = newServers
sm.saveServerConfig(serverCfg)
sm.requestRefreshRebalanceDuration()
}
// RemoveServer takes out an internal write lock and removes a server from
@@ -324,12 +278,6 @@ func (sm *ServerManager) requestRefreshRebalanceDuration() {
sm.refreshRebalanceDurationCh <- true
}
// requestServerRebalance sends a message which causes a background thread
// to reshuffle the list of servers
func (sm *ServerManager) requestServerRebalance() {
sm.consulServersCh <- consulServersRebalance
}
// refreshServerRebalanceTimer is called
func (sm *ServerManager) refreshServerRebalanceTimer(timer *time.Timer) {
serverCfg := sm.getServerConfig()
@@ -359,11 +307,10 @@ func (sm *ServerManager) saveServerConfig(sc serverConfig) {
sm.serverConfigValue.Store(sc)
}
// StartServerManager is used to start and manage the task of automatically
// shuffling and rebalancing the list of consul servers. This maintenance
// happens either when a new server is added or when a duration has been
// exceeded.
func (sm *ServerManager) StartServerManager() {
// Start is used to start and manage the task of automatically shuffling and
// rebalancing the list of consul servers. This maintenance happens either
// when a new server is added or when a duration has been exceeded.
func (sm *ServerManager) Start() {
var rebalanceTimer *time.Timer = time.NewTimer(time.Duration(initialRebalanceTimeoutHours * time.Hour))
var rebalanceTaskDispatched int32
@@ -382,51 +329,22 @@ func (sm *ServerManager) StartServerManager() {
for {
select {
case e := <-sm.consulServersCh:
switch e {
case consulServersNodeJoin:
sm.logger.Printf("[INFO] server manager: new node joined cluster")
// rebalance on new server
sm.requestServerRebalance()
case consulServersRebalance:
sm.logger.Printf("[INFO] server manager: rebalancing servers by request")
sm.RebalanceServers()
case consulServersRPCError:
sm.logger.Printf("[INFO] server manager: need to find a new server to talk with")
sm.CycleFailedServers()
// FIXME(sean@): wtb preemptive Status.Ping
// of servers, ideally parallel fan-out of N
// nodes, then settle on the first node which
// responds successfully.
//
// Is there a distinction between slow and
// offline? Do we run the Status.Ping with a
// fixed timeout (say 30s) that way we can
// alert administrators that they've set
// their RPC time too low even though the
// Ping did return successfully?
default:
sm.logger.Printf("[WARN] server manager: unhandled LAN Serf Event: %#v", e)
}
case <-sm.refreshRebalanceDurationCh:
chanLen := len(sm.refreshRebalanceDurationCh)
// Drain all messages from the rebalance channel
for i := 0; i < chanLen; i++ {
<-sm.refreshRebalanceDurationCh
}
case <-rebalanceTimer.C:
sm.logger.Printf("[INFO] server manager: server rebalance timeout")
sm.RebalanceServers()
// Only run one rebalance task at a time, but do
// allow for the channel to be drained
if atomic.CompareAndSwapInt32(&rebalanceTaskDispatched, 0, 1) {
sm.logger.Printf("[INFO] server manager: Launching rebalance duration task")
go func() {
defer atomic.StoreInt32(&rebalanceTaskDispatched, 0)
sm.refreshServerRebalanceTimer(rebalanceTimer)
}()
}
case <-rebalanceTimer.C:
sm.logger.Printf("[INFO] consul: server rebalance timeout")
sm.RebalanceServers()
case <-sm.shutdownCh:
sm.logger.Printf("[INFO] server manager: shutting down")
return
}
}
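
The CompareAndSwapInt32 guard above keeps at most one rebalance-duration task in flight at a time; a standalone sketch of that dispatch idiom (dispatchOnce is a hypothetical name):

```go
package main

import (
	"fmt"
	"sync/atomic"
	"time"
)

var taskDispatched int32

// dispatchOnce launches work in a goroutine only if no previous dispatch
// is still running; only the caller that flips 0 -> 1 launches it, and
// every other caller returns immediately as a cheap no-op.
func dispatchOnce(work func()) {
	if atomic.CompareAndSwapInt32(&taskDispatched, 0, 1) {
		go func() {
			defer atomic.StoreInt32(&taskDispatched, 0)
			work()
		}()
	}
}

func main() {
	for i := 0; i < 3; i++ {
		dispatchOnce(func() { time.Sleep(10 * time.Millisecond) })
	}
	time.Sleep(50 * time.Millisecond)
	fmt.Println("only the first call launched a task")
}
```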

@@ -72,4 +72,4 @@ func TestServerManager_NewServerManager(t *testing.T) {
// func (sm *ServerManager) RemoveServer(server *server_details.ServerDetails) {
// func (sm *ServerManager) StartServerManager() {
// func (sm *ServerManager) Start() {
