mirror of https://github.com/hashicorp/consul
Makes the server manager shift away from failed servers reported by Serf events.
Because this code was doing pointer equality checks, it would work for the case of a failed attempted RPC because the objects are from the manager itself (https://github.com/hashicorp/consul/blob/v1.0.3/agent/consul/rpc.go#L283-L302). But the pointer check would always fail for events coming in from the Serf path because the server object is newly created (https://github.com/hashicorp/consul/blob/v1.0.3/agent/router/serf_adapter.go#L14-L40). This means that we didn't proactively shift RPC traffic away from a failed server; we'd have to wait for an RPC to fail, which exposes the error to the calling client. By switching over to a name check vs. a pointer check we get the correct behavior. We added a DEBUG log as well to help observe this behavior during integrated testing. Related to #3863 since the fix here needed the same logic duplicated, owing to the complicated atomic stuff. /cc @dadgar for a heads up in case this also affects Nomad.
pull/3864/head
parent
0123d9db2e
commit
d9a6e2a901
|
@ -256,7 +256,7 @@ func (m *Manager) NotifyFailedServer(s *metadata.Server) {
|
||||||
// the server to the end of the list.
|
// the server to the end of the list.
|
||||||
|
|
||||||
// Only rotate the server list when there is more than one server
|
// Only rotate the server list when there is more than one server
|
||||||
if len(l.servers) > 1 && l.servers[0] == s &&
|
if len(l.servers) > 1 && l.servers[0].Name == s.Name &&
|
||||||
// Use atomic.CAS to emulate a TryLock().
|
// Use atomic.CAS to emulate a TryLock().
|
||||||
atomic.CompareAndSwapInt32(&m.notifyFailedBarrier, 0, 1) {
|
atomic.CompareAndSwapInt32(&m.notifyFailedBarrier, 0, 1) {
|
||||||
defer atomic.StoreInt32(&m.notifyFailedBarrier, 0)
|
defer atomic.StoreInt32(&m.notifyFailedBarrier, 0)
|
||||||
|
@ -267,9 +267,10 @@ func (m *Manager) NotifyFailedServer(s *metadata.Server) {
|
||||||
defer m.listLock.Unlock()
|
defer m.listLock.Unlock()
|
||||||
l = m.getServerList()
|
l = m.getServerList()
|
||||||
|
|
||||||
if len(l.servers) > 1 && l.servers[0] == s {
|
if len(l.servers) > 1 && l.servers[0].Name == s.Name {
|
||||||
l.servers = l.cycleServer()
|
l.servers = l.cycleServer()
|
||||||
m.saveServerList(l)
|
m.saveServerList(l)
|
||||||
|
m.logger.Printf(`[DEBUG] manager: cycled away from server "%s"`, s.Name)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue