From ea14482376f59cee89b0e0b7ced2ebb44c85a429 Mon Sep 17 00:00:00 2001
From: Kyle Havlovitz
Date: Tue, 14 Aug 2018 14:23:52 -0700
Subject: [PATCH 1/2] Fix stats fetcher healthcheck RPCs not being independent

---
 agent/consul/stats_fetcher.go | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/agent/consul/stats_fetcher.go b/agent/consul/stats_fetcher.go
index 51a03e1587..845792de2f 100644
--- a/agent/consul/stats_fetcher.go
+++ b/agent/consul/stats_fetcher.go
@@ -92,6 +92,14 @@ func (f *StatsFetcher) Fetch(ctx context.Context, members []serf.Member) map[str
 	// canceled.
 	replies := make(map[string]*autopilot.ServerStats)
 	for _, workItem := range work {
+		// Drain the reply first if there is one.
+		select {
+		case reply := <-workItem.replyCh:
+			replies[workItem.server.ID] = reply
+			continue
+		default:
+		}
+
 		select {
 		case reply := <-workItem.replyCh:
 			replies[workItem.server.ID] = reply

From 4b35d877ca636741380fba9b11328c44f0d7f67b Mon Sep 17 00:00:00 2001
From: Kyle Havlovitz
Date: Tue, 14 Aug 2018 14:24:51 -0700
Subject: [PATCH 2/2] autopilot: don't follow the normal server removal rules for nonvoters

---
 agent/consul/autopilot/autopilot.go | 11 +++++++++--
 agent/consul/autopilot_test.go      | 24 ++++++++++++++++++++++++
 2 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/agent/consul/autopilot/autopilot.go b/agent/consul/autopilot/autopilot.go
index b50252ce51..b54935b5e1 100644
--- a/agent/consul/autopilot/autopilot.go
+++ b/agent/consul/autopilot/autopilot.go
@@ -204,12 +204,19 @@ func (a *Autopilot) pruneDeadServers() error {
 		}
 		if server != nil {
 			// todo(kyhavlov): change this to index by UUID
-			if _, ok := staleRaftServers[server.Addr.String()]; ok {
+			s, found := staleRaftServers[server.Addr.String()]
+			if found {
 				delete(staleRaftServers, server.Addr.String())
 			}
 
 			if member.Status == serf.StatusFailed {
-				failed = append(failed, member.Name)
+				// If the node is a nonvoter, we can remove it immediately.
+				if found && s.Suffrage == raft.Nonvoter {
+					a.logger.Printf("[INFO] autopilot: Attempting removal of failed server node %q", member.Name)
+					go serfLAN.RemoveFailedNode(member.Name)
+				} else {
+					failed = append(failed, member.Name)
+				}
 			}
 		}
 	}
diff --git a/agent/consul/autopilot_test.go b/agent/consul/autopilot_test.go
index 084b86fda6..b5cf73fbf7 100644
--- a/agent/consul/autopilot_test.go
+++ b/agent/consul/autopilot_test.go
@@ -92,6 +92,30 @@ func testCleanupDeadServer(t *testing.T, raftVersion int) {
 	}
 }
 
+func TestAutopilot_CleanupDeadNonvoter(t *testing.T) {
+	dir1, s1 := testServer(t)
+	defer os.RemoveAll(dir1)
+	defer s1.Shutdown()
+
+	dir2, s2 := testServerDCBootstrap(t, "dc1", false)
+	defer os.RemoveAll(dir2)
+	defer s2.Shutdown()
+
+	testrpc.WaitForLeader(t, s1.RPC, "dc1")
+
+	// Have s2 join and then shut it down immediately before it gets a chance to
+	// be promoted to a voter.
+	joinLAN(t, s2, s1)
+	retry.Run(t, func(r *retry.R) {
+		r.Check(wantRaft([]*Server{s1, s2}))
+	})
+	s2.Shutdown()
+
+	retry.Run(t, func(r *retry.R) {
+		r.Check(wantRaft([]*Server{s1}))
+	})
+}
+
 func TestAutopilot_CleanupDeadServerPeriodic(t *testing.T) {
 	t.Parallel()
 	dir1, s1 := testServerWithConfig(t, func(c *Config) {