From 85e678fbdd008a6ce2f1acfa099b9ee0a554f878 Mon Sep 17 00:00:00 2001 From: James Phillips Date: Tue, 7 Nov 2017 18:13:23 -0800 Subject: [PATCH] Saves the cycled server list after a failed ping when rebalancing. (#3662) Fixes #3463 --- agent/router/manager.go | 3 +- agent/router/manager_test.go | 57 ++++++++++++++++++++++++++++++++++-- 2 files changed, 56 insertions(+), 4 deletions(-) diff --git a/agent/router/manager.go b/agent/router/manager.go index 288a30bd0a..f31f0bb775 100644 --- a/agent/router/manager.go +++ b/agent/router/manager.go @@ -315,8 +315,7 @@ func (m *Manager) RebalanceServers() { break } m.logger.Printf(`[DEBUG] manager: pinging server "%s" failed: %s`, srv, err) - - l.cycleServer() + l.servers = l.cycleServer() } // If no healthy servers were found, sleep and wait for Serf to make diff --git a/agent/router/manager_test.go b/agent/router/manager_test.go index 00d63938de..4ba9b1801e 100644 --- a/agent/router/manager_test.go +++ b/agent/router/manager_test.go @@ -13,17 +13,38 @@ import ( "github.com/hashicorp/consul/agent/router" ) +type fauxAddr struct { + addr string +} + +func (a *fauxAddr) Network() string { + return "faux" +} + +func (a *fauxAddr) String() string { + return a.addr +} + type fauxConnPool struct { - // failPct between 0.0 and 1.0 == pct of time a Ping should fail + // failPct between 0.0 and 1.0 == pct of time a Ping should fail. failPct float64 + + // failAddr fail whenever we see this address. + failAddr net.Addr } -func (cp *fauxConnPool) Ping(string, net.Addr, int, bool) (bool, error) { +func (cp *fauxConnPool) Ping(dc string, addr net.Addr, version int, useTLS bool) (bool, error) { var success bool + successProb := rand.Float64() if successProb > cp.failPct { success = true } + + if cp.failAddr != nil && addr.String() == cp.failAddr.String() { + success = false + } + return success, nil } @@ -48,6 +69,13 @@ func testManagerFailProb(failPct float64) (m *router.Manager) { return m } +func testManagerFailAddr(failAddr net.Addr) (m *router.Manager) { + logger := log.New(os.Stderr, "", log.LstdFlags) + shutdownCh := make(chan struct{}) + m = router.New(logger, shutdownCh, &fauxSerf{}, &fauxConnPool{failAddr: failAddr}) + return m +} + // func (m *Manager) AddServer(server *metadata.Server) { func TestServers_AddServer(t *testing.T) { m := testManager() @@ -282,6 +310,31 @@ func TestServers_RebalanceServers(t *testing.T) { } } +func TestServers_RebalanceServers_AvoidFailed(t *testing.T) { + // Do a large number of rebalances with one failed server in the + // list and make sure we never have that one selected afterwards. + // This was added when fixing #3463, when we were just doing the + // shuffle and not actually cycling servers. We do a large number + // of trials with a small number of servers to try to make sure + // the shuffle alone won't give the right answer. + servers := []*metadata.Server{ + &metadata.Server{Name: "s1", Addr: &fauxAddr{"s1"}}, + &metadata.Server{Name: "s2", Addr: &fauxAddr{"s2"}}, + &metadata.Server{Name: "s3", Addr: &fauxAddr{"s3"}}, + } + for i := 0; i < 100; i++ { + m := testManagerFailAddr(&fauxAddr{"s2"}) + for _, s := range servers { + m.AddServer(s) + } + + m.RebalanceServers() + if front := m.FindServer().Name; front == "s2" { + t.Fatalf("should have avoided the failed server") + } + } +} + // func (m *Manager) RemoveServer(server *metadata.Server) { func TestManager_RemoveServer(t *testing.T) { const nodeNameFmt = "s%02d"