diff --git a/consul/serf.go b/consul/serf.go index d43146ccbd..7a01c1b3e9 100644 --- a/consul/serf.go +++ b/consul/serf.go @@ -1,7 +1,6 @@ package consul import ( - "net" "strings" "github.com/hashicorp/consul/consul/agent" @@ -192,7 +191,7 @@ func (s *Server) wanNodeJoin(me serf.MemberEvent) { } } -// maybeBootsrap is used to handle bootstrapping when a new consul server joins +// maybeBootstrap is used to handle bootstrapping when a new consul server joins. func (s *Server) maybeBootstrap() { // Bootstrap can only be done if there are no committed logs, remove our // expectations of bootstrapping. This is slightly cheaper than the full @@ -203,13 +202,14 @@ func (s *Server) maybeBootstrap() { return } if index != 0 { + s.logger.Printf("[INFO] consul: Raft data found, disabling bootstrap mode") s.config.BootstrapExpect = 0 return } // Scan for all the known servers. members := s.serfLAN.Members() - addrs := make([]string, 0) + var servers []agent.Server for _, member := range members { valid, p := agent.IsConsulServer(member) if !valid { @@ -227,34 +227,62 @@ func (s *Server) maybeBootstrap() { s.logger.Printf("[ERR] consul: Member %v has bootstrap mode. Expect disabled.", member) return } - addr := &net.TCPAddr{IP: member.Addr, Port: p.Port} - addrs = append(addrs, addr.String()) + servers = append(servers, *p) } // Skip if we haven't met the minimum expect count. - if len(addrs) < s.config.BootstrapExpect { + if len(servers) < s.config.BootstrapExpect { return } + // Query each of the servers and make sure they report no Raft peers. + for _, server := range servers { + var peers []string + if err := s.connPool.RPC(s.config.Datacenter, server.Addr, server.Version, + "Status.Peers", &struct{}{}, &peers); err != nil { + s.logger.Printf("[ERR] consul: Failed to confirm peer status for %s: %v", server.Name, err) + return + } + + // Found a node with some Raft peers, stop bootstrap since there's + // evidence of an existing cluster. 
We should get folded in by the + // existing servers if that's the case, so it's safer to sit as a + // candidate with no peers so we don't cause spurious elections. + // It's OK this is racy, because even with an initial bootstrap + // as long as one peer runs bootstrap things will work, and if we + // have multiple peers bootstrap in the same way, that's OK. We + // just don't want a server added much later to do a live bootstrap + // and interfere with the cluster. This isn't required for Raft's + // correctness because no server in the existing cluster will vote + // for this server, but it makes things much more stable. + if len(peers) > 0 { + s.logger.Printf("[INFO] consul: Existing Raft peers reported by %s, disabling bootstrap mode", server.Name) + s.config.BootstrapExpect = 0 + return + } + } + // Attempt a live bootstrap! var configuration raft.Configuration - for _, addr := range addrs { - // TODO (slackpad) - This will need to be updated once we support - // node IDs. - server := raft.Server{ + var addrs []string + for _, server := range servers { + addr := server.Addr.String() + addrs = append(addrs, addr) + peer := raft.Server{ ID: raft.ServerID(addr), Address: raft.ServerAddress(addr), } - configuration.Servers = append(configuration.Servers, server) + configuration.Servers = append(configuration.Servers, peer) } - s.logger.Printf("[INFO] consul: Found expected number of peers (%s), attempting to bootstrap cluster...", + s.logger.Printf("[INFO] consul: Found expected number of peers, attempting bootstrap: %s", strings.Join(addrs, ",")) future := s.raft.BootstrapCluster(configuration) if err := future.Error(); err != nil { s.logger.Printf("[ERR] consul: Failed to bootstrap cluster: %v", err) } - // Bootstrapping complete, don't enter this again. + // Bootstrapping complete, or failed for some reason, don't enter this + // again. 
s.config.BootstrapExpect = 0 } diff --git a/consul/server_test.go b/consul/server_test.go index a8d65d36fa..0416eee844 100644 --- a/consul/server_test.go +++ b/consul/server_test.go @@ -501,7 +501,9 @@ func TestServer_JoinLAN_TLS(t *testing.T) { } func TestServer_Expect(t *testing.T) { - // all test servers should be in expect=3 mode + // All test servers should be in expect=3 mode, except for the 3rd one, + // but one with expect=0 can cause a bootstrap to occur from the other + // servers as currently implemented. dir1, s1 := testServerDCExpect(t, "dc1", 3) defer os.RemoveAll(dir1) defer s1.Shutdown() @@ -514,7 +516,11 @@ func TestServer_Expect(t *testing.T) { defer os.RemoveAll(dir3) defer s3.Shutdown() - // Try to join + dir4, s4 := testServerDCExpect(t, "dc1", 3) + defer os.RemoveAll(dir4) + defer s4.Shutdown() + + // Join the first two servers. addr := fmt.Sprintf("127.0.0.1:%d", s1.config.SerfLANConfig.MemberlistConfig.BindPort) if _, err := s2.JoinLAN([]string{addr}); err != nil { @@ -524,7 +530,7 @@ func TestServer_Expect(t *testing.T) { var p1 int var p2 int - // should have no peers yet + // Should have no peers yet since the bootstrap didn't occur. testutil.WaitForResult(func() (bool, error) { p1, _ = s1.numPeers() return p1 == 0, errors.New(fmt.Sprintf("%d", p1)) @@ -539,14 +545,14 @@ func TestServer_Expect(t *testing.T) { t.Fatalf("should have 0 peers: %v", err) }) - // join the third node + // Join the third node. if _, err := s3.JoinLAN([]string{addr}); err != nil { t.Fatalf("err: %v", err) } var p3 int - // should now have all three peers + // Now we have three servers so we should bootstrap. testutil.WaitForResult(func() (bool, error) { p1, _ = s1.numPeers() return p1 == 3, errors.New(fmt.Sprintf("%d", p1)) @@ -568,8 +574,30 @@ func TestServer_Expect(t *testing.T) { t.Fatalf("should have 3 peers: %v", err) }) - // check if there is one leader now + // Make sure a leader is elected, grab the current term and then add in + // the fourth server. 
testutil.WaitForLeader(t, s1.RPC, "dc1") + termBefore := s1.raft.Stats()["last_log_term"] + if _, err := s4.JoinLAN([]string{addr}); err != nil { + t.Fatalf("err: %v", err) + } + + // Wait for the new server to see itself added to the cluster. + var p4 int + testutil.WaitForResult(func() (bool, error) { + p4, _ = s4.numPeers() + return p4 == 4, errors.New(fmt.Sprintf("%d", p4)) + }, func(err error) { + t.Fatalf("should have 4 peers: %v", err) + }) + + // Make sure there's still a leader and that the term didn't change, + // so we know an election didn't occur. + testutil.WaitForLeader(t, s1.RPC, "dc1") + termAfter := s1.raft.Stats()["last_log_term"] + if termAfter != termBefore { + t.Fatalf("looks like an election took place") + } } func TestServer_BadExpect(t *testing.T) {