From 5e26971864684043c3b8c39bf6a5de2b4b65ce77 Mon Sep 17 00:00:00 2001 From: Sarah Christoff Date: Fri, 4 Oct 2019 16:10:02 -0500 Subject: [PATCH] Prune Unhealthy Agents (#6571) * Add -prune flag to ForceLeave --- agent/acl_test.go | 2 +- agent/agent.go | 9 +- agent/agent_endpoint.go | 5 +- agent/agent_endpoint_test.go | 48 ++++++ agent/consul/client.go | 5 +- agent/consul/leader_test.go | 2 +- agent/consul/server.go | 13 +- api/agent.go | 13 ++ api/agent_test.go | 14 ++ api/api.go | 1 + command/forceleave/forceleave.go | 14 +- command/forceleave/forceleave_test.go | 36 +++++ go.mod | 4 +- go.sum | 4 + .../github.com/hashicorp/go-hclog/global.go | 16 +- .../github.com/hashicorp/serf/serf/config.go | 5 + .../hashicorp/serf/serf/messages.go | 1 + vendor/github.com/hashicorp/serf/serf/serf.go | 153 ++++++++++++++---- vendor/modules.txt | 4 +- website/source/api/agent.html.md | 8 + .../commands/force-leave.html.markdown.erb | 4 + 21 files changed, 310 insertions(+), 51 deletions(-) diff --git a/agent/acl_test.go b/agent/acl_test.go index e7a6b84510..ffc7e00ac9 100644 --- a/agent/acl_test.go +++ b/agent/acl_test.go @@ -127,7 +127,7 @@ func (a *TestACLAgent) LocalMember() serf.Member { func (a *TestACLAgent) JoinLAN(addrs []string) (n int, err error) { return 0, fmt.Errorf("Unimplemented") } -func (a *TestACLAgent) RemoveFailedNode(node string) error { +func (a *TestACLAgent) RemoveFailedNode(node string, prune bool) error { return fmt.Errorf("Unimplemented") } diff --git a/agent/agent.go b/agent/agent.go index f2e5c318e8..e5d7cb6d13 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -6,7 +6,6 @@ import ( "crypto/tls" "encoding/json" "fmt" - "github.com/hashicorp/go-memdb" "io" "io/ioutil" "log" @@ -20,6 +19,8 @@ import ( "sync" "time" + "github.com/hashicorp/go-memdb" + "google.golang.org/grpc" "github.com/armon/go-metrics" @@ -133,7 +134,7 @@ type delegate interface { LANSegmentMembers(segment string) ([]serf.Member, error) LocalMember() serf.Member JoinLAN(addrs []string) (n int, err error) - RemoveFailedNode(node string) error + RemoveFailedNode(node string, prune bool) error ResolveToken(secretID string) (acl.Authorizer, error) RPC(method string, args interface{}, reply interface{}) error ACLsEnabled() bool @@ -1778,9 +1779,9 @@ func (a *Agent) JoinWAN(addrs []string) (n int, err error) { } // ForceLeave is used to remove a failed node from the cluster -func (a *Agent) ForceLeave(node string) (err error) { +func (a *Agent) ForceLeave(node string, prune bool) (err error) { a.logger.Printf("[INFO] agent: Force leaving node: %v", node) - err = a.delegate.RemoveFailedNode(node) + err = a.delegate.RemoveFailedNode(node, prune) if err != nil { a.logger.Printf("[WARN] agent: Failed to remove node: %v", err) } diff --git a/agent/agent_endpoint.go b/agent/agent_endpoint.go index f1d85e2628..69f5271618 100644 --- a/agent/agent_endpoint.go +++ b/agent/agent_endpoint.go @@ -438,8 +438,11 @@ func (s *HTTPServer) AgentForceLeave(resp http.ResponseWriter, req *http.Request return nil, acl.ErrPermissionDenied } + //Check the value of the prune query + _, prune := req.URL.Query()["prune"] + addr := strings.TrimPrefix(req.URL.Path, "/v1/agent/force-leave/") - return nil, s.agent.ForceLeave(addr) + return nil, s.agent.ForceLeave(addr, prune) } // syncChanges is a helper function which wraps a blocking call to sync diff --git a/agent/agent_endpoint_test.go b/agent/agent_endpoint_test.go index 39a6cb44ca..6dc3b39a57 100644 --- a/agent/agent_endpoint_test.go +++ b/agent/agent_endpoint_test.go @@ -1655,6 +1655,54 @@ func TestAgent_ForceLeave_ACLDeny(t *testing.T) { }) } +func TestAgent_ForceLeavePrune(t *testing.T) { + t.Parallel() + a1 := NewTestAgent(t, t.Name()+"-a1", "") + defer a1.Shutdown() + a2 := NewTestAgent(t, t.Name()+"-a2", "") + testrpc.WaitForLeader(t, a1.RPC, "dc1") + testrpc.WaitForLeader(t, a2.RPC, "dc1") + + // Join first + addr := fmt.Sprintf("127.0.0.1:%d", a2.Config.SerfPortLAN) + _, err := a1.JoinLAN([]string{addr}) + if err != nil { + t.Fatalf("err: %v", err) + } + + // this test probably needs work + a2.Shutdown() + // Wait for agent being marked as failed, so we wait for full shutdown of Agent + retry.Run(t, func(r *retry.R) { + m := a1.LANMembers() + for _, member := range m { + if member.Name == a2.Config.NodeName { + if member.Status != serf.StatusFailed { + r.Fatalf("got status %q want %q", member.Status, serf.StatusFailed) + } + + } + } + }) + + // Force leave now + req, _ := http.NewRequest("PUT", fmt.Sprintf("/v1/agent/force-leave/%s?prune=true", a2.Config.NodeName), nil) + obj, err := a1.srv.AgentForceLeave(nil, req) + if err != nil { + t.Fatalf("Err: %v", err) + } + if obj != nil { + t.Fatalf("Err: %v", obj) + } + retry.Run(t, func(r *retry.R) { + m := len(a1.LANMembers()) + if m != 1 { + r.Fatalf("want one member, got %v", m) + } + }) + +} + func TestAgent_RegisterCheck(t *testing.T) { t.Parallel() a := NewTestAgent(t, t.Name(), "") diff --git a/agent/consul/client.go b/agent/consul/client.go index e444041b4c..7e5a0044f9 100644 --- a/agent/consul/client.go +++ b/agent/consul/client.go @@ -263,7 +263,10 @@ func (c *Client) LANSegmentMembers(segment string) ([]serf.Member, error) { } // RemoveFailedNode is used to remove a failed node from the cluster -func (c *Client) RemoveFailedNode(node string) error { +func (c *Client) RemoveFailedNode(node string, prune bool) error { + if prune { + c.serf.RemoveFailedNodePrune(node) + } return c.serf.RemoveFailedNode(node) } diff --git a/agent/consul/leader_test.go b/agent/consul/leader_test.go index 0e97a071d5..2d73a13121 100644 --- a/agent/consul/leader_test.go +++ b/agent/consul/leader_test.go @@ -549,7 +549,7 @@ func TestLeader_LeftServer(t *testing.T) { servers[0].Shutdown() // Force remove the non-leader (transition to left state) - if err := servers[1].RemoveFailedNode(servers[0].config.NodeName); err != nil { + if err := servers[1].RemoveFailedNode(servers[0].config.NodeName, false); err != nil { t.Fatalf("err: %v", err) } diff --git a/agent/consul/server.go b/agent/consul/server.go index cba2b58a7e..35c3f93a6d 100644 --- a/agent/consul/server.go +++ b/agent/consul/server.go @@ -1008,8 +1008,15 @@ func (s *Server) WANMembers() []serf.Member { } // RemoveFailedNode is used to remove a failed node from the cluster -func (s *Server) RemoveFailedNode(node string) error { - if err := s.serfLAN.RemoveFailedNode(node); err != nil { +func (s *Server) RemoveFailedNode(node string, prune bool) error { + var removeFn func(*serf.Serf, string) error + if prune { + removeFn = (*serf.Serf).RemoveFailedNodePrune + } else { + removeFn = (*serf.Serf).RemoveFailedNode + } + + if err := removeFn(s.serfLAN, node); err != nil { return err } // The Serf WAN pool stores members as node.datacenter @@ -1018,7 +1025,7 @@ func (s *Server) RemoveFailedNode(node string) error { node = node + "." + s.config.Datacenter } if s.serfWAN != nil { - if err := s.serfWAN.RemoveFailedNode(node); err != nil { + if err := removeFn(s.serfWAN, node); err != nil { return err } } diff --git a/api/agent.go b/api/agent.go index 7a5c427209..fa40ffa180 100644 --- a/api/agent.go +++ b/api/agent.go @@ -730,6 +730,19 @@ func (a *Agent) ForceLeave(node string) error { return nil } +//ForceLeavePrune is used to have an a failed agent removed +//from the list of members +func (a *Agent) ForceLeavePrune(node string) error { + r := a.c.newRequest("PUT", "/v1/agent/force-leave/"+node) + r.params.Set("prune", "1") + _, resp, err := requireOK(a.c.doRequest(r)) + if err != nil { + return err + } + resp.Body.Close() + return nil +} + // ConnectAuthorize is used to authorize an incoming connection // to a natively integrated Connect service. func (a *Agent) ConnectAuthorize(auth *AgentAuthorizeParams) (*AgentAuthorize, error) { diff --git a/api/agent_test.go b/api/agent_test.go index 3b8e61f4a7..4f86c9491b 100644 --- a/api/agent_test.go +++ b/api/agent_test.go @@ -1070,6 +1070,20 @@ func TestAPI_AgentForceLeave(t *testing.T) { } } +func TestAPI_AgentForceLeavePrune(t *testing.T) { + t.Parallel() + c, s := makeClient(t) + defer s.Stop() + + agent := c.Agent() + + // Eject somebody + err := agent.ForceLeavePrune("foo") + if err != nil { + t.Fatalf("err: %v", err) + } +} + func TestAPI_AgentMonitor(t *testing.T) { t.Parallel() c, s := makeClient(t) diff --git a/api/api.go b/api/api.go index 0444e9e75e..81072ebb75 100644 --- a/api/api.go +++ b/api/api.go @@ -690,6 +690,7 @@ func (r *request) setQueryOptions(q *QueryOptions) { r.header.Set("Cache-Control", strings.Join(cc, ", ")) } } + r.ctx = q.ctx } diff --git a/command/forceleave/forceleave.go b/command/forceleave/forceleave.go index 1f5d308e3a..fcf5cefe5d 100644 --- a/command/forceleave/forceleave.go +++ b/command/forceleave/forceleave.go @@ -19,10 +19,15 @@ type cmd struct { flags *flag.FlagSet http *flags.HTTPFlags help string + + //flags + prune bool } func (c *cmd) init() { c.flags = flag.NewFlagSet("", flag.ContinueOnError) + c.flags.BoolVar(&c.prune, "prune", false, + "Remove agent completely from list of members") c.http = &flags.HTTPFlags{} flags.Merge(c.flags, c.http.ClientFlags()) c.help = flags.Usage(help, c.flags) @@ -47,7 +52,12 @@ func (c *cmd) Run(args []string) int { return 1 } - err = client.Agent().ForceLeave(nodes[0]) + if c.prune { + err = client.Agent().ForceLeavePrune(nodes[0]) + } else { + err = client.Agent().ForceLeave(nodes[0]) + } + if err != nil { c.UI.Error(fmt.Sprintf("Error force leaving: %s", err)) return 1 @@ -74,4 +84,6 @@ Usage: consul force-leave [options] name that are never coming back. If you do not force leave a failed node, Consul will attempt to reconnect to those failed nodes for some period of time before eventually reaping them. + + -prune Remove agent completely from list of members ` diff --git a/command/forceleave/forceleave_test.go b/command/forceleave/forceleave_test.go index 89018a2b2f..eb923fa517 100644 --- a/command/forceleave/forceleave_test.go +++ b/command/forceleave/forceleave_test.go @@ -56,6 +56,42 @@ func TestForceLeaveCommand(t *testing.T) { }) } +func TestForceLeaveCommand_prune(t *testing.T) { + t.Parallel() + a1 := agent.NewTestAgent(t, t.Name()+"-a1", ``) + defer a1.Shutdown() + a2 := agent.NewTestAgent(t, t.Name()+"-a2", ``) + defer a2.Shutdown() + + _, err := a2.JoinLAN([]string{a1.Config.SerfBindAddrLAN.String()}) + if err != nil { + t.Fatalf("err: %s", err) + } + + // Forcibly shutdown a2 so that it appears "failed" in a1 + a2.Shutdown() + + ui := cli.NewMockUi() + c := New(ui) + args := []string{ + "-http-addr=" + a1.HTTPAddr(), + "-prune", + a2.Config.NodeName, + } + + code := c.Run(args) + if code != 0 { + t.Fatalf("bad: %d. %#v", code, ui.ErrorWriter.String()) + } + retry.Run(t, func(r *retry.R) { + m := len(a1.LANMembers()) + if m != 1 { + r.Fatalf("should have 1 members, got %#v", m) + } + }) + +} + func TestForceLeaveCommand_noAddrs(t *testing.T) { t.Parallel() ui := cli.NewMockUi() diff --git a/go.mod b/go.mod index 4e48a62e22..449c31effc 100644 --- a/go.mod +++ b/go.mod @@ -30,7 +30,7 @@ require ( github.com/hashicorp/go-checkpoint v0.0.0-20171009173528-1545e56e46de github.com/hashicorp/go-cleanhttp v0.5.1 github.com/hashicorp/go-discover v0.0.0-20190403160810-22221edb15cd - github.com/hashicorp/go-hclog v0.9.1 + github.com/hashicorp/go-hclog v0.9.2 github.com/hashicorp/go-memdb v0.0.0-20180223233045-1289e7fffe71 github.com/hashicorp/go-msgpack v0.5.5 github.com/hashicorp/go-multierror v1.0.0 @@ -49,7 +49,7 @@ require ( github.com/hashicorp/net-rpc-msgpackrpc v0.0.0-20151116020338-a14192a58a69 github.com/hashicorp/raft v1.1.1 github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea - github.com/hashicorp/serf v0.8.2 + github.com/hashicorp/serf v0.8.5 github.com/hashicorp/vault/api v1.0.4 github.com/hashicorp/yamux v0.0.0-20181012175058-2f1d1f20f75d github.com/imdario/mergo v0.3.6 diff --git a/go.sum b/go.sum index 1deec3f25a..b44c3ba722 100644 --- a/go.sum +++ b/go.sum @@ -123,6 +123,8 @@ github.com/hashicorp/go-hclog v0.0.0-20180709165350-ff2cf002a8dd/go.mod h1:9bjs9 github.com/hashicorp/go-hclog v0.8.0/go.mod h1:5CU+agLiy3J7N7QjHK5d05KxGsuXiQLrjA0H7acj2lQ= github.com/hashicorp/go-hclog v0.9.1 h1:9PZfAcVEvez4yhLH2TBU64/h/z4xlFI80cWXRrxuKuM= github.com/hashicorp/go-hclog v0.9.1/go.mod h1:5CU+agLiy3J7N7QjHK5d05KxGsuXiQLrjA0H7acj2lQ= +github.com/hashicorp/go-hclog v0.9.2 h1:CG6TE5H9/JXsFWJCfoIVpKFIkFe6ysEuHirp4DxCsHI= +github.com/hashicorp/go-hclog v0.9.2/go.mod h1:5CU+agLiy3J7N7QjHK5d05KxGsuXiQLrjA0H7acj2lQ= github.com/hashicorp/go-immutable-radix v1.0.0 h1:AKDB1HM5PWEA7i4nhcpwOrO2byshxBjXVn/J/3+z5/0= github.com/hashicorp/go-immutable-radix v1.0.0/go.mod h1:0y9vanUI8NX6FsYoO3zeMjhV/C5i9g4Q3DwcSNZ4P60= github.com/hashicorp/go-memdb v0.0.0-20180223233045-1289e7fffe71 h1:yxxFgVz31vFoKKTtRUNbXLNe4GFnbLKqg+0N7yG42L8= @@ -184,6 +186,8 @@ github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea h1:xykPFhrBA github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea/go.mod h1:pNv7Wc3ycL6F5oOWn+tPGo2gWD4a5X+yp/ntwdKLjRk= github.com/hashicorp/serf v0.8.2 h1:YZ7UKsJv+hKjqGVUUbtE3HNj79Eln2oQ75tniF6iPt0= github.com/hashicorp/serf v0.8.2/go.mod h1:6hOLApaqBFA1NXqRQAsxw9QxuDEvNxSQRwA/JwenrHc= +github.com/hashicorp/serf v0.8.5 h1:ZynDUIQiA8usmRgPdGPHFdPnb1wgGI9tK3mO9hcAJjc= +github.com/hashicorp/serf v0.8.5/go.mod h1:UpNcs7fFbpKIyZaUuSW6EPiH+eZC7OuyFD+wc1oal+k= github.com/hashicorp/vault/api v1.0.4 h1:j08Or/wryXT4AcHj1oCbMd7IijXcKzYUGw59LGu9onU= github.com/hashicorp/vault/api v1.0.4/go.mod h1:gDcqh3WGcR1cpF5AJz/B1UFheUEneMoIospckxBxk6Q= github.com/hashicorp/vault/sdk v0.1.13 h1:mOEPeOhT7jl0J4AMl1E705+BcmeRs1VmKNb9F0sMLy8= diff --git a/vendor/github.com/hashicorp/go-hclog/global.go b/vendor/github.com/hashicorp/go-hclog/global.go index e5f7f95ff0..3efc54c129 100644 --- a/vendor/github.com/hashicorp/go-hclog/global.go +++ b/vendor/github.com/hashicorp/go-hclog/global.go @@ -22,7 +22,11 @@ var ( // to be used in more specific contexts. func Default() Logger { protect.Do(func() { - def = New(DefaultOptions) + // If SetDefault was used before Default() was called, we need to + // detect that here. + if def == nil { + def = New(DefaultOptions) + } }) return def @@ -32,3 +36,13 @@ func Default() Logger { func L() Logger { return Default() } + +// SetDefault changes the logger to be returned by Default()and L() +// to the one given. This allows packages to use the default logger +// and have higher level packages change it to match the execution +// environment. It returns any old default if there is one. +func SetDefault(log Logger) Logger { + old := def + def = log + return old +} diff --git a/vendor/github.com/hashicorp/serf/serf/config.go b/vendor/github.com/hashicorp/serf/serf/config.go index 79f36f57c7..0de4247c5b 100644 --- a/vendor/github.com/hashicorp/serf/serf/config.go +++ b/vendor/github.com/hashicorp/serf/serf/config.go @@ -242,6 +242,10 @@ type Config struct { // Merge can be optionally provided to intercept a cluster merge // and conditionally abort the merge. Merge MergeDelegate + + // UserEventSizeLimit is maximum byte size limit of user event `name` + `payload` in bytes. + // It's optimal to be relatively small, since it's going to be gossiped through the cluster. + UserEventSizeLimit int } // Init allocates the subdata structures @@ -282,5 +286,6 @@ func DefaultConfig() *Config { QuerySizeLimit: 1024, EnableNameConflictResolution: true, DisableCoordinates: false, + UserEventSizeLimit: 512, } } diff --git a/vendor/github.com/hashicorp/serf/serf/messages.go b/vendor/github.com/hashicorp/serf/serf/messages.go index 20df5b8e83..138817c50e 100644 --- a/vendor/github.com/hashicorp/serf/serf/messages.go +++ b/vendor/github.com/hashicorp/serf/serf/messages.go @@ -55,6 +55,7 @@ type messageJoin struct { type messageLeave struct { LTime LamportTime Node string + Prune bool } // messagePushPullType is used when doing a state exchange. This diff --git a/vendor/github.com/hashicorp/serf/serf/serf.go b/vendor/github.com/hashicorp/serf/serf/serf.go index bb6c22fe7b..9fe4cc3a71 100644 --- a/vendor/github.com/hashicorp/serf/serf/serf.go +++ b/vendor/github.com/hashicorp/serf/serf/serf.go @@ -223,8 +223,8 @@ type queries struct { } const ( - UserEventSizeLimit = 512 // Maximum byte size for event name and payload snapshotSizeLimit = 128 * 1024 // Maximum 128 KB snapshot + UserEventSizeLimit = 9 * 1024 // Maximum 9KB for event name and payload ) // Create creates a new Serf instance, starting all the background tasks @@ -242,6 +242,10 @@ func Create(conf *Config) (*Serf, error) { conf.ProtocolVersion, ProtocolVersionMin, ProtocolVersionMax) } + if conf.UserEventSizeLimit > UserEventSizeLimit { + return nil, fmt.Errorf("user event size limit exceeds limit of %d bytes", UserEventSizeLimit) + } + logger := conf.Logger if logger == nil { logOutput := conf.LogOutput @@ -437,14 +441,25 @@ func (s *Serf) KeyManager() *KeyManager { } // UserEvent is used to broadcast a custom user event with a given -// name and payload. The events must be fairly small, and if the -// size limit is exceeded and error will be returned. If coalesce is enabled, -// nodes are allowed to coalesce this event. Coalescing is only available -// starting in v0.2 +// name and payload. If the configured size limit is exceeded and error will be returned. +// If coalesce is enabled, nodes are allowed to coalesce this event. +// Coalescing is only available starting in v0.2 func (s *Serf) UserEvent(name string, payload []byte, coalesce bool) error { - // Check the size limit - if len(name)+len(payload) > UserEventSizeLimit { - return fmt.Errorf("user event exceeds limit of %d bytes", UserEventSizeLimit) + payloadSizeBeforeEncoding := len(name) + len(payload) + + // Check size before encoding to prevent needless encoding and return early if it's over the specified limit. + if payloadSizeBeforeEncoding > s.config.UserEventSizeLimit { + return fmt.Errorf( + "user event exceeds configured limit of %d bytes before encoding", + s.config.UserEventSizeLimit, + ) + } + + if payloadSizeBeforeEncoding > UserEventSizeLimit { + return fmt.Errorf( + "user event exceeds sane limit of %d bytes before encoding", + UserEventSizeLimit, + ) } // Create a message @@ -454,16 +469,34 @@ func (s *Serf) UserEvent(name string, payload []byte, coalesce bool) error { Payload: payload, CC: coalesce, } - s.eventClock.Increment() - - // Process update locally - s.handleUserEvent(&msg) // Start broadcasting the event raw, err := encodeMessage(messageUserEventType, &msg) if err != nil { return err } + + // Check the size after encoding to be sure again that + // we're not attempting to send over the specified size limit. + if len(raw) > s.config.UserEventSizeLimit { + return fmt.Errorf( + "encoded user event exceeds configured limit of %d bytes after encoding", + s.config.UserEventSizeLimit, + ) + } + + if len(raw) > UserEventSizeLimit { + return fmt.Errorf( + "encoded user event exceeds sane limit of %d bytes before encoding", + UserEventSizeLimit, + ) + } + + s.eventClock.Increment() + + // Process update locally + s.handleUserEvent(&msg) + s.eventBroadcasts.QueueBroadcast(&broadcast{ msg: raw, }) @@ -748,15 +781,26 @@ func (s *Serf) Members() []Member { return members } -// RemoveFailedNode forcibly removes a failed node from the cluster +// RemoveFailedNode is a backwards compatible form +// of forceleave +func (s *Serf) RemoveFailedNode(node string) error { + return s.forceLeave(node, false) +} + +func (s *Serf) RemoveFailedNodePrune(node string) error { + return s.forceLeave(node, true) +} + +// ForceLeave forcibly removes a failed node from the cluster // immediately, instead of waiting for the reaper to eventually reclaim it. // This also has the effect that Serf will no longer attempt to reconnect // to this node. -func (s *Serf) RemoveFailedNode(node string) error { +func (s *Serf) forceLeave(node string, prune bool) error { // Construct the message to broadcast msg := messageLeave{ LTime: s.clock.Time(), Node: node, + Prune: prune, } s.clock.Increment() @@ -1027,6 +1071,7 @@ func (s *Serf) handleNodeUpdate(n *memberlist.Node) { // handleNodeLeaveIntent is called when an intent to leave is received. func (s *Serf) handleNodeLeaveIntent(leaveMsg *messageLeave) bool { + // Witness a potentially newer time s.clock.Witness(leaveMsg.LTime) @@ -1057,6 +1102,10 @@ func (s *Serf) handleNodeLeaveIntent(leaveMsg *messageLeave) bool { case StatusAlive: member.Status = StatusLeaving member.statusLTime = leaveMsg.LTime + + if leaveMsg.Prune { + s.handlePrune(member) + } return true case StatusFailed: member.Status = StatusLeft @@ -1065,6 +1114,7 @@ func (s *Serf) handleNodeLeaveIntent(leaveMsg *messageLeave) bool { // Remove from the failed list and add to the left list. We add // to the left list so that when we do a sync, other nodes will // remove it from their failed list. + s.failedMembers = removeOldMember(s.failedMembers, member.Name) s.leftMembers = append(s.leftMembers, member) @@ -1079,12 +1129,40 @@ func (s *Serf) handleNodeLeaveIntent(leaveMsg *messageLeave) bool { Members: []Member{member.Member}, } } + + if leaveMsg.Prune { + s.handlePrune(member) + } + + return true + + case StatusLeaving, StatusLeft: + if leaveMsg.Prune { + s.handlePrune(member) + } return true default: return false } } +// handlePrune waits for nodes that are leaving and then forcibly +// erases a member from the list of members +func (s *Serf) handlePrune(member *memberState) { + if member.Status == StatusLeaving { + time.Sleep(s.config.BroadcastTimeout + s.config.LeavePropagateDelay) + } + + s.logger.Printf("[INFO] serf: EventMemberReap (forced): %s %s", member.Name, member.Member.Addr) + + //If we are leaving or left we may be in that list of members + if member.Status == StatusLeaving || member.Status == StatusLeft { + s.leftMembers = removeOldMember(s.leftMembers, member.Name) + } + s.eraseNode(member) + +} + // handleNodeJoinIntent is called when a node broadcasts a // join message to set the lamport time of its join func (s *Serf) handleNodeJoinIntent(joinMsg *messageJoin) bool { @@ -1405,6 +1483,30 @@ func (s *Serf) resolveNodeConflict() { } } +//eraseNode takes a node completely out of the member list +func (s *Serf) eraseNode(m *memberState) { + // Delete from members + delete(s.members, m.Name) + + // Tell the coordinate client the node has gone away and delete + // its cached coordinates. + if !s.config.DisableCoordinates { + s.coordClient.ForgetNode(m.Name) + + s.coordCacheLock.Lock() + delete(s.coordCache, m.Name) + s.coordCacheLock.Unlock() + } + + // Send an event along + if s.config.EventCh != nil { + s.config.EventCh <- MemberEvent{ + Type: EventMemberReap, + Members: []Member{m.Member}, + } + } +} + // handleReap periodically reaps the list of failed and left members, as well // as old buffered intents. func (s *Serf) handleReap() { @@ -1455,27 +1557,10 @@ func (s *Serf) reap(old []*memberState, now time.Time, timeout time.Duration) [] n-- i-- - // Delete from members - delete(s.members, m.Name) - - // Tell the coordinate client the node has gone away and delete - // its cached coordinates. - if !s.config.DisableCoordinates { - s.coordClient.ForgetNode(m.Name) - - s.coordCacheLock.Lock() - delete(s.coordCache, m.Name) - s.coordCacheLock.Unlock() - } - - // Send an event along + // Delete from members and send out event s.logger.Printf("[INFO] serf: EventMemberReap: %s", m.Name) - if s.config.EventCh != nil { - s.config.EventCh <- MemberEvent{ - Type: EventMemberReap, - Members: []Member{m.Member}, - } - } + s.eraseNode(m) + } return old diff --git a/vendor/modules.txt b/vendor/modules.txt index 18484b9b13..097a46cb2b 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -184,7 +184,7 @@ github.com/hashicorp/go-discover/provider/scaleway github.com/hashicorp/go-discover/provider/softlayer github.com/hashicorp/go-discover/provider/triton github.com/hashicorp/go-discover/provider/vsphere -# github.com/hashicorp/go-hclog v0.9.1 +# github.com/hashicorp/go-hclog v0.9.2 github.com/hashicorp/go-hclog # github.com/hashicorp/go-immutable-radix v1.0.0 github.com/hashicorp/go-immutable-radix @@ -242,7 +242,7 @@ github.com/hashicorp/net-rpc-msgpackrpc github.com/hashicorp/raft # github.com/hashicorp/raft-boltdb v0.0.0-20171010151810-6e5ba93211ea github.com/hashicorp/raft-boltdb -# github.com/hashicorp/serf v0.8.2 +# github.com/hashicorp/serf v0.8.5 github.com/hashicorp/serf/coordinate github.com/hashicorp/serf/serf # github.com/hashicorp/vault/api v1.0.4 diff --git a/website/source/api/agent.html.md b/website/source/api/agent.html.md index 9a52b2201e..aed49e7e2b 100644 --- a/website/source/api/agent.html.md +++ b/website/source/api/agent.html.md @@ -490,6 +490,14 @@ state allows its old entries to be removed. | ------ | ---------------------------- | -------------------------- | | `PUT` | `/agent/force-leave/:node` | `application/json` | +Additionally, by specifying the `prune` flag, a node can be forcibly removed from +the list of members entirely. + +| Method | Path | Produces | +| ------ | --------------------------------------- | -------------------------- | +| `PUT` | `/agent/force-leave/:node?prune` | `application/json` | + + The table below shows this endpoint's support for [blocking queries](/api/features/blocking.html), [consistency modes](/api/features/consistency.html), diff --git a/website/source/docs/commands/force-leave.html.markdown.erb b/website/source/docs/commands/force-leave.html.markdown.erb index 941c30fa1b..836d405ad9 100644 --- a/website/source/docs/commands/force-leave.html.markdown.erb +++ b/website/source/docs/commands/force-leave.html.markdown.erb @@ -56,3 +56,7 @@ datacenter `us-east1`, run: ``` consul force-leave server1.us-east1 ``` +#### Command Options + +* `-prune` - Removes failed or left agent from the list of +members entirely