From ea5b0f2c7c9691a05699f902258dd1867463a89b Mon Sep 17 00:00:00 2001 From: Frank Schroeder Date: Tue, 20 Jun 2017 09:29:20 +0200 Subject: [PATCH] agent: fix 'consul leave' shutdown race (#2880) When the agent is triggered to shutdown via an external 'consul leave' command delivered via the HTTP API then the client expects to receive a response when the agent is down. This creates a race on when to shutdown the agent itself like the RPC server, the checks and the state and the external endpoints like DNS and HTTP. This patch splits the shutdown process into two parts: * shutdown the agent * shutdown the endpoints (http and dns) They can be executed multiple times, concurrently and in any order but should be executed first agent, then endpoints to provide consistent behavior across all use cases. Both calls have to be executed for a proper shutdown. This could be partially hidden in a single function but would introduce some magic that happens behind the scenes which one has to know of but isn't obvious. Fixes #2880 --- agent/agent.go | 64 +++++++++++++++++++++++------------------ agent/agent_endpoint.go | 2 +- agent/testagent.go | 8 ++++-- command/agent.go | 5 +++- 4 files changed, 47 insertions(+), 32 deletions(-) diff --git a/agent/agent.go b/agent/agent.go index 94c9b6578a..a36115b431 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -1112,9 +1112,10 @@ func (a *Agent) Leave() error { return a.delegate.Leave() } -// Shutdown is used to hard stop the agent. Should be -// preceded by a call to Leave to do it gracefully. -func (a *Agent) Shutdown() error { +// ShutdownAgent is used to hard stop the agent. Should be preceded by +// Leave to do it gracefully. Should be followed by ShutdownEndpoints to +// terminate the HTTP and DNS servers as well. +func (a *Agent) ShutdownAgent() error { a.shutdownLock.Lock() defer a.shutdownLock.Unlock() @@ -1123,29 +1124,6 @@ func (a *Agent) Shutdown() error { } a.logger.Println("[INFO] agent: Requesting shutdown") - // Stop all API endpoints - for _, srv := range a.dnsServers { - a.logger.Printf("[INFO] agent: Stopping DNS server %s (%s)", srv.Server.Addr, srv.Server.Net) - srv.Shutdown() - } - for _, srv := range a.httpServers { - // http server is HTTPS if TLSConfig is not nil and NextProtos does not only contain "h2" - // the latter seems to be a side effect of HTTP/2 support in go 1.8. TLSConfig != nil is - // no longer sufficient to check for an HTTPS server. - a.logger.Printf("[INFO] agent: Stopping %s server %s", strings.ToUpper(srv.proto), srv.Addr) - - // graceful shutdown - ctx, cancel := context.WithTimeout(context.Background(), time.Second) - defer cancel() - srv.Shutdown(ctx) - if ctx.Err() == context.DeadlineExceeded { - a.logger.Printf("[WARN] agent: Timeout stopping %s server %s", strings.ToUpper(srv.proto), srv.Addr) - } - } - a.logger.Println("[INFO] agent: Waiting for endpoints to shut down") - a.wgServers.Wait() - a.logger.Print("[INFO] agent: Endpoints down") - // Stop all the checks a.checkLock.Lock() defer a.checkLock.Unlock() @@ -1155,11 +1133,9 @@ func (a *Agent) Shutdown() error { for _, chk := range a.checkTTLs { chk.Stop() } - for _, chk := range a.checkHTTPs { chk.Stop() } - for _, chk := range a.checkTCPs { chk.Stop() } @@ -1185,6 +1161,38 @@ func (a *Agent) Shutdown() error { return err } +// ShutdownEndpoints terminates the HTTP and DNS servers. Should be +// preceeded by ShutdownAgent. +func (a *Agent) ShutdownEndpoints() { + a.shutdownLock.Lock() + defer a.shutdownLock.Unlock() + + if len(a.dnsServers) == 0 || len(a.httpServers) == 0 { + return + } + + for _, srv := range a.dnsServers { + a.logger.Printf("[INFO] agent: Stopping DNS server %s (%s)", srv.Server.Addr, srv.Server.Net) + srv.Shutdown() + } + a.dnsServers = nil + + for _, srv := range a.httpServers { + a.logger.Printf("[INFO] agent: Stopping %s server %s", strings.ToUpper(srv.proto), srv.Addr) + ctx, cancel := context.WithTimeout(context.Background(), time.Second) + defer cancel() + srv.Shutdown(ctx) + if ctx.Err() == context.DeadlineExceeded { + a.logger.Printf("[WARN] agent: Timeout stopping %s server %s", strings.ToUpper(srv.proto), srv.Addr) + } + } + a.httpServers = nil + + a.logger.Println("[INFO] agent: Waiting for endpoints to shut down") + a.wgServers.Wait() + a.logger.Print("[INFO] agent: Endpoints down") +} + // ReloadCh is used to return a channel that can be // used for triggering reloads and returning a response. func (a *Agent) ReloadCh() chan chan error { diff --git a/agent/agent_endpoint.go b/agent/agent_endpoint.go index 78fd52c4d3..f4b4f7b82b 100644 --- a/agent/agent_endpoint.go +++ b/agent/agent_endpoint.go @@ -199,7 +199,7 @@ func (s *HTTPServer) AgentLeave(resp http.ResponseWriter, req *http.Request) (in if err := s.agent.Leave(); err != nil { return nil, err } - return nil, s.agent.Shutdown() + return nil, s.agent.ShutdownAgent() } func (s *HTTPServer) AgentForceLeave(resp http.ResponseWriter, req *http.Request) (interface{}, error) { diff --git a/agent/testagent.go b/agent/testagent.go index 4d77eaf696..e6a7272320 100644 --- a/agent/testagent.go +++ b/agent/testagent.go @@ -156,7 +156,8 @@ func (a *TestAgent) Start() *TestAgent { fmt.Println(id, a.Name, "Error starting agent:", err) runtime.Goexit() } else { - agent.Shutdown() + agent.ShutdownAgent() + agent.ShutdownEndpoints() wait := time.Duration(rand.Int31n(2000)) * time.Millisecond fmt.Println(id, a.Name, "retrying in", wait) time.Sleep(wait) @@ -221,7 +222,10 @@ func (a *TestAgent) Shutdown() error { os.RemoveAll(a.DataDir) } }() - return a.Agent.Shutdown() + + // shutdown agent before endpoints + defer a.Agent.ShutdownEndpoints() + return a.Agent.ShutdownAgent() } func (a *TestAgent) HTTPAddr() string { diff --git a/command/agent.go b/command/agent.go index a1da848c5f..de2fc42a5c 100644 --- a/command/agent.go +++ b/command/agent.go @@ -691,7 +691,10 @@ func (cmd *AgentCommand) run(args []string) int { cmd.UI.Error(fmt.Sprintf("Error starting agent: %s", err)) return 1 } - defer agent.Shutdown() + + // shutdown agent before endpoints + defer agent.ShutdownEndpoints() + defer agent.ShutdownAgent() if !config.DisableUpdateCheck { cmd.startupUpdateCheck(config)