Browse Source

agent: fix 'consul leave' shutdown race (#2880)

When the agent is triggered to shutdown via an external 'consul leave'
command delivered via the HTTP API then the client expects to receive a
response when the agent is down. This creates a race on when to shutdown
the agent itself like the RPC server, the checks and the state and the
external endpoints like DNS and HTTP.

This patch splits the shutdown process into two parts:

 * shutdown the agent
 * shutdown the endpoints (http and dns)

They can be executed multiple times, concurrently and in any order but
should be executed first agent, then endpoints to provide consistent
behavior across all use cases. Both calls have to be executed for a
proper shutdown.

This could be partially hidden in a single function but would introduce
some magic that happens behind the scenes which one has to know of but
isn't obvious.

Fixes #2880
pull/3164/head
Frank Schroeder 8 years ago committed by Frank Schröder
parent
commit
ea5b0f2c7c
  1. 64
      agent/agent.go
  2. 2
      agent/agent_endpoint.go
  3. 8
      agent/testagent.go
  4. 5
      command/agent.go

64
agent/agent.go

@ -1112,9 +1112,10 @@ func (a *Agent) Leave() error {
return a.delegate.Leave()
}
// Shutdown is used to hard stop the agent. Should be
// preceded by a call to Leave to do it gracefully.
func (a *Agent) Shutdown() error {
// ShutdownAgent is used to hard stop the agent. Should be preceded by
// Leave to do it gracefully. Should be followed by ShutdownEndpoints to
// terminate the HTTP and DNS servers as well.
func (a *Agent) ShutdownAgent() error {
a.shutdownLock.Lock()
defer a.shutdownLock.Unlock()
@ -1123,29 +1124,6 @@ func (a *Agent) Shutdown() error {
}
a.logger.Println("[INFO] agent: Requesting shutdown")
// Stop all API endpoints
for _, srv := range a.dnsServers {
a.logger.Printf("[INFO] agent: Stopping DNS server %s (%s)", srv.Server.Addr, srv.Server.Net)
srv.Shutdown()
}
for _, srv := range a.httpServers {
// http server is HTTPS if TLSConfig is not nil and NextProtos does not only contain "h2"
// the latter seems to be a side effect of HTTP/2 support in go 1.8. TLSConfig != nil is
// no longer sufficient to check for an HTTPS server.
a.logger.Printf("[INFO] agent: Stopping %s server %s", strings.ToUpper(srv.proto), srv.Addr)
// graceful shutdown
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
defer cancel()
srv.Shutdown(ctx)
if ctx.Err() == context.DeadlineExceeded {
a.logger.Printf("[WARN] agent: Timeout stopping %s server %s", strings.ToUpper(srv.proto), srv.Addr)
}
}
a.logger.Println("[INFO] agent: Waiting for endpoints to shut down")
a.wgServers.Wait()
a.logger.Print("[INFO] agent: Endpoints down")
// Stop all the checks
a.checkLock.Lock()
defer a.checkLock.Unlock()
@ -1155,11 +1133,9 @@ func (a *Agent) Shutdown() error {
for _, chk := range a.checkTTLs {
chk.Stop()
}
for _, chk := range a.checkHTTPs {
chk.Stop()
}
for _, chk := range a.checkTCPs {
chk.Stop()
}
@ -1185,6 +1161,38 @@ func (a *Agent) Shutdown() error {
return err
}
// ShutdownEndpoints terminates the HTTP and DNS servers. Should be
// preceeded by ShutdownAgent.
func (a *Agent) ShutdownEndpoints() {
a.shutdownLock.Lock()
defer a.shutdownLock.Unlock()
if len(a.dnsServers) == 0 || len(a.httpServers) == 0 {
return
}
for _, srv := range a.dnsServers {
a.logger.Printf("[INFO] agent: Stopping DNS server %s (%s)", srv.Server.Addr, srv.Server.Net)
srv.Shutdown()
}
a.dnsServers = nil
for _, srv := range a.httpServers {
a.logger.Printf("[INFO] agent: Stopping %s server %s", strings.ToUpper(srv.proto), srv.Addr)
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
defer cancel()
srv.Shutdown(ctx)
if ctx.Err() == context.DeadlineExceeded {
a.logger.Printf("[WARN] agent: Timeout stopping %s server %s", strings.ToUpper(srv.proto), srv.Addr)
}
}
a.httpServers = nil
a.logger.Println("[INFO] agent: Waiting for endpoints to shut down")
a.wgServers.Wait()
a.logger.Print("[INFO] agent: Endpoints down")
}
// ReloadCh is used to return a channel that can be
// used for triggering reloads and returning a response.
func (a *Agent) ReloadCh() chan chan error {

2
agent/agent_endpoint.go

@ -199,7 +199,7 @@ func (s *HTTPServer) AgentLeave(resp http.ResponseWriter, req *http.Request) (in
if err := s.agent.Leave(); err != nil {
return nil, err
}
return nil, s.agent.Shutdown()
return nil, s.agent.ShutdownAgent()
}
func (s *HTTPServer) AgentForceLeave(resp http.ResponseWriter, req *http.Request) (interface{}, error) {

8
agent/testagent.go

@ -156,7 +156,8 @@ func (a *TestAgent) Start() *TestAgent {
fmt.Println(id, a.Name, "Error starting agent:", err)
runtime.Goexit()
} else {
agent.Shutdown()
agent.ShutdownAgent()
agent.ShutdownEndpoints()
wait := time.Duration(rand.Int31n(2000)) * time.Millisecond
fmt.Println(id, a.Name, "retrying in", wait)
time.Sleep(wait)
@ -221,7 +222,10 @@ func (a *TestAgent) Shutdown() error {
os.RemoveAll(a.DataDir)
}
}()
return a.Agent.Shutdown()
// shutdown agent before endpoints
defer a.Agent.ShutdownEndpoints()
return a.Agent.ShutdownAgent()
}
func (a *TestAgent) HTTPAddr() string {

5
command/agent.go

@ -691,7 +691,10 @@ func (cmd *AgentCommand) run(args []string) int {
cmd.UI.Error(fmt.Sprintf("Error starting agent: %s", err))
return 1
}
defer agent.Shutdown()
// shutdown agent before endpoints
defer agent.ShutdownEndpoints()
defer agent.ShutdownAgent()
if !config.DisableUpdateCheck {
cmd.startupUpdateCheck(config)

Loading…
Cancel
Save