diff --git a/command/agent/agent.go b/command/agent/agent.go index ea3c515079..19a0a6303b 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -12,6 +12,7 @@ import ( "regexp" "strconv" "sync" + "time" "github.com/hashicorp/consul/consul" "github.com/hashicorp/consul/consul/structs" @@ -23,7 +24,8 @@ const ( servicesDir = "services" // Path to save local agent checks - checksDir = "checks" + checksDir = "checks" + checkStateDir = "checks/state" // The ID of the faux health checks for maintenance mode serviceMaintCheckPrefix = "_service_maintenance" @@ -757,6 +759,13 @@ func (a *Agent) AddCheck(check *structs.HealthCheck, chkType *CheckType, persist TTL: chkType.TTL, Logger: a.logger, } + + // Restore persisted state, if any + if err := a.recallCheckState(check); err != nil { + a.logger.Printf("[WARN] agent: failed restoring state for check %q: %s", + check.CheckID, err) + } + ttl.Start() a.checkTTLs[check.CheckID] = ttl @@ -861,6 +870,75 @@ func (a *Agent) UpdateCheck(checkID, status, output string) error { // Set the status through CheckTTL to reset the TTL check.SetStatus(status, output) + + // Always persist the state for TTL checks + if err := a.persistCheckState(check, status, output); err != nil { + return fmt.Errorf("failed persisting state for check %q: %s", checkID, err) + } + + return nil +} + +// persistCheckState is used to record the check status into the data dir. +// This allows the state to be restored on a later agent start. Currently +// only useful for TTL based checks. 
+func (a *Agent) persistCheckState(check *CheckTTL, status, output string) error { + // Create the persisted state + state := persistedCheckState{ + CheckID: check.CheckID, + Status: status, + Output: output, + Expires: time.Now().Add(check.TTL).Unix(), + } + + // Encode the state + buf, err := json.Marshal(state) + if err != nil { + return err + } + + // Create the state dir if it doesn't exist + dir := filepath.Join(a.config.DataDir, checkStateDir) + if err := os.MkdirAll(dir, 0700); err != nil { + return fmt.Errorf("failed creating check state dir %q: %s", dir, err) + } + + // Write the state to the file + file := filepath.Join(dir, stringHash(check.CheckID)) + if err := ioutil.WriteFile(file, buf, 0600); err != nil { + return fmt.Errorf("failed writing file %q: %s", file, err) + } + + return nil +} + +// recallCheckState is used to restore the persisted state of a check. +func (a *Agent) recallCheckState(check *structs.HealthCheck) error { + // Try to read the persisted state for this check + file := filepath.Join(a.config.DataDir, checkStateDir, stringHash(check.CheckID)) + buf, err := ioutil.ReadFile(file) + if err != nil { + if os.IsNotExist(err) { + return nil + } + return fmt.Errorf("failed reading file %q: %s", file, err) + } + + // Decode the state data + var p persistedCheckState + if err := json.Unmarshal(buf, &p); err != nil { + return fmt.Errorf("failed decoding check state: %s", err) + } + + // Check if the state has expired + if time.Now().Unix() > p.Expires { + a.logger.Printf("[DEBUG] agent: check state expired for %q, not restoring", check.CheckID) + return nil + } + + // Restore the fields from the state + check.Output = p.Output + check.Status = p.Status return nil } diff --git a/command/agent/check.go b/command/agent/check.go index 66578db8e7..6677483886 100644 --- a/command/agent/check.go +++ b/command/agent/check.go @@ -266,6 +266,17 @@ type persistedCheck struct { Token string } +// persistedCheckState is used to persist the current state 
// persistedCheckState is the on-disk record of a check's current state, as
// opposed to the check's definition. It carries an expiration timestamp so
// that a later agent start can tell whether the recorded state is stale.
type persistedCheckState struct {
	// CheckID identifies the check this state belongs to.
	CheckID string

	// Output is the check output recorded with the status.
	Output string

	// Status is the check status that was recorded.
	Status string

	// Expires is the Unix timestamp, in seconds, after which the recorded
	// state is no longer restored.
	Expires int64
}

// CheckHTTP is used to periodically make an HTTP request to
// determine the health of a given check.
// The check is passing if the response code is 2XX.