Merge pull request #1009 from hashicorp/f-persist-status

Persist check state for TTL checks
pull/1016/head
Ryan Uber 2015-06-08 09:35:22 -07:00
commit 6fe5941278
4 changed files with 296 additions and 3 deletions

View File

@ -12,6 +12,7 @@ import (
"regexp" "regexp"
"strconv" "strconv"
"sync" "sync"
"time"
"github.com/hashicorp/consul/consul" "github.com/hashicorp/consul/consul"
"github.com/hashicorp/consul/consul/structs" "github.com/hashicorp/consul/consul/structs"
@ -24,6 +25,7 @@ const (
// Path to save local agent checks // Path to save local agent checks
checksDir = "checks" checksDir = "checks"
checkStateDir = "checks/state"
// The ID of the faux health checks for maintenance mode // The ID of the faux health checks for maintenance mode
serviceMaintCheckPrefix = "_service_maintenance" serviceMaintCheckPrefix = "_service_maintenance"
@ -757,6 +759,13 @@ func (a *Agent) AddCheck(check *structs.HealthCheck, chkType *CheckType, persist
TTL: chkType.TTL, TTL: chkType.TTL,
Logger: a.logger, Logger: a.logger,
} }
// Restore persisted state, if any
if err := a.loadCheckState(check); err != nil {
a.logger.Printf("[WARN] agent: failed restoring state for check %q: %s",
check.CheckID, err)
}
ttl.Start() ttl.Start()
a.checkTTLs[check.CheckID] = ttl a.checkTTLs[check.CheckID] = ttl
@ -842,7 +851,12 @@ func (a *Agent) RemoveCheck(checkID string, persist bool) error {
delete(a.checkTTLs, checkID) delete(a.checkTTLs, checkID)
} }
if persist { if persist {
return a.purgeCheck(checkID) if err := a.purgeCheck(checkID); err != nil {
return err
}
if err := a.purgeCheckState(checkID); err != nil {
return err
}
} }
log.Printf("[DEBUG] agent: removed check %q", checkID) log.Printf("[DEBUG] agent: removed check %q", checkID)
return nil return nil
@ -861,9 +875,88 @@ func (a *Agent) UpdateCheck(checkID, status, output string) error {
// Set the status through CheckTTL to reset the TTL // Set the status through CheckTTL to reset the TTL
check.SetStatus(status, output) check.SetStatus(status, output)
// Always persist the state for TTL checks
if err := a.persistCheckState(check, status, output); err != nil {
return fmt.Errorf("failed persisting state for check %q: %s", checkID, err)
}
return nil return nil
} }
// persistCheckState is used to record the check status into the data dir.
// This allows the state to be restored on a later agent start. Currently
// only useful for TTL based checks.
func (a *Agent) persistCheckState(check *CheckTTL, status, output string) error {
// Create the persisted state
state := persistedCheckState{
CheckID: check.CheckID,
Status: status,
Output: output,
Expires: time.Now().Add(check.TTL).Unix(),
}
// Encode the state
buf, err := json.Marshal(state)
if err != nil {
return err
}
// Create the state dir if it doesn't exist
dir := filepath.Join(a.config.DataDir, checkStateDir)
if err := os.MkdirAll(dir, 0700); err != nil {
return fmt.Errorf("failed creating check state dir %q: %s", dir, err)
}
// Write the state to the file
file := filepath.Join(dir, stringHash(check.CheckID))
if err := ioutil.WriteFile(file, buf, 0600); err != nil {
return fmt.Errorf("failed writing file %q: %s", file, err)
}
return nil
}
// loadCheckState is used to restore the persisted state of a check.
func (a *Agent) loadCheckState(check *structs.HealthCheck) error {
// Try to read the persisted state for this check
file := filepath.Join(a.config.DataDir, checkStateDir, stringHash(check.CheckID))
buf, err := ioutil.ReadFile(file)
if err != nil {
if os.IsNotExist(err) {
return nil
}
return fmt.Errorf("failed reading file %q: %s", file, err)
}
// Decode the state data
var p persistedCheckState
if err := json.Unmarshal(buf, &p); err != nil {
return fmt.Errorf("failed decoding check state: %s", err)
}
// Check if the state has expired
if time.Now().Unix() >= p.Expires {
a.logger.Printf("[DEBUG] agent: check state expired for %q, not restoring", check.CheckID)
return a.purgeCheckState(check.CheckID)
}
// Restore the fields from the state
check.Output = p.Output
check.Status = p.Status
return nil
}
// purgeCheckState is used to purge the state of a check from the data dir
func (a *Agent) purgeCheckState(checkID string) error {
file := filepath.Join(a.config.DataDir, checkStateDir, stringHash(checkID))
err := os.Remove(file)
if os.IsNotExist(err) {
return nil
}
return err
}
// Stats is used to get various debugging state from the sub-systems // Stats is used to get various debugging state from the sub-systems
func (a *Agent) Stats() map[string]map[string]string { func (a *Agent) Stats() map[string]map[string]string {
toString := func(v uint64) string { toString := func(v uint64) string {

View File

@ -459,6 +459,49 @@ func TestAgent_AddCheck_MissingService(t *testing.T) {
} }
} }
func TestAgent_AddCheck_RestoreState(t *testing.T) {
dir, agent := makeAgent(t, nextConfig())
defer os.RemoveAll(dir)
defer agent.Shutdown()
// Create some state and persist it
ttl := &CheckTTL{
CheckID: "baz",
TTL: time.Minute,
}
err := agent.persistCheckState(ttl, structs.HealthPassing, "yup")
if err != nil {
t.Fatalf("err: %s", err)
}
// Build and register the check definition and initial state
health := &structs.HealthCheck{
Node: "foo",
CheckID: "baz",
Name: "baz check 1",
}
chk := &CheckType{
TTL: time.Minute,
}
err = agent.AddCheck(health, chk, false, "")
if err != nil {
t.Fatalf("err: %s", err)
}
// Ensure the check status was restored during registration
checks := agent.state.Checks()
check, ok := checks["baz"]
if !ok {
t.Fatalf("missing check")
}
if check.Status != structs.HealthPassing {
t.Fatalf("bad: %#v", check)
}
if check.Output != "yup" {
t.Fatalf("bad: %#v", check)
}
}
func TestAgent_RemoveCheck(t *testing.T) { func TestAgent_RemoveCheck(t *testing.T) {
dir, agent := makeAgent(t, nextConfig()) dir, agent := makeAgent(t, nextConfig())
defer os.RemoveAll(dir) defer os.RemoveAll(dir)
@ -1349,3 +1392,146 @@ func TestAgent_loadChecks_checkFails(t *testing.T) {
t.Fatalf("should have purged check") t.Fatalf("should have purged check")
} }
} }
func TestAgent_persistCheckState(t *testing.T) {
config := nextConfig()
dir, agent := makeAgent(t, config)
defer os.RemoveAll(dir)
defer agent.Shutdown()
// Create the TTL check to persist
check := &CheckTTL{
CheckID: "check1",
TTL: 10 * time.Minute,
}
// Persist some check state for the check
err := agent.persistCheckState(check, structs.HealthCritical, "nope")
if err != nil {
t.Fatalf("err: %s", err)
}
// Check the persisted file exists and has the content
file := filepath.Join(agent.config.DataDir, checkStateDir, stringHash("check1"))
buf, err := ioutil.ReadFile(file)
if err != nil {
t.Fatalf("err: %s", err)
}
// Decode the state
var p persistedCheckState
if err := json.Unmarshal(buf, &p); err != nil {
t.Fatalf("err: %s", err)
}
// Check the fields
if p.CheckID != "check1" {
t.Fatalf("bad: %#v", p)
}
if p.Output != "nope" {
t.Fatalf("bad: %#v", p)
}
if p.Status != structs.HealthCritical {
t.Fatalf("bad: %#v", p)
}
// Check the expiration time was set
if p.Expires < time.Now().Unix() {
t.Fatalf("bad: %#v", p)
}
}
func TestAgent_loadCheckState(t *testing.T) {
config := nextConfig()
dir, agent := makeAgent(t, config)
defer os.RemoveAll(dir)
defer agent.Shutdown()
// Create a check whose state will expire immediately
check := &CheckTTL{
CheckID: "check1",
TTL: 0,
}
// Persist the check state
err := agent.persistCheckState(check, structs.HealthPassing, "yup")
if err != nil {
t.Fatalf("err: %s", err)
}
// Try to load the state
health := &structs.HealthCheck{
CheckID: "check1",
Status: structs.HealthCritical,
}
if err := agent.loadCheckState(health); err != nil {
t.Fatalf("err: %s", err)
}
// Should not have restored the status due to expiration
if health.Status != structs.HealthCritical {
t.Fatalf("bad: %#v", health)
}
if health.Output != "" {
t.Fatalf("bad: %#v", health)
}
// Should have purged the state
file := filepath.Join(agent.config.DataDir, checksDir, stringHash("check1"))
if _, err := os.Stat(file); !os.IsNotExist(err) {
t.Fatalf("should have purged state")
}
// Set a TTL which will not expire before we check it
check.TTL = time.Minute
err = agent.persistCheckState(check, structs.HealthPassing, "yup")
if err != nil {
t.Fatalf("err: %s", err)
}
// Try to load
if err := agent.loadCheckState(health); err != nil {
t.Fatalf("err: %s", err)
}
// Should have restored
if health.Status != structs.HealthPassing {
t.Fatalf("bad: %#v", health)
}
if health.Output != "yup" {
t.Fatalf("bad: %#v", health)
}
}
func TestAgent_purgeCheckState(t *testing.T) {
config := nextConfig()
dir, agent := makeAgent(t, config)
defer os.RemoveAll(dir)
defer agent.Shutdown()
// No error if the state does not exist
if err := agent.purgeCheckState("check1"); err != nil {
t.Fatalf("err: %s", err)
}
// Persist some state to the data dir
check := &CheckTTL{
CheckID: "check1",
TTL: time.Minute,
}
err := agent.persistCheckState(check, structs.HealthPassing, "yup")
if err != nil {
t.Fatalf("err: %s", err)
}
// Purge the check state
if err := agent.purgeCheckState("check1"); err != nil {
t.Fatalf("err: %s", err)
}
// Removed the file
file := filepath.Join(agent.config.DataDir, checkStateDir, stringHash("check1"))
if _, err := os.Stat(file); !os.IsNotExist(err) {
t.Fatalf("should have removed file")
}
}

View File

@ -266,6 +266,17 @@ type persistedCheck struct {
Token string Token string
} }
// persistedCheckState is used to persist the current state of a given
// check. This is different from the check definition, and includes an
// expiration timestamp which is used to determine staleness on later
// agent restarts.
type persistedCheckState struct {
CheckID string
Output string
Status string
Expires int64
}
// CheckHTTP is used to periodically make an HTTP request to // CheckHTTP is used to periodically make an HTTP request to
// determine the health of a given check. // determine the health of a given check.
// The check is passing if the response code is 2XX. // The check is passing if the response code is 2XX.

View File

@ -37,7 +37,10 @@ There are three different kinds of checks:
set to the failed state. This mechanism, conceptually similar to a dead man's switch, set to the failed state. This mechanism, conceptually similar to a dead man's switch,
relies on the application to directly report its health. For example, a healthy app relies on the application to directly report its health. For example, a healthy app
can periodically `PUT` a status update to the HTTP endpoint; if the app fails, the TTL will can periodically `PUT` a status update to the HTTP endpoint; if the app fails, the TTL will
expire and the health check enters a critical state. expire and the health check enters a critical state. TTL checks also persist
their last known status to disk. This allows the Consul agent to restore the
last known status of the check across restarts. Persisted check status is
valid through the end of the TTL from the time of the last check.
## Check Definition ## Check Definition