mirror of https://github.com/hashicorp/consul
Merge pull request #1009 from hashicorp/f-persist-status
Persist check state for TTL checks
pull/1016/head
commit
6fe5941278
|
@ -12,6 +12,7 @@ import (
|
||||||
"regexp"
|
"regexp"
|
||||||
"strconv"
|
"strconv"
|
||||||
"sync"
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
"github.com/hashicorp/consul/consul"
|
"github.com/hashicorp/consul/consul"
|
||||||
"github.com/hashicorp/consul/consul/structs"
|
"github.com/hashicorp/consul/consul/structs"
|
||||||
|
@ -24,6 +25,7 @@ const (
|
||||||
|
|
||||||
// Path to save local agent checks
|
// Path to save local agent checks
|
||||||
checksDir = "checks"
|
checksDir = "checks"
|
||||||
|
checkStateDir = "checks/state"
|
||||||
|
|
||||||
// The ID of the faux health checks for maintenance mode
|
// The ID of the faux health checks for maintenance mode
|
||||||
serviceMaintCheckPrefix = "_service_maintenance"
|
serviceMaintCheckPrefix = "_service_maintenance"
|
||||||
|
@ -757,6 +759,13 @@ func (a *Agent) AddCheck(check *structs.HealthCheck, chkType *CheckType, persist
|
||||||
TTL: chkType.TTL,
|
TTL: chkType.TTL,
|
||||||
Logger: a.logger,
|
Logger: a.logger,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Restore persisted state, if any
|
||||||
|
if err := a.loadCheckState(check); err != nil {
|
||||||
|
a.logger.Printf("[WARN] agent: failed restoring state for check %q: %s",
|
||||||
|
check.CheckID, err)
|
||||||
|
}
|
||||||
|
|
||||||
ttl.Start()
|
ttl.Start()
|
||||||
a.checkTTLs[check.CheckID] = ttl
|
a.checkTTLs[check.CheckID] = ttl
|
||||||
|
|
||||||
|
@ -842,7 +851,12 @@ func (a *Agent) RemoveCheck(checkID string, persist bool) error {
|
||||||
delete(a.checkTTLs, checkID)
|
delete(a.checkTTLs, checkID)
|
||||||
}
|
}
|
||||||
if persist {
|
if persist {
|
||||||
return a.purgeCheck(checkID)
|
if err := a.purgeCheck(checkID); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := a.purgeCheckState(checkID); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
}
|
}
|
||||||
log.Printf("[DEBUG] agent: removed check %q", checkID)
|
log.Printf("[DEBUG] agent: removed check %q", checkID)
|
||||||
return nil
|
return nil
|
||||||
|
@ -861,9 +875,88 @@ func (a *Agent) UpdateCheck(checkID, status, output string) error {
|
||||||
|
|
||||||
// Set the status through CheckTTL to reset the TTL
|
// Set the status through CheckTTL to reset the TTL
|
||||||
check.SetStatus(status, output)
|
check.SetStatus(status, output)
|
||||||
|
|
||||||
|
// Always persist the state for TTL checks
|
||||||
|
if err := a.persistCheckState(check, status, output); err != nil {
|
||||||
|
return fmt.Errorf("failed persisting state for check %q: %s", checkID, err)
|
||||||
|
}
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// persistCheckState is used to record the check status into the data dir.
|
||||||
|
// This allows the state to be restored on a later agent start. Currently
|
||||||
|
// only useful for TTL based checks.
|
||||||
|
func (a *Agent) persistCheckState(check *CheckTTL, status, output string) error {
|
||||||
|
// Create the persisted state
|
||||||
|
state := persistedCheckState{
|
||||||
|
CheckID: check.CheckID,
|
||||||
|
Status: status,
|
||||||
|
Output: output,
|
||||||
|
Expires: time.Now().Add(check.TTL).Unix(),
|
||||||
|
}
|
||||||
|
|
||||||
|
// Encode the state
|
||||||
|
buf, err := json.Marshal(state)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create the state dir if it doesn't exist
|
||||||
|
dir := filepath.Join(a.config.DataDir, checkStateDir)
|
||||||
|
if err := os.MkdirAll(dir, 0700); err != nil {
|
||||||
|
return fmt.Errorf("failed creating check state dir %q: %s", dir, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write the state to the file
|
||||||
|
file := filepath.Join(dir, stringHash(check.CheckID))
|
||||||
|
if err := ioutil.WriteFile(file, buf, 0600); err != nil {
|
||||||
|
return fmt.Errorf("failed writing file %q: %s", file, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// loadCheckState is used to restore the persisted state of a check.
|
||||||
|
func (a *Agent) loadCheckState(check *structs.HealthCheck) error {
|
||||||
|
// Try to read the persisted state for this check
|
||||||
|
file := filepath.Join(a.config.DataDir, checkStateDir, stringHash(check.CheckID))
|
||||||
|
buf, err := ioutil.ReadFile(file)
|
||||||
|
if err != nil {
|
||||||
|
if os.IsNotExist(err) {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return fmt.Errorf("failed reading file %q: %s", file, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Decode the state data
|
||||||
|
var p persistedCheckState
|
||||||
|
if err := json.Unmarshal(buf, &p); err != nil {
|
||||||
|
return fmt.Errorf("failed decoding check state: %s", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if the state has expired
|
||||||
|
if time.Now().Unix() >= p.Expires {
|
||||||
|
a.logger.Printf("[DEBUG] agent: check state expired for %q, not restoring", check.CheckID)
|
||||||
|
return a.purgeCheckState(check.CheckID)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Restore the fields from the state
|
||||||
|
check.Output = p.Output
|
||||||
|
check.Status = p.Status
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// purgeCheckState is used to purge the state of a check from the data dir
|
||||||
|
func (a *Agent) purgeCheckState(checkID string) error {
|
||||||
|
file := filepath.Join(a.config.DataDir, checkStateDir, stringHash(checkID))
|
||||||
|
err := os.Remove(file)
|
||||||
|
if os.IsNotExist(err) {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
// Stats is used to get various debugging state from the sub-systems
|
// Stats is used to get various debugging state from the sub-systems
|
||||||
func (a *Agent) Stats() map[string]map[string]string {
|
func (a *Agent) Stats() map[string]map[string]string {
|
||||||
toString := func(v uint64) string {
|
toString := func(v uint64) string {
|
||||||
|
|
|
@ -459,6 +459,49 @@ func TestAgent_AddCheck_MissingService(t *testing.T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestAgent_AddCheck_RestoreState(t *testing.T) {
|
||||||
|
dir, agent := makeAgent(t, nextConfig())
|
||||||
|
defer os.RemoveAll(dir)
|
||||||
|
defer agent.Shutdown()
|
||||||
|
|
||||||
|
// Create some state and persist it
|
||||||
|
ttl := &CheckTTL{
|
||||||
|
CheckID: "baz",
|
||||||
|
TTL: time.Minute,
|
||||||
|
}
|
||||||
|
err := agent.persistCheckState(ttl, structs.HealthPassing, "yup")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Build and register the check definition and initial state
|
||||||
|
health := &structs.HealthCheck{
|
||||||
|
Node: "foo",
|
||||||
|
CheckID: "baz",
|
||||||
|
Name: "baz check 1",
|
||||||
|
}
|
||||||
|
chk := &CheckType{
|
||||||
|
TTL: time.Minute,
|
||||||
|
}
|
||||||
|
err = agent.AddCheck(health, chk, false, "")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure the check status was restored during registration
|
||||||
|
checks := agent.state.Checks()
|
||||||
|
check, ok := checks["baz"]
|
||||||
|
if !ok {
|
||||||
|
t.Fatalf("missing check")
|
||||||
|
}
|
||||||
|
if check.Status != structs.HealthPassing {
|
||||||
|
t.Fatalf("bad: %#v", check)
|
||||||
|
}
|
||||||
|
if check.Output != "yup" {
|
||||||
|
t.Fatalf("bad: %#v", check)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestAgent_RemoveCheck(t *testing.T) {
|
func TestAgent_RemoveCheck(t *testing.T) {
|
||||||
dir, agent := makeAgent(t, nextConfig())
|
dir, agent := makeAgent(t, nextConfig())
|
||||||
defer os.RemoveAll(dir)
|
defer os.RemoveAll(dir)
|
||||||
|
@ -1349,3 +1392,146 @@ func TestAgent_loadChecks_checkFails(t *testing.T) {
|
||||||
t.Fatalf("should have purged check")
|
t.Fatalf("should have purged check")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestAgent_persistCheckState(t *testing.T) {
|
||||||
|
config := nextConfig()
|
||||||
|
dir, agent := makeAgent(t, config)
|
||||||
|
defer os.RemoveAll(dir)
|
||||||
|
defer agent.Shutdown()
|
||||||
|
|
||||||
|
// Create the TTL check to persist
|
||||||
|
check := &CheckTTL{
|
||||||
|
CheckID: "check1",
|
||||||
|
TTL: 10 * time.Minute,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Persist some check state for the check
|
||||||
|
err := agent.persistCheckState(check, structs.HealthCritical, "nope")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check the persisted file exists and has the content
|
||||||
|
file := filepath.Join(agent.config.DataDir, checkStateDir, stringHash("check1"))
|
||||||
|
buf, err := ioutil.ReadFile(file)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Decode the state
|
||||||
|
var p persistedCheckState
|
||||||
|
if err := json.Unmarshal(buf, &p); err != nil {
|
||||||
|
t.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check the fields
|
||||||
|
if p.CheckID != "check1" {
|
||||||
|
t.Fatalf("bad: %#v", p)
|
||||||
|
}
|
||||||
|
if p.Output != "nope" {
|
||||||
|
t.Fatalf("bad: %#v", p)
|
||||||
|
}
|
||||||
|
if p.Status != structs.HealthCritical {
|
||||||
|
t.Fatalf("bad: %#v", p)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check the expiration time was set
|
||||||
|
if p.Expires < time.Now().Unix() {
|
||||||
|
t.Fatalf("bad: %#v", p)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestAgent_loadCheckState(t *testing.T) {
|
||||||
|
config := nextConfig()
|
||||||
|
dir, agent := makeAgent(t, config)
|
||||||
|
defer os.RemoveAll(dir)
|
||||||
|
defer agent.Shutdown()
|
||||||
|
|
||||||
|
// Create a check whose state will expire immediately
|
||||||
|
check := &CheckTTL{
|
||||||
|
CheckID: "check1",
|
||||||
|
TTL: 0,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Persist the check state
|
||||||
|
err := agent.persistCheckState(check, structs.HealthPassing, "yup")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Try to load the state
|
||||||
|
health := &structs.HealthCheck{
|
||||||
|
CheckID: "check1",
|
||||||
|
Status: structs.HealthCritical,
|
||||||
|
}
|
||||||
|
if err := agent.loadCheckState(health); err != nil {
|
||||||
|
t.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Should not have restored the status due to expiration
|
||||||
|
if health.Status != structs.HealthCritical {
|
||||||
|
t.Fatalf("bad: %#v", health)
|
||||||
|
}
|
||||||
|
if health.Output != "" {
|
||||||
|
t.Fatalf("bad: %#v", health)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Should have purged the state
|
||||||
|
file := filepath.Join(agent.config.DataDir, checksDir, stringHash("check1"))
|
||||||
|
if _, err := os.Stat(file); !os.IsNotExist(err) {
|
||||||
|
t.Fatalf("should have purged state")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Set a TTL which will not expire before we check it
|
||||||
|
check.TTL = time.Minute
|
||||||
|
err = agent.persistCheckState(check, structs.HealthPassing, "yup")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Try to load
|
||||||
|
if err := agent.loadCheckState(health); err != nil {
|
||||||
|
t.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Should have restored
|
||||||
|
if health.Status != structs.HealthPassing {
|
||||||
|
t.Fatalf("bad: %#v", health)
|
||||||
|
}
|
||||||
|
if health.Output != "yup" {
|
||||||
|
t.Fatalf("bad: %#v", health)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestAgent_purgeCheckState(t *testing.T) {
|
||||||
|
config := nextConfig()
|
||||||
|
dir, agent := makeAgent(t, config)
|
||||||
|
defer os.RemoveAll(dir)
|
||||||
|
defer agent.Shutdown()
|
||||||
|
|
||||||
|
// No error if the state does not exist
|
||||||
|
if err := agent.purgeCheckState("check1"); err != nil {
|
||||||
|
t.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Persist some state to the data dir
|
||||||
|
check := &CheckTTL{
|
||||||
|
CheckID: "check1",
|
||||||
|
TTL: time.Minute,
|
||||||
|
}
|
||||||
|
err := agent.persistCheckState(check, structs.HealthPassing, "yup")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Purge the check state
|
||||||
|
if err := agent.purgeCheckState("check1"); err != nil {
|
||||||
|
t.Fatalf("err: %s", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Removed the file
|
||||||
|
file := filepath.Join(agent.config.DataDir, checkStateDir, stringHash("check1"))
|
||||||
|
if _, err := os.Stat(file); !os.IsNotExist(err) {
|
||||||
|
t.Fatalf("should have removed file")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -266,6 +266,17 @@ type persistedCheck struct {
|
||||||
Token string
|
Token string
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// persistedCheckState captures the current state of a single check for
// storage on disk. Unlike the check definition, it records the last status
// and output, together with an expiration timestamp that is used to decide
// whether the record is stale on a later agent restart.
type persistedCheckState struct {
	// CheckID identifies the check this state belongs to.
	CheckID string

	// Output is the last output recorded for the check.
	Output string

	// Status is the last status recorded for the check.
	Status string

	// Expires is the Unix timestamp, in seconds, at or after which this
	// record is considered stale.
	Expires int64
}
|
||||||
|
|
||||||
// CheckHTTP is used to periodically make an HTTP request to
|
// CheckHTTP is used to periodically make an HTTP request to
|
||||||
// determine the health of a given check.
|
// determine the health of a given check.
|
||||||
// The check is passing if the response code is 2XX.
|
// The check is passing if the response code is 2XX.
|
||||||
|
|
|
@ -37,7 +37,10 @@ There are three different kinds of checks:
|
||||||
set to the failed state. This mechanism, conceptually similar to a dead man's switch,
|
set to the failed state. This mechanism, conceptually similar to a dead man's switch,
|
||||||
relies on the application to directly report its health. For example, a healthy app
|
relies on the application to directly report its health. For example, a healthy app
|
||||||
can periodically `PUT` a status update to the HTTP endpoint; if the app fails, the TTL will
|
can periodically `PUT` a status update to the HTTP endpoint; if the app fails, the TTL will
|
||||||
expire and the health check enters a critical state.
|
expire and the health check enters a critical state. TTL checks also persist
|
||||||
|
their last known status to disk. This allows the Consul agent to restore the
|
||||||
|
last known status of the check across restarts. Persisted check status is
|
||||||
|
valid until the TTL elapses, measured from the time of the last status update.
|
||||||
|
|
||||||
## Check Definition
|
## Check Definition
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue