From 4179aacf11b73ab0b5700df269923c1eb8ea6fa5 Mon Sep 17 00:00:00 2001 From: Seth Vargo Date: Tue, 29 Nov 2016 16:15:20 -0500 Subject: [PATCH] Add an API method for determining the best status Given a list of HealthChecks, this determines the "best" status for the collective group. This is useful for nodes and services, which may have multiple checks associated with them. --- api/health.go | 57 ++++++++++++ api/health_test.go | 133 +++++++++++++++++++++++++++ command/agent/agent.go | 14 +-- command/agent/agent_endpoint_test.go | 6 +- command/agent/agent_test.go | 8 +- consul/structs/structs.go | 12 ++- 6 files changed, 213 insertions(+), 17 deletions(-) diff --git a/api/health.go b/api/health.go index 74da949c8d..94fbfc46e2 100644 --- a/api/health.go +++ b/api/health.go @@ -2,6 +2,8 @@ package api import ( "fmt" + "log" + "strings" ) const ( @@ -11,6 +13,15 @@ const ( HealthPassing = "passing" HealthWarning = "warning" HealthCritical = "critical" + HealthMaint = "maintenance" +) + +const ( + // NodeMaint is the special key set by a node in maintenance mode. + NodeMaint = "_node_maintenance" + + // ServiceMaintPrefix is the prefix for a service in maintenance mode. + ServiceMaintPrefix = "_service_maintenance:" ) // HealthCheck is used to represent a single check @@ -25,6 +36,52 @@ type HealthCheck struct { ServiceName string } +// HealthChecks is a collection of HealthCheck structs. +type HealthChecks []*HealthCheck + +// AggregatedStatus returns the "best" status for the list of health checks. +// Because a given entry may have many service and node-level health checks +// attached, this function determines the best representative of the status as +// as single string using the following heuristic: +// +// maintenance > critical > warning > passing +// +func (c HealthChecks) AggregatedStatus() string { + var passing, warning, critical, maintenance bool + for _, check := range c { + id := string(check.CheckID) + if id == NodeMaint || strings.HasPrefix(id, ServiceMaintPrefix) { + maintenance = true + continue + } + + switch check.Status { + case HealthPassing: + passing = true + case HealthWarning: + warning = true + case HealthCritical: + critical = true + default: + log.Printf("[WARN] unknown status %q", check.Status) + return "" + } + } + + switch { + case maintenance: + return HealthMaint + case critical: + return HealthCritical + case warning: + return HealthWarning + case passing: + return HealthPassing + default: + return HealthPassing + } +} + // ServiceEntry is used for the health service endpoint type ServiceEntry struct { Node *Node diff --git a/api/health_test.go b/api/health_test.go index 1d3247b18e..e971d8023b 100644 --- a/api/health_test.go +++ b/api/health_test.go @@ -38,6 +38,139 @@ func TestHealth_Node(t *testing.T) { }) } +func TestHealthChecks_AggregatedStatus(t *testing.T) { + t.Parallel() + + cases := []struct { + name string + checks HealthChecks + exp string + }{ + { + "empty", + nil, + HealthPassing, + }, + { + "passing", + HealthChecks{ + &HealthCheck{ + Status: HealthPassing, + }, + }, + HealthPassing, + }, + { + "warning", + HealthChecks{ + &HealthCheck{ + Status: HealthWarning, + }, + }, + HealthWarning, + }, + { + "critical", + HealthChecks{ + &HealthCheck{ + Status: HealthCritical, + }, + }, + HealthCritical, + }, + { + "node_maintenance", + HealthChecks{ + &HealthCheck{ + CheckID: NodeMaint, + }, + }, + HealthMaint, + }, + { + "service_maintenance", + HealthChecks{ + &HealthCheck{ + CheckID: ServiceMaintPrefix + "service", + }, + }, + HealthMaint, + }, + { + "unknown", + HealthChecks{ + &HealthCheck{ + Status: "nope-nope-noper", + }, + }, + "", + }, + { + "maintenance_over_critical", + HealthChecks{ + &HealthCheck{ + CheckID: NodeMaint, + }, + &HealthCheck{ + Status: HealthCritical, + }, + }, + HealthMaint, + }, + { + "critical_over_warning", + HealthChecks{ + &HealthCheck{ + Status: HealthCritical, + }, + &HealthCheck{ + Status: HealthWarning, + }, + }, + HealthCritical, + }, + { + "warning_over_passing", + HealthChecks{ + &HealthCheck{ + Status: HealthWarning, + }, + &HealthCheck{ + Status: HealthPassing, + }, + }, + HealthWarning, + }, + { + "lots", + HealthChecks{ + &HealthCheck{ + Status: HealthPassing, + }, + &HealthCheck{ + Status: HealthPassing, + }, + &HealthCheck{ + Status: HealthPassing, + }, + &HealthCheck{ + Status: HealthWarning, + }, + }, + HealthWarning, + }, + } + + for i, tc := range cases { + t.Run(fmt.Sprintf("%d_%s", i, tc.name), func(t *testing.T) { + act := tc.checks.AggregatedStatus() + if tc.exp != act { + t.Errorf("\nexp: %#v\nact: %#v", tc.exp, act) + } + }) + } +} + func TestHealth_Checks(t *testing.T) { t.Parallel() c, s := makeClient(t) diff --git a/command/agent/agent.go b/command/agent/agent.go index f3c5461b84..11d1670bae 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -34,10 +34,6 @@ const ( checksDir = "checks" checkStateDir = "checks/state" - // The ID of the faux health checks for maintenance mode - serviceMaintCheckPrefix = "_service_maintenance" - nodeMaintCheckID = "_node_maintenance" - // Default reasons for node/service maintenance mode defaultNodeMaintReason = "Maintenance mode is enabled for this node, " + "but no reason was provided. This is a default message." @@ -1532,7 +1528,7 @@ func (a *Agent) restoreCheckState(snap map[types.CheckID]*structs.HealthCheck) { // serviceMaintCheckID returns the ID of a given service's maintenance check func serviceMaintCheckID(serviceID string) types.CheckID { - return types.CheckID(fmt.Sprintf("%s:%s", serviceMaintCheckPrefix, serviceID)) + return types.CheckID(structs.ServiceMaintPrefix + serviceID) } // EnableServiceMaintenance will register a false health check against the given @@ -1593,7 +1589,7 @@ func (a *Agent) DisableServiceMaintenance(serviceID string) error { // EnableNodeMaintenance places a node into maintenance mode. func (a *Agent) EnableNodeMaintenance(reason, token string) { // Ensure node maintenance is not already enabled - if _, ok := a.state.Checks()[nodeMaintCheckID]; ok { + if _, ok := a.state.Checks()[structs.NodeMaint]; ok { return } @@ -1605,7 +1601,7 @@ func (a *Agent) EnableNodeMaintenance(reason, token string) { // Create and register the node maintenance check check := &structs.HealthCheck{ Node: a.config.NodeName, - CheckID: nodeMaintCheckID, + CheckID: structs.NodeMaint, Name: "Node Maintenance Mode", Notes: reason, Status: structs.HealthCritical, @@ -1616,10 +1612,10 @@ func (a *Agent) EnableNodeMaintenance(reason, token string) { // DisableNodeMaintenance removes a node from maintenance mode func (a *Agent) DisableNodeMaintenance() { - if _, ok := a.state.Checks()[nodeMaintCheckID]; !ok { + if _, ok := a.state.Checks()[structs.NodeMaint]; !ok { return } - a.RemoveCheck(nodeMaintCheckID, true) + a.RemoveCheck(structs.NodeMaint, true) a.logger.Printf("[INFO] agent: Node left maintenance mode") } diff --git a/command/agent/agent_endpoint_test.go b/command/agent/agent_endpoint_test.go index 1919855e56..0de3483270 100644 --- a/command/agent/agent_endpoint_test.go +++ b/command/agent/agent_endpoint_test.go @@ -926,13 +926,13 @@ func TestHTTPAgent_EnableNodeMaintenance(t *testing.T) { } // Ensure the maintenance check was registered - check, ok := srv.agent.state.Checks()[nodeMaintCheckID] + check, ok := srv.agent.state.Checks()[structs.NodeMaint] if !ok { t.Fatalf("should have registered maintenance check") } // Check that the token was used - if token := srv.agent.state.CheckToken(nodeMaintCheckID); token != "mytoken" { + if token := srv.agent.state.CheckToken(structs.NodeMaint); token != "mytoken" { t.Fatalf("expected 'mytoken', got '%s'", token) } @@ -962,7 +962,7 @@ func TestHTTPAgent_DisableNodeMaintenance(t *testing.T) { } // Ensure the maintenance check was removed - if _, ok := srv.agent.state.Checks()[nodeMaintCheckID]; ok { + if _, ok := srv.agent.state.Checks()[structs.NodeMaint]; ok { t.Fatalf("should have removed maintenance check") } } diff --git a/command/agent/agent_test.go b/command/agent/agent_test.go index 42aed2f8f5..3481991161 100644 --- a/command/agent/agent_test.go +++ b/command/agent/agent_test.go @@ -1577,13 +1577,13 @@ func TestAgent_NodeMaintenanceMode(t *testing.T) { agent.EnableNodeMaintenance("broken", "mytoken") // Make sure the critical health check was added - check, ok := agent.state.Checks()[nodeMaintCheckID] + check, ok := agent.state.Checks()[structs.NodeMaint] if !ok { t.Fatalf("should have registered critical node check") } // Check that the token was used to register the check - if token := agent.state.CheckToken(nodeMaintCheckID); token != "mytoken" { + if token := agent.state.CheckToken(structs.NodeMaint); token != "mytoken" { t.Fatalf("expected 'mytoken', got: '%s'", token) } @@ -1596,7 +1596,7 @@ func TestAgent_NodeMaintenanceMode(t *testing.T) { agent.DisableNodeMaintenance() // Ensure the check was deregistered - if _, ok := agent.state.Checks()[nodeMaintCheckID]; ok { + if _, ok := agent.state.Checks()[structs.NodeMaint]; ok { t.Fatalf("should have deregistered critical node check") } @@ -1604,7 +1604,7 @@ func TestAgent_NodeMaintenanceMode(t *testing.T) { agent.EnableNodeMaintenance("", "") // Make sure the check was registered with the default note - check, ok = agent.state.Checks()[nodeMaintCheckID] + check, ok = agent.state.Checks()[structs.NodeMaint] if !ok { t.Fatalf("should have registered critical node check") } diff --git a/consul/structs/structs.go b/consul/structs/structs.go index a51658ddcb..9c28077c38 100644 --- a/consul/structs/structs.go +++ b/consul/structs/structs.go @@ -56,6 +56,15 @@ const ( HealthPassing = "passing" HealthWarning = "warning" HealthCritical = "critical" + HealthMaint = "maintenance" +) + +const ( + // NodeMaint is the special key set by a node in maintenance mode. + NodeMaint = "_node_maintenance" + + // ServiceMaintPrefix is the prefix for a service in maintenance mode. + ServiceMaintPrefix = "_service_maintenance:" ) func ValidStatus(s string) bool { @@ -412,6 +421,7 @@ func (c *HealthCheck) Clone() *HealthCheck { return clone } +// HealthChecks is a collection of HealthCheck structs. type HealthChecks []*HealthCheck // CheckServiceNode is used to provide the node, its service @@ -460,7 +470,7 @@ type NodeInfo struct { Address string TaggedAddresses map[string]string Services []*NodeService - Checks []*HealthCheck + Checks HealthChecks } // NodeDump is used to dump all the nodes with all their