Add an API method for determining the best status

Given a list of HealthChecks, this determines the "best" status for the
collective group. This is useful for nodes and services, which may have
multiple checks associated with them.
pull/2544/head
Seth Vargo 2016-11-29 16:15:20 -05:00
parent 916f3c85b0
commit 4179aacf11
No known key found for this signature in database
GPG Key ID: 905A90C2949E8787
6 changed files with 213 additions and 17 deletions

View File

@ -2,6 +2,8 @@ package api
import (
"fmt"
"log"
"strings"
)
const (
@ -11,6 +13,15 @@ const (
HealthPassing = "passing"
HealthWarning = "warning"
HealthCritical = "critical"
HealthMaint = "maintenance"
)
const (
// NodeMaint is the check ID that marks a node as being in maintenance mode.
// AggregatedStatus reports HealthMaint when a check carries exactly this ID.
NodeMaint = "_node_maintenance"
// ServiceMaintPrefix is the check ID prefix that marks a service as being in
// maintenance mode. The trailing colon is part of the prefix; AggregatedStatus
// treats any check whose ID begins with it as a maintenance check.
ServiceMaintPrefix = "_service_maintenance:"
)
// HealthCheck is used to represent a single check
@ -25,6 +36,52 @@ type HealthCheck struct {
ServiceName string
}
// HealthChecks is a collection of HealthCheck structs, held as a slice of
// pointers.
type HealthChecks []*HealthCheck
// AggregatedStatus returns the "best" status for the list of health checks.
// Because a given entry may have many service and node-level health checks
// attached, this function determines the best representative of the status as
// a single string using the following heuristic:
//
//	maintenance > critical > warning > passing
//
// A check counts as maintenance when its ID is NodeMaint or starts with
// ServiceMaintPrefix; the Status of such a check is not inspected. An empty
// (or nil) list is reported as passing, and nil entries are skipped. If any
// other check carries a status that is not passing, warning or critical, a
// warning is logged and the empty string is returned.
func (c HealthChecks) AggregatedStatus() string {
	var warning, critical, maintenance bool
	for _, check := range c {
		// Skip nil entries rather than panicking on the dereference below.
		if check == nil {
			continue
		}

		id := string(check.CheckID)
		if id == NodeMaint || strings.HasPrefix(id, ServiceMaintPrefix) {
			maintenance = true
			continue
		}

		switch check.Status {
		case HealthPassing:
			// Passing is the baseline result, so there is nothing to record.
		case HealthWarning:
			warning = true
		case HealthCritical:
			critical = true
		default:
			log.Printf("[WARN] unknown status %q", check.Status)
			return ""
		}
	}

	switch {
	case maintenance:
		return HealthMaint
	case critical:
		return HealthCritical
	case warning:
		return HealthWarning
	default:
		return HealthPassing
	}
}
// ServiceEntry is used for the health service endpoint
type ServiceEntry struct {
Node *Node

View File

@ -38,6 +38,139 @@ func TestHealth_Node(t *testing.T) {
})
}
// TestHealthChecks_AggregatedStatus runs AggregatedStatus over a table of check
// lists and verifies the reported status: each single status on its own, both
// maintenance check IDs, an unrecognised status, and the precedence between
// statuses when several checks are combined.
func TestHealthChecks_AggregatedStatus(t *testing.T) {
	t.Parallel()

	type scenario struct {
		name   string
		checks HealthChecks
		exp    string
	}

	scenarios := []scenario{
		{name: "empty", checks: nil, exp: HealthPassing},
		{
			name:   "passing",
			checks: HealthChecks{{Status: HealthPassing}},
			exp:    HealthPassing,
		},
		{
			name:   "warning",
			checks: HealthChecks{{Status: HealthWarning}},
			exp:    HealthWarning,
		},
		{
			name:   "critical",
			checks: HealthChecks{{Status: HealthCritical}},
			exp:    HealthCritical,
		},
		{
			name:   "node_maintenance",
			checks: HealthChecks{{CheckID: NodeMaint}},
			exp:    HealthMaint,
		},
		{
			name:   "service_maintenance",
			checks: HealthChecks{{CheckID: ServiceMaintPrefix + "service"}},
			exp:    HealthMaint,
		},
		{
			name:   "unknown",
			checks: HealthChecks{{Status: "nope-nope-noper"}},
			exp:    "",
		},
		{
			name: "maintenance_over_critical",
			checks: HealthChecks{
				{CheckID: NodeMaint},
				{Status: HealthCritical},
			},
			exp: HealthMaint,
		},
		{
			name: "critical_over_warning",
			checks: HealthChecks{
				{Status: HealthCritical},
				{Status: HealthWarning},
			},
			exp: HealthCritical,
		},
		{
			name: "warning_over_passing",
			checks: HealthChecks{
				{Status: HealthWarning},
				{Status: HealthPassing},
			},
			exp: HealthWarning,
		},
		{
			name: "lots",
			checks: HealthChecks{
				{Status: HealthPassing},
				{Status: HealthPassing},
				{Status: HealthPassing},
				{Status: HealthWarning},
			},
			exp: HealthWarning,
		},
	}

	for idx, sc := range scenarios {
		// The index prefix keeps subtest names unique and ordered.
		label := fmt.Sprintf("%d_%s", idx, sc.name)
		t.Run(label, func(t *testing.T) {
			got := sc.checks.AggregatedStatus()
			if sc.exp != got {
				t.Errorf("\nexp: %#v\nact: %#v", sc.exp, got)
			}
		})
	}
}
func TestHealth_Checks(t *testing.T) {
t.Parallel()
c, s := makeClient(t)

View File

@ -34,10 +34,6 @@ const (
checksDir = "checks"
checkStateDir = "checks/state"
// The ID of the faux health checks for maintenance mode
serviceMaintCheckPrefix = "_service_maintenance"
nodeMaintCheckID = "_node_maintenance"
// Default reasons for node/service maintenance mode
defaultNodeMaintReason = "Maintenance mode is enabled for this node, " +
"but no reason was provided. This is a default message."
@ -1532,7 +1528,7 @@ func (a *Agent) restoreCheckState(snap map[types.CheckID]*structs.HealthCheck) {
// serviceMaintCheckID returns the ID of a given service's maintenance check
func serviceMaintCheckID(serviceID string) types.CheckID {
return types.CheckID(fmt.Sprintf("%s:%s", serviceMaintCheckPrefix, serviceID))
return types.CheckID(structs.ServiceMaintPrefix + serviceID)
}
// EnableServiceMaintenance will register a false health check against the given
@ -1593,7 +1589,7 @@ func (a *Agent) DisableServiceMaintenance(serviceID string) error {
// EnableNodeMaintenance places a node into maintenance mode.
func (a *Agent) EnableNodeMaintenance(reason, token string) {
// Ensure node maintenance is not already enabled
if _, ok := a.state.Checks()[nodeMaintCheckID]; ok {
if _, ok := a.state.Checks()[structs.NodeMaint]; ok {
return
}
@ -1605,7 +1601,7 @@ func (a *Agent) EnableNodeMaintenance(reason, token string) {
// Create and register the node maintenance check
check := &structs.HealthCheck{
Node: a.config.NodeName,
CheckID: nodeMaintCheckID,
CheckID: structs.NodeMaint,
Name: "Node Maintenance Mode",
Notes: reason,
Status: structs.HealthCritical,
@ -1616,10 +1612,10 @@ func (a *Agent) EnableNodeMaintenance(reason, token string) {
// DisableNodeMaintenance removes a node from maintenance mode
func (a *Agent) DisableNodeMaintenance() {
if _, ok := a.state.Checks()[nodeMaintCheckID]; !ok {
if _, ok := a.state.Checks()[structs.NodeMaint]; !ok {
return
}
a.RemoveCheck(nodeMaintCheckID, true)
a.RemoveCheck(structs.NodeMaint, true)
a.logger.Printf("[INFO] agent: Node left maintenance mode")
}

View File

@ -926,13 +926,13 @@ func TestHTTPAgent_EnableNodeMaintenance(t *testing.T) {
}
// Ensure the maintenance check was registered
check, ok := srv.agent.state.Checks()[nodeMaintCheckID]
check, ok := srv.agent.state.Checks()[structs.NodeMaint]
if !ok {
t.Fatalf("should have registered maintenance check")
}
// Check that the token was used
if token := srv.agent.state.CheckToken(nodeMaintCheckID); token != "mytoken" {
if token := srv.agent.state.CheckToken(structs.NodeMaint); token != "mytoken" {
t.Fatalf("expected 'mytoken', got '%s'", token)
}
@ -962,7 +962,7 @@ func TestHTTPAgent_DisableNodeMaintenance(t *testing.T) {
}
// Ensure the maintenance check was removed
if _, ok := srv.agent.state.Checks()[nodeMaintCheckID]; ok {
if _, ok := srv.agent.state.Checks()[structs.NodeMaint]; ok {
t.Fatalf("should have removed maintenance check")
}
}

View File

@ -1577,13 +1577,13 @@ func TestAgent_NodeMaintenanceMode(t *testing.T) {
agent.EnableNodeMaintenance("broken", "mytoken")
// Make sure the critical health check was added
check, ok := agent.state.Checks()[nodeMaintCheckID]
check, ok := agent.state.Checks()[structs.NodeMaint]
if !ok {
t.Fatalf("should have registered critical node check")
}
// Check that the token was used to register the check
if token := agent.state.CheckToken(nodeMaintCheckID); token != "mytoken" {
if token := agent.state.CheckToken(structs.NodeMaint); token != "mytoken" {
t.Fatalf("expected 'mytoken', got: '%s'", token)
}
@ -1596,7 +1596,7 @@ func TestAgent_NodeMaintenanceMode(t *testing.T) {
agent.DisableNodeMaintenance()
// Ensure the check was deregistered
if _, ok := agent.state.Checks()[nodeMaintCheckID]; ok {
if _, ok := agent.state.Checks()[structs.NodeMaint]; ok {
t.Fatalf("should have deregistered critical node check")
}
@ -1604,7 +1604,7 @@ func TestAgent_NodeMaintenanceMode(t *testing.T) {
agent.EnableNodeMaintenance("", "")
// Make sure the check was registered with the default note
check, ok = agent.state.Checks()[nodeMaintCheckID]
check, ok = agent.state.Checks()[structs.NodeMaint]
if !ok {
t.Fatalf("should have registered critical node check")
}

View File

@ -56,6 +56,15 @@ const (
HealthPassing = "passing"
HealthWarning = "warning"
HealthCritical = "critical"
HealthMaint = "maintenance"
)
const (
// NodeMaint is the ID of the health check registered while a node is in
// maintenance mode.
NodeMaint = "_node_maintenance"
// ServiceMaintPrefix is the prefix of the check ID used for a service in
// maintenance mode; the service ID is appended directly after the trailing
// colon.
ServiceMaintPrefix = "_service_maintenance:"
)
func ValidStatus(s string) bool {
@ -412,6 +421,7 @@ func (c *HealthCheck) Clone() *HealthCheck {
return clone
}
// HealthChecks is a collection of HealthCheck structs, held as a slice of
// pointers.
type HealthChecks []*HealthCheck
// CheckServiceNode is used to provide the node, its service
@ -460,7 +470,7 @@ type NodeInfo struct {
Address string
TaggedAddresses map[string]string
Services []*NodeService
Checks []*HealthCheck
Checks HealthChecks
}
// NodeDump is used to dump all the nodes with all their