mirror of https://github.com/hashicorp/consul
Add an API method for determining the best status
Given a list of HealthChecks, this determines the "best" status for the collective group. This is useful for nodes and services, which may have multiple checks associated with them.
pull/2544/head
parent
916f3c85b0
commit
4179aacf11
|
@ -2,6 +2,8 @@ package api
|
|||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
"strings"
|
||||
)
|
||||
|
||||
const (
|
||||
|
@ -11,6 +13,15 @@ const (
|
|||
HealthPassing = "passing"
|
||||
HealthWarning = "warning"
|
||||
HealthCritical = "critical"
|
||||
HealthMaint = "maintenance"
|
||||
)
|
||||
|
||||
// Identifiers used by the synthetic health checks that mark maintenance mode.
const (
	// NodeMaint is the special key set by a node while it is in
	// maintenance mode.
	NodeMaint = "_node_maintenance"

	// ServiceMaintPrefix is the prefix of the key set for a service while it
	// is in maintenance mode.
	ServiceMaintPrefix = "_service_maintenance:"
)
|
||||
|
||||
// HealthCheck is used to represent a single check
|
||||
|
@ -25,6 +36,52 @@ type HealthCheck struct {
|
|||
ServiceName string
|
||||
}
|
||||
|
||||
// HealthChecks is a collection of HealthCheck structs.
|
||||
type HealthChecks []*HealthCheck
|
||||
|
||||
// AggregatedStatus returns the "best" status for the list of health checks.
|
||||
// Because a given entry may have many service and node-level health checks
|
||||
// attached, this function determines the best representative of the status as
|
||||
// as single string using the following heuristic:
|
||||
//
|
||||
// maintenance > critical > warning > passing
|
||||
//
|
||||
func (c HealthChecks) AggregatedStatus() string {
|
||||
var passing, warning, critical, maintenance bool
|
||||
for _, check := range c {
|
||||
id := string(check.CheckID)
|
||||
if id == NodeMaint || strings.HasPrefix(id, ServiceMaintPrefix) {
|
||||
maintenance = true
|
||||
continue
|
||||
}
|
||||
|
||||
switch check.Status {
|
||||
case HealthPassing:
|
||||
passing = true
|
||||
case HealthWarning:
|
||||
warning = true
|
||||
case HealthCritical:
|
||||
critical = true
|
||||
default:
|
||||
log.Printf("[WARN] unknown status %q", check.Status)
|
||||
return ""
|
||||
}
|
||||
}
|
||||
|
||||
switch {
|
||||
case maintenance:
|
||||
return HealthMaint
|
||||
case critical:
|
||||
return HealthCritical
|
||||
case warning:
|
||||
return HealthWarning
|
||||
case passing:
|
||||
return HealthPassing
|
||||
default:
|
||||
return HealthPassing
|
||||
}
|
||||
}
|
||||
|
||||
// ServiceEntry is used for the health service endpoint
|
||||
type ServiceEntry struct {
|
||||
Node *Node
|
||||
|
|
|
@ -38,6 +38,139 @@ func TestHealth_Node(t *testing.T) {
|
|||
})
|
||||
}
|
||||
|
||||
func TestHealthChecks_AggregatedStatus(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
cases := []struct {
|
||||
name string
|
||||
checks HealthChecks
|
||||
exp string
|
||||
}{
|
||||
{
|
||||
"empty",
|
||||
nil,
|
||||
HealthPassing,
|
||||
},
|
||||
{
|
||||
"passing",
|
||||
HealthChecks{
|
||||
&HealthCheck{
|
||||
Status: HealthPassing,
|
||||
},
|
||||
},
|
||||
HealthPassing,
|
||||
},
|
||||
{
|
||||
"warning",
|
||||
HealthChecks{
|
||||
&HealthCheck{
|
||||
Status: HealthWarning,
|
||||
},
|
||||
},
|
||||
HealthWarning,
|
||||
},
|
||||
{
|
||||
"critical",
|
||||
HealthChecks{
|
||||
&HealthCheck{
|
||||
Status: HealthCritical,
|
||||
},
|
||||
},
|
||||
HealthCritical,
|
||||
},
|
||||
{
|
||||
"node_maintenance",
|
||||
HealthChecks{
|
||||
&HealthCheck{
|
||||
CheckID: NodeMaint,
|
||||
},
|
||||
},
|
||||
HealthMaint,
|
||||
},
|
||||
{
|
||||
"service_maintenance",
|
||||
HealthChecks{
|
||||
&HealthCheck{
|
||||
CheckID: ServiceMaintPrefix + "service",
|
||||
},
|
||||
},
|
||||
HealthMaint,
|
||||
},
|
||||
{
|
||||
"unknown",
|
||||
HealthChecks{
|
||||
&HealthCheck{
|
||||
Status: "nope-nope-noper",
|
||||
},
|
||||
},
|
||||
"",
|
||||
},
|
||||
{
|
||||
"maintenance_over_critical",
|
||||
HealthChecks{
|
||||
&HealthCheck{
|
||||
CheckID: NodeMaint,
|
||||
},
|
||||
&HealthCheck{
|
||||
Status: HealthCritical,
|
||||
},
|
||||
},
|
||||
HealthMaint,
|
||||
},
|
||||
{
|
||||
"critical_over_warning",
|
||||
HealthChecks{
|
||||
&HealthCheck{
|
||||
Status: HealthCritical,
|
||||
},
|
||||
&HealthCheck{
|
||||
Status: HealthWarning,
|
||||
},
|
||||
},
|
||||
HealthCritical,
|
||||
},
|
||||
{
|
||||
"warning_over_passing",
|
||||
HealthChecks{
|
||||
&HealthCheck{
|
||||
Status: HealthWarning,
|
||||
},
|
||||
&HealthCheck{
|
||||
Status: HealthPassing,
|
||||
},
|
||||
},
|
||||
HealthWarning,
|
||||
},
|
||||
{
|
||||
"lots",
|
||||
HealthChecks{
|
||||
&HealthCheck{
|
||||
Status: HealthPassing,
|
||||
},
|
||||
&HealthCheck{
|
||||
Status: HealthPassing,
|
||||
},
|
||||
&HealthCheck{
|
||||
Status: HealthPassing,
|
||||
},
|
||||
&HealthCheck{
|
||||
Status: HealthWarning,
|
||||
},
|
||||
},
|
||||
HealthWarning,
|
||||
},
|
||||
}
|
||||
|
||||
for i, tc := range cases {
|
||||
t.Run(fmt.Sprintf("%d_%s", i, tc.name), func(t *testing.T) {
|
||||
act := tc.checks.AggregatedStatus()
|
||||
if tc.exp != act {
|
||||
t.Errorf("\nexp: %#v\nact: %#v", tc.exp, act)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestHealth_Checks(t *testing.T) {
|
||||
t.Parallel()
|
||||
c, s := makeClient(t)
|
||||
|
|
|
@ -34,10 +34,6 @@ const (
|
|||
checksDir = "checks"
|
||||
checkStateDir = "checks/state"
|
||||
|
||||
// The ID of the faux health checks for maintenance mode
|
||||
serviceMaintCheckPrefix = "_service_maintenance"
|
||||
nodeMaintCheckID = "_node_maintenance"
|
||||
|
||||
// Default reasons for node/service maintenance mode
|
||||
defaultNodeMaintReason = "Maintenance mode is enabled for this node, " +
|
||||
"but no reason was provided. This is a default message."
|
||||
|
@ -1532,7 +1528,7 @@ func (a *Agent) restoreCheckState(snap map[types.CheckID]*structs.HealthCheck) {
|
|||
|
||||
// serviceMaintCheckID returns the ID of a given service's maintenance check
|
||||
func serviceMaintCheckID(serviceID string) types.CheckID {
|
||||
return types.CheckID(fmt.Sprintf("%s:%s", serviceMaintCheckPrefix, serviceID))
|
||||
return types.CheckID(structs.ServiceMaintPrefix + serviceID)
|
||||
}
|
||||
|
||||
// EnableServiceMaintenance will register a false health check against the given
|
||||
|
@ -1593,7 +1589,7 @@ func (a *Agent) DisableServiceMaintenance(serviceID string) error {
|
|||
// EnableNodeMaintenance places a node into maintenance mode.
|
||||
func (a *Agent) EnableNodeMaintenance(reason, token string) {
|
||||
// Ensure node maintenance is not already enabled
|
||||
if _, ok := a.state.Checks()[nodeMaintCheckID]; ok {
|
||||
if _, ok := a.state.Checks()[structs.NodeMaint]; ok {
|
||||
return
|
||||
}
|
||||
|
||||
|
@ -1605,7 +1601,7 @@ func (a *Agent) EnableNodeMaintenance(reason, token string) {
|
|||
// Create and register the node maintenance check
|
||||
check := &structs.HealthCheck{
|
||||
Node: a.config.NodeName,
|
||||
CheckID: nodeMaintCheckID,
|
||||
CheckID: structs.NodeMaint,
|
||||
Name: "Node Maintenance Mode",
|
||||
Notes: reason,
|
||||
Status: structs.HealthCritical,
|
||||
|
@ -1616,10 +1612,10 @@ func (a *Agent) EnableNodeMaintenance(reason, token string) {
|
|||
|
||||
// DisableNodeMaintenance removes a node from maintenance mode
|
||||
func (a *Agent) DisableNodeMaintenance() {
|
||||
if _, ok := a.state.Checks()[nodeMaintCheckID]; !ok {
|
||||
if _, ok := a.state.Checks()[structs.NodeMaint]; !ok {
|
||||
return
|
||||
}
|
||||
a.RemoveCheck(nodeMaintCheckID, true)
|
||||
a.RemoveCheck(structs.NodeMaint, true)
|
||||
a.logger.Printf("[INFO] agent: Node left maintenance mode")
|
||||
}
|
||||
|
||||
|
|
|
@ -926,13 +926,13 @@ func TestHTTPAgent_EnableNodeMaintenance(t *testing.T) {
|
|||
}
|
||||
|
||||
// Ensure the maintenance check was registered
|
||||
check, ok := srv.agent.state.Checks()[nodeMaintCheckID]
|
||||
check, ok := srv.agent.state.Checks()[structs.NodeMaint]
|
||||
if !ok {
|
||||
t.Fatalf("should have registered maintenance check")
|
||||
}
|
||||
|
||||
// Check that the token was used
|
||||
if token := srv.agent.state.CheckToken(nodeMaintCheckID); token != "mytoken" {
|
||||
if token := srv.agent.state.CheckToken(structs.NodeMaint); token != "mytoken" {
|
||||
t.Fatalf("expected 'mytoken', got '%s'", token)
|
||||
}
|
||||
|
||||
|
@ -962,7 +962,7 @@ func TestHTTPAgent_DisableNodeMaintenance(t *testing.T) {
|
|||
}
|
||||
|
||||
// Ensure the maintenance check was removed
|
||||
if _, ok := srv.agent.state.Checks()[nodeMaintCheckID]; ok {
|
||||
if _, ok := srv.agent.state.Checks()[structs.NodeMaint]; ok {
|
||||
t.Fatalf("should have removed maintenance check")
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1577,13 +1577,13 @@ func TestAgent_NodeMaintenanceMode(t *testing.T) {
|
|||
agent.EnableNodeMaintenance("broken", "mytoken")
|
||||
|
||||
// Make sure the critical health check was added
|
||||
check, ok := agent.state.Checks()[nodeMaintCheckID]
|
||||
check, ok := agent.state.Checks()[structs.NodeMaint]
|
||||
if !ok {
|
||||
t.Fatalf("should have registered critical node check")
|
||||
}
|
||||
|
||||
// Check that the token was used to register the check
|
||||
if token := agent.state.CheckToken(nodeMaintCheckID); token != "mytoken" {
|
||||
if token := agent.state.CheckToken(structs.NodeMaint); token != "mytoken" {
|
||||
t.Fatalf("expected 'mytoken', got: '%s'", token)
|
||||
}
|
||||
|
||||
|
@ -1596,7 +1596,7 @@ func TestAgent_NodeMaintenanceMode(t *testing.T) {
|
|||
agent.DisableNodeMaintenance()
|
||||
|
||||
// Ensure the check was deregistered
|
||||
if _, ok := agent.state.Checks()[nodeMaintCheckID]; ok {
|
||||
if _, ok := agent.state.Checks()[structs.NodeMaint]; ok {
|
||||
t.Fatalf("should have deregistered critical node check")
|
||||
}
|
||||
|
||||
|
@ -1604,7 +1604,7 @@ func TestAgent_NodeMaintenanceMode(t *testing.T) {
|
|||
agent.EnableNodeMaintenance("", "")
|
||||
|
||||
// Make sure the check was registered with the default note
|
||||
check, ok = agent.state.Checks()[nodeMaintCheckID]
|
||||
check, ok = agent.state.Checks()[structs.NodeMaint]
|
||||
if !ok {
|
||||
t.Fatalf("should have registered critical node check")
|
||||
}
|
||||
|
|
|
@ -56,6 +56,15 @@ const (
|
|||
HealthPassing = "passing"
|
||||
HealthWarning = "warning"
|
||||
HealthCritical = "critical"
|
||||
HealthMaint = "maintenance"
|
||||
)
|
||||
|
||||
// Identifiers used by the synthetic health checks that mark maintenance mode.
const (
	// NodeMaint is the special key set by a node while it is in
	// maintenance mode.
	NodeMaint = "_node_maintenance"

	// ServiceMaintPrefix is the prefix of the key set for a service while it
	// is in maintenance mode.
	ServiceMaintPrefix = "_service_maintenance:"
)
|
||||
|
||||
func ValidStatus(s string) bool {
|
||||
|
@ -412,6 +421,7 @@ func (c *HealthCheck) Clone() *HealthCheck {
|
|||
return clone
|
||||
}
|
||||
|
||||
// HealthChecks is a collection of HealthCheck structs, held as a slice of pointers.
|
||||
type HealthChecks []*HealthCheck
|
||||
|
||||
// CheckServiceNode is used to provide the node, its service
|
||||
|
@ -460,7 +470,7 @@ type NodeInfo struct {
|
|||
Address string
|
||||
TaggedAddresses map[string]string
|
||||
Services []*NodeService
|
||||
Checks []*HealthCheck
|
||||
Checks HealthChecks
|
||||
}
|
||||
|
||||
// NodeDump is used to dump all the nodes with all their
|
||||
|
|
Loading…
Reference in New Issue