mirror of https://github.com/hashicorp/consul
Add an API method for determining the best status
Given a list of HealthChecks, this determines the "best" status for the collective group. This is useful for nodes and services, which may have multiple checks associated with them.
pull/2544/head
parent
916f3c85b0
commit
4179aacf11
|
@ -2,6 +2,8 @@ package api
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"log"
|
||||||
|
"strings"
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
|
@ -11,6 +13,15 @@ const (
|
||||||
HealthPassing = "passing"
|
HealthPassing = "passing"
|
||||||
HealthWarning = "warning"
|
HealthWarning = "warning"
|
||||||
HealthCritical = "critical"
|
HealthCritical = "critical"
|
||||||
|
HealthMaint = "maintenance"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
// NodeMaint is the special key set by a node in maintenance mode.
|
||||||
|
NodeMaint = "_node_maintenance"
|
||||||
|
|
||||||
|
// ServiceMaintPrefix is the prefix for a service in maintenance mode.
|
||||||
|
ServiceMaintPrefix = "_service_maintenance:"
|
||||||
)
|
)
|
||||||
|
|
||||||
// HealthCheck is used to represent a single check
|
// HealthCheck is used to represent a single check
|
||||||
|
@ -25,6 +36,52 @@ type HealthCheck struct {
|
||||||
ServiceName string
|
ServiceName string
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// HealthChecks is a collection of HealthCheck structs.
|
||||||
|
type HealthChecks []*HealthCheck
|
||||||
|
|
||||||
|
// AggregatedStatus returns the "best" status for the list of health checks.
|
||||||
|
// Because a given entry may have many service and node-level health checks
|
||||||
|
// attached, this function determines the best representative of the status as
|
||||||
|
// as single string using the following heuristic:
|
||||||
|
//
|
||||||
|
// maintenance > critical > warning > passing
|
||||||
|
//
|
||||||
|
func (c HealthChecks) AggregatedStatus() string {
|
||||||
|
var passing, warning, critical, maintenance bool
|
||||||
|
for _, check := range c {
|
||||||
|
id := string(check.CheckID)
|
||||||
|
if id == NodeMaint || strings.HasPrefix(id, ServiceMaintPrefix) {
|
||||||
|
maintenance = true
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
switch check.Status {
|
||||||
|
case HealthPassing:
|
||||||
|
passing = true
|
||||||
|
case HealthWarning:
|
||||||
|
warning = true
|
||||||
|
case HealthCritical:
|
||||||
|
critical = true
|
||||||
|
default:
|
||||||
|
log.Printf("[WARN] unknown status %q", check.Status)
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
switch {
|
||||||
|
case maintenance:
|
||||||
|
return HealthMaint
|
||||||
|
case critical:
|
||||||
|
return HealthCritical
|
||||||
|
case warning:
|
||||||
|
return HealthWarning
|
||||||
|
case passing:
|
||||||
|
return HealthPassing
|
||||||
|
default:
|
||||||
|
return HealthPassing
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// ServiceEntry is used for the health service endpoint
|
// ServiceEntry is used for the health service endpoint
|
||||||
type ServiceEntry struct {
|
type ServiceEntry struct {
|
||||||
Node *Node
|
Node *Node
|
||||||
|
|
|
@ -38,6 +38,139 @@ func TestHealth_Node(t *testing.T) {
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestHealthChecks_AggregatedStatus(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
cases := []struct {
|
||||||
|
name string
|
||||||
|
checks HealthChecks
|
||||||
|
exp string
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
"empty",
|
||||||
|
nil,
|
||||||
|
HealthPassing,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"passing",
|
||||||
|
HealthChecks{
|
||||||
|
&HealthCheck{
|
||||||
|
Status: HealthPassing,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
HealthPassing,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"warning",
|
||||||
|
HealthChecks{
|
||||||
|
&HealthCheck{
|
||||||
|
Status: HealthWarning,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
HealthWarning,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"critical",
|
||||||
|
HealthChecks{
|
||||||
|
&HealthCheck{
|
||||||
|
Status: HealthCritical,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
HealthCritical,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"node_maintenance",
|
||||||
|
HealthChecks{
|
||||||
|
&HealthCheck{
|
||||||
|
CheckID: NodeMaint,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
HealthMaint,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"service_maintenance",
|
||||||
|
HealthChecks{
|
||||||
|
&HealthCheck{
|
||||||
|
CheckID: ServiceMaintPrefix + "service",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
HealthMaint,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"unknown",
|
||||||
|
HealthChecks{
|
||||||
|
&HealthCheck{
|
||||||
|
Status: "nope-nope-noper",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
"",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"maintenance_over_critical",
|
||||||
|
HealthChecks{
|
||||||
|
&HealthCheck{
|
||||||
|
CheckID: NodeMaint,
|
||||||
|
},
|
||||||
|
&HealthCheck{
|
||||||
|
Status: HealthCritical,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
HealthMaint,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"critical_over_warning",
|
||||||
|
HealthChecks{
|
||||||
|
&HealthCheck{
|
||||||
|
Status: HealthCritical,
|
||||||
|
},
|
||||||
|
&HealthCheck{
|
||||||
|
Status: HealthWarning,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
HealthCritical,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"warning_over_passing",
|
||||||
|
HealthChecks{
|
||||||
|
&HealthCheck{
|
||||||
|
Status: HealthWarning,
|
||||||
|
},
|
||||||
|
&HealthCheck{
|
||||||
|
Status: HealthPassing,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
HealthWarning,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"lots",
|
||||||
|
HealthChecks{
|
||||||
|
&HealthCheck{
|
||||||
|
Status: HealthPassing,
|
||||||
|
},
|
||||||
|
&HealthCheck{
|
||||||
|
Status: HealthPassing,
|
||||||
|
},
|
||||||
|
&HealthCheck{
|
||||||
|
Status: HealthPassing,
|
||||||
|
},
|
||||||
|
&HealthCheck{
|
||||||
|
Status: HealthWarning,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
HealthWarning,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for i, tc := range cases {
|
||||||
|
t.Run(fmt.Sprintf("%d_%s", i, tc.name), func(t *testing.T) {
|
||||||
|
act := tc.checks.AggregatedStatus()
|
||||||
|
if tc.exp != act {
|
||||||
|
t.Errorf("\nexp: %#v\nact: %#v", tc.exp, act)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestHealth_Checks(t *testing.T) {
|
func TestHealth_Checks(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
c, s := makeClient(t)
|
c, s := makeClient(t)
|
||||||
|
|
|
@ -34,10 +34,6 @@ const (
|
||||||
checksDir = "checks"
|
checksDir = "checks"
|
||||||
checkStateDir = "checks/state"
|
checkStateDir = "checks/state"
|
||||||
|
|
||||||
// The ID of the faux health checks for maintenance mode
|
|
||||||
serviceMaintCheckPrefix = "_service_maintenance"
|
|
||||||
nodeMaintCheckID = "_node_maintenance"
|
|
||||||
|
|
||||||
// Default reasons for node/service maintenance mode
|
// Default reasons for node/service maintenance mode
|
||||||
defaultNodeMaintReason = "Maintenance mode is enabled for this node, " +
|
defaultNodeMaintReason = "Maintenance mode is enabled for this node, " +
|
||||||
"but no reason was provided. This is a default message."
|
"but no reason was provided. This is a default message."
|
||||||
|
@ -1532,7 +1528,7 @@ func (a *Agent) restoreCheckState(snap map[types.CheckID]*structs.HealthCheck) {
|
||||||
|
|
||||||
// serviceMaintCheckID returns the ID of a given service's maintenance check
|
// serviceMaintCheckID returns the ID of a given service's maintenance check
|
||||||
func serviceMaintCheckID(serviceID string) types.CheckID {
|
func serviceMaintCheckID(serviceID string) types.CheckID {
|
||||||
return types.CheckID(fmt.Sprintf("%s:%s", serviceMaintCheckPrefix, serviceID))
|
return types.CheckID(structs.ServiceMaintPrefix + serviceID)
|
||||||
}
|
}
|
||||||
|
|
||||||
// EnableServiceMaintenance will register a false health check against the given
|
// EnableServiceMaintenance will register a false health check against the given
|
||||||
|
@ -1593,7 +1589,7 @@ func (a *Agent) DisableServiceMaintenance(serviceID string) error {
|
||||||
// EnableNodeMaintenance places a node into maintenance mode.
|
// EnableNodeMaintenance places a node into maintenance mode.
|
||||||
func (a *Agent) EnableNodeMaintenance(reason, token string) {
|
func (a *Agent) EnableNodeMaintenance(reason, token string) {
|
||||||
// Ensure node maintenance is not already enabled
|
// Ensure node maintenance is not already enabled
|
||||||
if _, ok := a.state.Checks()[nodeMaintCheckID]; ok {
|
if _, ok := a.state.Checks()[structs.NodeMaint]; ok {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1605,7 +1601,7 @@ func (a *Agent) EnableNodeMaintenance(reason, token string) {
|
||||||
// Create and register the node maintenance check
|
// Create and register the node maintenance check
|
||||||
check := &structs.HealthCheck{
|
check := &structs.HealthCheck{
|
||||||
Node: a.config.NodeName,
|
Node: a.config.NodeName,
|
||||||
CheckID: nodeMaintCheckID,
|
CheckID: structs.NodeMaint,
|
||||||
Name: "Node Maintenance Mode",
|
Name: "Node Maintenance Mode",
|
||||||
Notes: reason,
|
Notes: reason,
|
||||||
Status: structs.HealthCritical,
|
Status: structs.HealthCritical,
|
||||||
|
@ -1616,10 +1612,10 @@ func (a *Agent) EnableNodeMaintenance(reason, token string) {
|
||||||
|
|
||||||
// DisableNodeMaintenance removes a node from maintenance mode
|
// DisableNodeMaintenance removes a node from maintenance mode
|
||||||
func (a *Agent) DisableNodeMaintenance() {
|
func (a *Agent) DisableNodeMaintenance() {
|
||||||
if _, ok := a.state.Checks()[nodeMaintCheckID]; !ok {
|
if _, ok := a.state.Checks()[structs.NodeMaint]; !ok {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
a.RemoveCheck(nodeMaintCheckID, true)
|
a.RemoveCheck(structs.NodeMaint, true)
|
||||||
a.logger.Printf("[INFO] agent: Node left maintenance mode")
|
a.logger.Printf("[INFO] agent: Node left maintenance mode")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -926,13 +926,13 @@ func TestHTTPAgent_EnableNodeMaintenance(t *testing.T) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Ensure the maintenance check was registered
|
// Ensure the maintenance check was registered
|
||||||
check, ok := srv.agent.state.Checks()[nodeMaintCheckID]
|
check, ok := srv.agent.state.Checks()[structs.NodeMaint]
|
||||||
if !ok {
|
if !ok {
|
||||||
t.Fatalf("should have registered maintenance check")
|
t.Fatalf("should have registered maintenance check")
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check that the token was used
|
// Check that the token was used
|
||||||
if token := srv.agent.state.CheckToken(nodeMaintCheckID); token != "mytoken" {
|
if token := srv.agent.state.CheckToken(structs.NodeMaint); token != "mytoken" {
|
||||||
t.Fatalf("expected 'mytoken', got '%s'", token)
|
t.Fatalf("expected 'mytoken', got '%s'", token)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -962,7 +962,7 @@ func TestHTTPAgent_DisableNodeMaintenance(t *testing.T) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Ensure the maintenance check was removed
|
// Ensure the maintenance check was removed
|
||||||
if _, ok := srv.agent.state.Checks()[nodeMaintCheckID]; ok {
|
if _, ok := srv.agent.state.Checks()[structs.NodeMaint]; ok {
|
||||||
t.Fatalf("should have removed maintenance check")
|
t.Fatalf("should have removed maintenance check")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1577,13 +1577,13 @@ func TestAgent_NodeMaintenanceMode(t *testing.T) {
|
||||||
agent.EnableNodeMaintenance("broken", "mytoken")
|
agent.EnableNodeMaintenance("broken", "mytoken")
|
||||||
|
|
||||||
// Make sure the critical health check was added
|
// Make sure the critical health check was added
|
||||||
check, ok := agent.state.Checks()[nodeMaintCheckID]
|
check, ok := agent.state.Checks()[structs.NodeMaint]
|
||||||
if !ok {
|
if !ok {
|
||||||
t.Fatalf("should have registered critical node check")
|
t.Fatalf("should have registered critical node check")
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check that the token was used to register the check
|
// Check that the token was used to register the check
|
||||||
if token := agent.state.CheckToken(nodeMaintCheckID); token != "mytoken" {
|
if token := agent.state.CheckToken(structs.NodeMaint); token != "mytoken" {
|
||||||
t.Fatalf("expected 'mytoken', got: '%s'", token)
|
t.Fatalf("expected 'mytoken', got: '%s'", token)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1596,7 +1596,7 @@ func TestAgent_NodeMaintenanceMode(t *testing.T) {
|
||||||
agent.DisableNodeMaintenance()
|
agent.DisableNodeMaintenance()
|
||||||
|
|
||||||
// Ensure the check was deregistered
|
// Ensure the check was deregistered
|
||||||
if _, ok := agent.state.Checks()[nodeMaintCheckID]; ok {
|
if _, ok := agent.state.Checks()[structs.NodeMaint]; ok {
|
||||||
t.Fatalf("should have deregistered critical node check")
|
t.Fatalf("should have deregistered critical node check")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1604,7 +1604,7 @@ func TestAgent_NodeMaintenanceMode(t *testing.T) {
|
||||||
agent.EnableNodeMaintenance("", "")
|
agent.EnableNodeMaintenance("", "")
|
||||||
|
|
||||||
// Make sure the check was registered with the default note
|
// Make sure the check was registered with the default note
|
||||||
check, ok = agent.state.Checks()[nodeMaintCheckID]
|
check, ok = agent.state.Checks()[structs.NodeMaint]
|
||||||
if !ok {
|
if !ok {
|
||||||
t.Fatalf("should have registered critical node check")
|
t.Fatalf("should have registered critical node check")
|
||||||
}
|
}
|
||||||
|
|
|
@ -56,6 +56,15 @@ const (
|
||||||
HealthPassing = "passing"
|
HealthPassing = "passing"
|
||||||
HealthWarning = "warning"
|
HealthWarning = "warning"
|
||||||
HealthCritical = "critical"
|
HealthCritical = "critical"
|
||||||
|
HealthMaint = "maintenance"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
// NodeMaint is the special key set by a node in maintenance mode.
|
||||||
|
NodeMaint = "_node_maintenance"
|
||||||
|
|
||||||
|
// ServiceMaintPrefix is the prefix for a service in maintenance mode.
|
||||||
|
ServiceMaintPrefix = "_service_maintenance:"
|
||||||
)
|
)
|
||||||
|
|
||||||
func ValidStatus(s string) bool {
|
func ValidStatus(s string) bool {
|
||||||
|
@ -412,6 +421,7 @@ func (c *HealthCheck) Clone() *HealthCheck {
|
||||||
return clone
|
return clone
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// HealthChecks is a collection of HealthCheck structs.
|
||||||
type HealthChecks []*HealthCheck
|
type HealthChecks []*HealthCheck
|
||||||
|
|
||||||
// CheckServiceNode is used to provide the node, its service
|
// CheckServiceNode is used to provide the node, its service
|
||||||
|
@ -460,7 +470,7 @@ type NodeInfo struct {
|
||||||
Address string
|
Address string
|
||||||
TaggedAddresses map[string]string
|
TaggedAddresses map[string]string
|
||||||
Services []*NodeService
|
Services []*NodeService
|
||||||
Checks []*HealthCheck
|
Checks HealthChecks
|
||||||
}
|
}
|
||||||
|
|
||||||
// NodeDump is used to dump all the nodes with all their
|
// NodeDump is used to dump all the nodes with all their
|
||||||
|
|
Loading…
Reference in New Issue