Merge pull request #606 from hashicorp/f-maint

Support node and service maintenance mode
pull/614/head
Ryan Uber 2015-01-16 15:59:01 -08:00
commit 60bb23c19a
6 changed files with 457 additions and 0 deletions

View File

@ -29,6 +29,10 @@ const (
"If Consul was not shut down properly, the socket file may " +
"be left behind. If the path looks correct, remove the file " +
"and try again."
// The ID of the faux health checks for maintenance mode
serviceMaintCheckPrefix = "_service_maintenance"
nodeMaintCheckID = "_node_maintenenace"
)
/*
@ -995,3 +999,86 @@ func (a *Agent) unloadChecks() error {
return nil
}
// serviceMaintCheckID returns the ID of a given service's maintenance check
func serviceMaintCheckID(serviceID string) string {
return fmt.Sprintf("%s:%s", serviceMaintCheckPrefix, serviceID)
}
// EnableServiceMaintenance will register a false health check against the given
// service ID with critical status. This will exclude the service from queries.
func (a *Agent) EnableServiceMaintenance(serviceID string) error {
service, ok := a.state.Services()[serviceID]
if !ok {
return fmt.Errorf("No service registered with ID %q", serviceID)
}
// Check if maintenance mode is not already enabled
checkID := serviceMaintCheckID(serviceID)
if _, ok := a.state.Checks()[checkID]; ok {
return nil
}
// Create and register the critical health check
check := &structs.HealthCheck{
Node: a.config.NodeName,
CheckID: checkID,
Name: "Service Maintenance Mode",
Notes: "Maintenance mode is enabled for this service",
ServiceID: service.ID,
ServiceName: service.Service,
Status: structs.HealthCritical,
}
a.AddCheck(check, nil, true)
a.logger.Printf("[INFO] agent: service %q entered maintenance mode", serviceID)
return nil
}
// DisableServiceMaintenance will deregister the fake maintenance mode check
// if the service has been marked as in maintenance.
func (a *Agent) DisableServiceMaintenance(serviceID string) error {
if _, ok := a.state.Services()[serviceID]; !ok {
return fmt.Errorf("No service registered with ID %q", serviceID)
}
// Check if maintenance mode is enabled
checkID := serviceMaintCheckID(serviceID)
if _, ok := a.state.Checks()[checkID]; !ok {
return nil
}
// Deregister the maintenance check
a.RemoveCheck(checkID, true)
a.logger.Printf("[INFO] agent: service %q left maintenance mode", serviceID)
return nil
}
// EnableNodeMaintenance places a node into maintenance mode.
func (a *Agent) EnableNodeMaintenance() {
// Ensure node maintenance is not already enabled
if _, ok := a.state.Checks()[nodeMaintCheckID]; ok {
return
}
// Create and register the node maintenance check
check := &structs.HealthCheck{
Node: a.config.NodeName,
CheckID: nodeMaintCheckID,
Name: "Node Maintenance Mode",
Notes: "Maintenance mode is enabled for this node",
Status: structs.HealthCritical,
}
a.AddCheck(check, nil, true)
a.logger.Printf("[INFO] agent: node entered maintenance mode")
}
// DisableNodeMaintenance removes a node from maintenance mode
func (a *Agent) DisableNodeMaintenance() {
if _, ok := a.state.Checks()[nodeMaintCheckID]; !ok {
return
}
a.RemoveCheck(nodeMaintCheckID, true)
a.logger.Printf("[INFO] agent: node left maintenance mode")
}

View File

@ -176,3 +176,90 @@ func (s *HTTPServer) AgentDeregisterService(resp http.ResponseWriter, req *http.
serviceID := strings.TrimPrefix(req.URL.Path, "/v1/agent/service/deregister/")
return nil, s.agent.RemoveService(serviceID, true)
}
func (s *HTTPServer) AgentServiceMaintenance(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
// Only PUT supported
if req.Method != "PUT" {
resp.WriteHeader(405)
return nil, nil
}
// Ensure we have a service ID
serviceID := strings.TrimPrefix(req.URL.Path, "/v1/agent/service/maintenance/")
if serviceID == "" {
resp.WriteHeader(400)
resp.Write([]byte("Missing service ID"))
return nil, nil
}
// Ensure we have some action
params := req.URL.Query()
if _, ok := params["enable"]; !ok {
resp.WriteHeader(400)
resp.Write([]byte("Missing value for enable"))
return nil, nil
}
var enable bool
raw := params.Get("enable")
switch raw {
case "true":
enable = true
case "false":
enable = false
default:
resp.WriteHeader(400)
resp.Write([]byte(fmt.Sprintf("Invalid value for enable: %q", raw)))
return nil, nil
}
var err error
if enable {
if err = s.agent.EnableServiceMaintenance(serviceID); err != nil {
resp.WriteHeader(404)
resp.Write([]byte(err.Error()))
}
} else {
if err = s.agent.DisableServiceMaintenance(serviceID); err != nil {
resp.WriteHeader(404)
resp.Write([]byte(err.Error()))
}
}
return nil, err
}
func (s *HTTPServer) AgentNodeMaintenance(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
// Only PUT supported
if req.Method != "PUT" {
resp.WriteHeader(405)
return nil, nil
}
// Ensure we have some action
params := req.URL.Query()
if _, ok := params["enable"]; !ok {
resp.WriteHeader(400)
resp.Write([]byte("Missing value for enable"))
return nil, nil
}
var enable bool
raw := params.Get("enable")
switch raw {
case "true":
enable = true
case "false":
enable = false
default:
resp.WriteHeader(400)
resp.Write([]byte(fmt.Sprintf("Invalid value for enable: %q", raw)))
return nil, nil
}
if enable {
s.agent.EnableNodeMaintenance()
} else {
s.agent.DisableNodeMaintenance()
}
return nil, nil
}

View File

@ -7,6 +7,7 @@ import (
"github.com/hashicorp/consul/testutil"
"github.com/hashicorp/serf/serf"
"net/http"
"net/http/httptest"
"os"
"testing"
"time"
@ -492,3 +493,193 @@ func TestHTTPAgentDeregisterService(t *testing.T) {
t.Fatalf("have test check")
}
}
func TestHTTPAgent_ServiceMaintenanceEndpoint_BadRequest(t *testing.T) {
dir, srv := makeHTTPServer(t)
defer os.RemoveAll(dir)
defer srv.Shutdown()
defer srv.agent.Shutdown()
// Fails on non-PUT
req, _ := http.NewRequest("GET", "/v1/agent/service/maintenance/test?enable=true", nil)
resp := httptest.NewRecorder()
if _, err := srv.AgentServiceMaintenance(resp, req); err != nil {
t.Fatalf("err: %s", err)
}
if resp.Code != 405 {
t.Fatalf("expected 405, got %d", resp.Code)
}
// Fails when no enable flag provided
req, _ = http.NewRequest("PUT", "/v1/agent/service/maintenance/test", nil)
resp = httptest.NewRecorder()
if _, err := srv.AgentServiceMaintenance(resp, req); err != nil {
t.Fatalf("err: %s", err)
}
if resp.Code != 400 {
t.Fatalf("expected 400, got %d", resp.Code)
}
// Fails when no service ID provided
req, _ = http.NewRequest("PUT", "/v1/agent/service/maintenance/?enable=true", nil)
resp = httptest.NewRecorder()
if _, err := srv.AgentServiceMaintenance(resp, req); err != nil {
t.Fatalf("err: %s", err)
}
if resp.Code != 400 {
t.Fatalf("expected 400, got %d", resp.Code)
}
// Fails when bad service ID provided
req, _ = http.NewRequest("PUT", "/v1/agent/service/maintenance/_nope_?enable=true", nil)
resp = httptest.NewRecorder()
if _, err := srv.AgentServiceMaintenance(resp, req); err == nil {
t.Fatalf("should have errored")
}
if resp.Code != 404 {
t.Fatalf("expected 404, got %d", resp.Code)
}
}
func TestHTTPAgent_EnableServiceMaintenance(t *testing.T) {
dir, srv := makeHTTPServer(t)
defer os.RemoveAll(dir)
defer srv.Shutdown()
defer srv.agent.Shutdown()
// Register the service
service := &structs.NodeService{
ID: "test",
Service: "test",
}
if err := srv.agent.AddService(service, nil, false); err != nil {
t.Fatalf("err: %v", err)
}
// Force the service into maintenance mode
req, _ := http.NewRequest("PUT", "/v1/agent/service/maintenance/test?enable=true", nil)
resp := httptest.NewRecorder()
if _, err := srv.AgentServiceMaintenance(resp, req); err != nil {
t.Fatalf("err: %s", err)
}
if resp.Code != 200 {
t.Fatalf("expected 200, got %d", resp.Code)
}
// Ensure the maintenance check was registered
checkID := serviceMaintCheckID("test")
if _, ok := srv.agent.state.Checks()[checkID]; !ok {
t.Fatalf("should have registered maintenance check")
}
}
func TestHTTPAgent_DisableServiceMaintenance(t *testing.T) {
dir, srv := makeHTTPServer(t)
defer os.RemoveAll(dir)
defer srv.Shutdown()
defer srv.agent.Shutdown()
// Register the service
service := &structs.NodeService{
ID: "test",
Service: "test",
}
if err := srv.agent.AddService(service, nil, false); err != nil {
t.Fatalf("err: %v", err)
}
// Force the service into maintenance mode
if err := srv.agent.EnableServiceMaintenance("test"); err != nil {
t.Fatalf("err: %s", err)
}
// Leave maintenance mode
req, _ := http.NewRequest("PUT", "/v1/agent/service/maintenance/test?enable=false", nil)
resp := httptest.NewRecorder()
if _, err := srv.AgentServiceMaintenance(resp, req); err != nil {
t.Fatalf("err: %s", err)
}
if resp.Code != 200 {
t.Fatalf("expected 200, got %d", resp.Code)
}
// Ensure the maintenance check was removed
checkID := serviceMaintCheckID("test")
if _, ok := srv.agent.state.Checks()[checkID]; ok {
t.Fatalf("should have removed maintenance check")
}
}
func TestHTTPAgent_NodeMaintenanceEndpoint_BadRequest(t *testing.T) {
dir, srv := makeHTTPServer(t)
defer os.RemoveAll(dir)
defer srv.Shutdown()
defer srv.agent.Shutdown()
// Fails on non-PUT
req, _ := http.NewRequest("GET", "/v1/agent/self/maintenance?enable=true", nil)
resp := httptest.NewRecorder()
if _, err := srv.AgentNodeMaintenance(resp, req); err != nil {
t.Fatalf("err: %s", err)
}
if resp.Code != 405 {
t.Fatalf("expected 405, got %d", resp.Code)
}
// Fails when no enable flag provided
req, _ = http.NewRequest("PUT", "/v1/agent/self/maintenance", nil)
resp = httptest.NewRecorder()
if _, err := srv.AgentNodeMaintenance(resp, req); err != nil {
t.Fatalf("err: %s", err)
}
if resp.Code != 400 {
t.Fatalf("expected 400, got %d", resp.Code)
}
}
func TestHTTPAgent_EnableNodeMaintenance(t *testing.T) {
dir, srv := makeHTTPServer(t)
defer os.RemoveAll(dir)
defer srv.Shutdown()
defer srv.agent.Shutdown()
// Force the node into maintenance mode
req, _ := http.NewRequest("PUT", "/v1/agent/self/maintenance?enable=true", nil)
resp := httptest.NewRecorder()
if _, err := srv.AgentNodeMaintenance(resp, req); err != nil {
t.Fatalf("err: %s", err)
}
if resp.Code != 200 {
t.Fatalf("expected 200, got %d", resp.Code)
}
// Ensure the maintenance check was registered
if _, ok := srv.agent.state.Checks()[nodeMaintCheckID]; !ok {
t.Fatalf("should have registered maintenance check")
}
}
func TestHTTPAgent_DisableNodeMaintenance(t *testing.T) {
dir, srv := makeHTTPServer(t)
defer os.RemoveAll(dir)
defer srv.Shutdown()
defer srv.agent.Shutdown()
// Force the node into maintenance mode
srv.agent.EnableNodeMaintenance()
// Leave maintenance mode
req, _ := http.NewRequest("PUT", "/v1/agent/self/maintenance?enable=false", nil)
resp := httptest.NewRecorder()
if _, err := srv.AgentNodeMaintenance(resp, req); err != nil {
t.Fatalf("err: %s", err)
}
if resp.Code != 200 {
t.Fatalf("expected 200, got %d", resp.Code)
}
// Ensure the maintenance check was removed
if _, ok := srv.agent.state.Checks()[nodeMaintCheckID]; ok {
t.Fatalf("should have removed maintenance check")
}
}

View File

@ -781,3 +781,66 @@ func TestAgent_unloadServices(t *testing.T) {
t.Fatalf("consul service should not be removed")
}
}
func TestAgent_ServiceMaintenanceMode(t *testing.T) {
config := nextConfig()
dir, agent := makeAgent(t, config)
defer os.RemoveAll(dir)
defer agent.Shutdown()
svc := &structs.NodeService{
ID: "redis",
Service: "redis",
Tags: []string{"foo"},
Port: 8000,
}
// Register the service
if err := agent.AddService(svc, nil, false); err != nil {
t.Fatalf("err: %v", err)
}
// Enter maintenance mode for the service
if err := agent.EnableServiceMaintenance("redis"); err != nil {
t.Fatalf("err: %s", err)
}
// Make sure the critical health check was added
checkID := serviceMaintCheckID("redis")
if _, ok := agent.state.Checks()[checkID]; !ok {
t.Fatalf("should have registered critical maintenance check")
}
// Leave maintenance mode
if err := agent.DisableServiceMaintenance("redis"); err != nil {
t.Fatalf("err: %s", err)
}
// Ensure the check was deregistered
if _, ok := agent.state.Checks()[checkID]; ok {
t.Fatalf("should have deregistered maintenance check")
}
}
func TestAgent_NodeMaintenanceMode(t *testing.T) {
config := nextConfig()
dir, agent := makeAgent(t, config)
defer os.RemoveAll(dir)
defer agent.Shutdown()
// Enter maintenance mode for the node
agent.EnableNodeMaintenance()
// Make sure the critical health check was added
if _, ok := agent.state.Checks()[nodeMaintCheckID]; !ok {
t.Fatalf("should have registered critical node check")
}
// Leave maintenance mode
agent.DisableNodeMaintenance()
// Ensure the check was deregistered
if _, ok := agent.state.Checks()[nodeMaintCheckID]; ok {
t.Fatalf("should have deregistered critical node check")
}
}

View File

@ -181,6 +181,7 @@ func (s *HTTPServer) registerHandlers(enableDebug bool) {
s.mux.HandleFunc("/v1/health/service/", s.wrap(s.HealthServiceNodes))
s.mux.HandleFunc("/v1/agent/self", s.wrap(s.AgentSelf))
s.mux.HandleFunc("/v1/agent/self/maintenance", s.wrap(s.AgentNodeMaintenance))
s.mux.HandleFunc("/v1/agent/services", s.wrap(s.AgentServices))
s.mux.HandleFunc("/v1/agent/checks", s.wrap(s.AgentChecks))
s.mux.HandleFunc("/v1/agent/members", s.wrap(s.AgentMembers))
@ -195,6 +196,7 @@ func (s *HTTPServer) registerHandlers(enableDebug bool) {
s.mux.HandleFunc("/v1/agent/service/register", s.wrap(s.AgentRegisterService))
s.mux.HandleFunc("/v1/agent/service/deregister/", s.wrap(s.AgentDeregisterService))
s.mux.HandleFunc("/v1/agent/service/maintenance/", s.wrap(s.AgentServiceMaintenance))
s.mux.HandleFunc("/v1/event/fire/", s.wrap(s.EventFire))
s.mux.HandleFunc("/v1/event/list", s.wrap(s.EventList))

View File

@ -238,6 +238,7 @@ The following endpoints are supported:
* [`/v1/agent/services`](#agent_services) : Returns the services local agent is managing
* [`/v1/agent/members`](#agent_members) : Returns the members as seen by the local serf agent
* [`/v1/agent/self`](#agent_self) : Returns the local node configuration
* [`/v1/agent/self/maintenance`](#agent_self_maintenance) : Node maintenance mode
* [`/v1/agent/join/<address>`](#agent_join) : Trigger local agent to join a node
* [`/v1/agent/force-leave/<node>`](#agent_force_leave)>: Force remove node
* [`/v1/agent/check/register`](#agent_check_register) : Registers a new local check
@ -247,6 +248,7 @@ The following endpoints are supported:
* [`/v1/agent/check/fail/<checkID>`](#agent_check_fail) : Mark a local test as critical
* [`/v1/agent/service/register`](#agent_service_register) : Registers a new local service
* [`/v1/agent/service/deregister/<serviceID>`](#agent_service_deregister) : Deregister a local service
* [`/v1/agent/service/maintenance/<serviceID>`](#agent_service_maintenance) : Service maintenance mode
### <a name="agent_checks"></a> /v1/agent/checks
@ -401,6 +403,18 @@ It returns a JSON body like this:
}
```
### <a name="agent_self_maintenance"></a> /v1/agent/self/maintenance
The node maintenance endpoint allows placing the agent into "maintenance mode".
During maintenance mode, the node will be marked as unavailable, and will not be
present in DNS or API queries. This API call is idempotent. Maintenance mode is
persistent and will be automatically restored on agent restart.
The `?enable` flag is required, and its value must be `true` (to enter
maintenance mode), or `false` (to resume normal operation).
The return code is 200 on success.
### <a name="agent_join"></a> /v1/agent/join/\<address\>
This endpoint is hit with a GET and is used to instruct the agent to attempt to
@ -548,6 +562,19 @@ check, that is also deregistered.
The return code is 200 on success.
### <a name="agent_service_maintenance"></a> /v1/agent/service/maintenance/\<serviceId\>
The service maintenance endpoint allows placing a given service into
"maintenance mode". During maintenance mode, the service will be marked as
unavailable, and will not be present in DNS or API queries. This API call is
idempotent. Maintenance mode is persistent and will be automatically restored
on agent restart.
The `?enable` flag is required, and its value must be `true` (to enter
maintenance mode), or `false` (to resume normal operation).
The return code is 200 on success.
## <a name="catalog"></a> Catalog
The Catalog is the endpoint used to register and deregister nodes,