diff --git a/command/agent/agent.go b/command/agent/agent.go index c34063a861..778d071354 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -51,11 +51,16 @@ type Agent struct { state localState // checkMonitors maps the check ID to an associated monitor - // checkTTLs maps the check ID to an associated check TTL - // checkLock protects updates to either checkMonitors map[string]*CheckMonitor - checkTTLs map[string]*CheckTTL - checkLock sync.Mutex + + // checkHTTPs maps the check ID to an associated HTTP check + checkHTTPs map[string]*CheckHTTP + + // checkTTLs maps the check ID to an associated check TTL + checkTTLs map[string]*CheckTTL + + // checkLock protects updates to the check* maps + checkLock sync.Mutex // eventCh is used to receive user events eventCh chan serf.UserEvent @@ -111,6 +116,7 @@ func Create(config *Config, logOutput io.Writer) (*Agent, error) { logOutput: logOutput, checkMonitors: make(map[string]*CheckMonitor), checkTTLs: make(map[string]*CheckTTL), + checkHTTPs: make(map[string]*CheckHTTP), eventCh: make(chan serf.UserEvent, 1024), eventBuf: make([]*UserEvent, 256), shutdownCh: make(chan struct{}), @@ -382,6 +388,10 @@ func (a *Agent) Shutdown() error { chk.Stop() } + for _, chk := range a.checkHTTPs { + chk.Stop() + } + a.logger.Println("[INFO] agent: requesting shutdown") var err error if a.server != nil { @@ -661,6 +671,26 @@ func (a *Agent) AddCheck(check *structs.HealthCheck, chkType *CheckType, persist ttl.Start() a.checkTTLs[check.CheckID] = ttl + } else if chkType.IsHTTP() { + if existing, ok := a.checkHTTPs[check.CheckID]; ok { + existing.Stop() + } + if chkType.Interval < MinInterval { + a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has interval below minimum of %v", + check.CheckID, MinInterval)) + chkType.Interval = MinInterval + } + + http := &CheckHTTP{ + Notify: &a.state, + CheckID: check.CheckID, + HTTP: chkType.HTTP, + Interval: chkType.Interval, + Logger: a.logger, + } + http.Start() + 
a.checkHTTPs[check.CheckID] = http + } else { if existing, ok := a.checkMonitors[check.CheckID]; ok { existing.Stop() @@ -708,6 +738,10 @@ func (a *Agent) RemoveCheck(checkID string, persist bool) error { check.Stop() delete(a.checkMonitors, checkID) } + if check, ok := a.checkHTTPs[checkID]; ok { + check.Stop() + delete(a.checkHTTPs, checkID) + } if check, ok := a.checkTTLs[checkID]; ok { check.Stop() delete(a.checkTTLs, checkID) diff --git a/command/agent/check.go b/command/agent/check.go index 17e2fb5f0a..2b0153585b 100644 --- a/command/agent/check.go +++ b/command/agent/check.go @@ -4,7 +4,9 @@ import ( "fmt" "github.com/armon/circbuf" "github.com/hashicorp/consul/consul/structs" + "io/ioutil" "log" + "net/http" "os/exec" "sync" "syscall" @@ -23,10 +25,14 @@ const ( ) // CheckType is used to create either the CheckMonitor -// or the CheckTTL. Only one of TTL or Script/Interval -// needs to be provided +// or the CheckTTL. +// Three types are supported: Script, HTTP, and TTL +// Script and HTTP both require Interval +// Only one of the types needs to be provided +// TTL or Script/Interval or HTTP/Interval type CheckType struct { Script string + HTTP string Interval time.Duration TTL time.Duration @@ -36,7 +42,7 @@ type CheckType struct { // Valid checks if the CheckType is valid func (c *CheckType) Valid() bool { - return c.IsTTL() || c.IsMonitor() + return c.IsTTL() || c.IsMonitor() || c.IsHTTP() } // IsTTL checks if this is a TTL type @@ -49,6 +55,11 @@ func (c *CheckType) IsMonitor() bool { return c.Script != "" && c.Interval != 0 } +// IsHTTP checks if this is a HTTP type +func (c *CheckType) IsHTTP() bool { + return c.HTTP != "" && c.Interval != 0 +} + // CheckNotifier interface is used by the CheckMonitor // to notify when a check has a status update. The update // should take care to be idempotent. 
@@ -244,3 +255,106 @@ type persistedCheck struct { Check *structs.HealthCheck ChkType *CheckType } + +// CheckHTTP is used to periodically make an HTTP request to +// determine the health of a given check. +// The check is passing if the response code is in the 2xx range. +// The check is warning if the response code is 429 (Too Many Requests). +// The check is critical if the response code is anything else +// or if the request returns an error +type CheckHTTP struct { + Notify CheckNotifier + CheckID string + HTTP string + Interval time.Duration + Logger *log.Logger + + httpClient *http.Client + stop bool + stopCh chan struct{} + stopLock sync.Mutex +} + +// Start is used to start an HTTP check. +// The check runs until Stop is called +func (c *CheckHTTP) Start() { + c.stopLock.Lock() + defer c.stopLock.Unlock() + + if c.httpClient == nil { + // For long (>=10s) interval checks the http timeout is 10s, otherwise the + // timeout is the interval. This means that a check *should* return + // before the next check begins. + if c.Interval < 10*time.Second { + c.httpClient = &http.Client{Timeout: c.Interval} + } else { + c.httpClient = &http.Client{Timeout: 10 * time.Second} + } + } + + c.stop = false + c.stopCh = make(chan struct{}) + go c.run() +} + +// Stop is used to stop an HTTP check.
+func (c *CheckHTTP) Stop() { + c.stopLock.Lock() + defer c.stopLock.Unlock() + if !c.stop { + c.stop = true + close(c.stopCh) + } +} + +// run is invoked by a goroutine; it calls check every Interval until Stop() is called +func (c *CheckHTTP) run() { + // Get the randomized initial pause time so the first request is delayed + initialPauseTime := randomStagger(c.Interval) + c.Logger.Printf("[DEBUG] agent: pausing %v before first HTTP request of %s", initialPauseTime, c.HTTP) + next := time.After(initialPauseTime) + for { + select { + case <-next: + c.check() + next = time.After(c.Interval) + case <-c.stopCh: + return + } + } +} + +// check is invoked periodically to perform the HTTP check and report the result via Notify +func (c *CheckHTTP) check() { + resp, err := c.httpClient.Get(c.HTTP) + if err != nil { + c.Logger.Printf("[WARN] agent: http request failed '%s': %s", c.HTTP, err) + c.Notify.UpdateCheck(c.CheckID, structs.HealthCritical, err.Error()) + return + } + defer resp.Body.Close() + + if resp.StatusCode >= 200 && resp.StatusCode <= 299 { + // PASSING (2xx): the response body is included in the check output + body, err := ioutil.ReadAll(resp.Body) + if err != nil { + c.Logger.Printf("[WARN] check '%v': Get error while reading body: %s", c.CheckID, err) + body = []byte{} + } + result := fmt.Sprintf("HTTP GET %s: %s Output: %s", c.HTTP, resp.Status, body) + c.Logger.Printf("[DEBUG] agent: http check '%v' is passing: %s", c.CheckID, result) + c.Notify.UpdateCheck(c.CheckID, structs.HealthPassing, result) + + } else if resp.StatusCode == 429 { + // WARNING + // 429 Too Many Requests (RFC 6585) + // The user has sent too many requests in a given amount of time.
+ c.Logger.Printf("[WARN] check '%v' is now warning", c.CheckID) + c.Notify.UpdateCheck(c.CheckID, structs.HealthWarning, resp.Status) + + } else { + // CRITICAL (any other status code) + c.Logger.Printf("[WARN] check '%v' is now critical", c.CheckID) + c.Notify.UpdateCheck(c.CheckID, structs.HealthCritical, resp.Status) + } +} diff --git a/command/agent/check_test.go b/command/agent/check_test.go index 6a6f80afc3..d74994b175 100644 --- a/command/agent/check_test.go +++ b/command/agent/check_test.go @@ -1,8 +1,11 @@ package agent import ( + "fmt" "github.com/hashicorp/consul/consul/structs" "log" + "net/http" + "net/http/httptest" "os" "testing" "time" @@ -160,3 +163,94 @@ func TestCheckTTL(t *testing.T) { t.Fatalf("should be critical %v", mock.state) } } + +func mockHTTPServer(responseCode int) *httptest.Server { + mux := http.NewServeMux() + mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(responseCode) + return + }) + + return httptest.NewServer(mux) +} + +func expectHTTPStatus(t *testing.T, url string, status string) { + mock := &MockNotify{ + state: make(map[string]string), + updates: make(map[string]int), + output: make(map[string]string), + } + check := &CheckHTTP{ + Notify: mock, + CheckID: "foo", + HTTP: url, + Interval: 10 * time.Millisecond, + Logger: log.New(os.Stderr, "", log.LstdFlags), + } + check.Start() + defer check.Stop() + + time.Sleep(50 * time.Millisecond) + + // Should have at least 2 updates + if mock.updates["foo"] < 2 { + t.Fatalf("should have 2 updates %v", mock.updates) + } + + if mock.state["foo"] != status { + t.Fatalf("should be %v %v", status, mock.state) + } +} + +func TestCheckHTTPCritical(t *testing.T) { + // 1xx is outside the passing range. NOTE(review): the fmt.Println below looks like leftover debug output - consider removing it together with the fmt import + + server := mockHTTPServer(150) + fmt.Println(server.URL) + expectHTTPStatus(t, server.URL, "critical") + server.Close() + + // just below the 2xx range + server = mockHTTPServer(199) + expectHTTPStatus(t, server.URL, "critical") + server.Close() + + // just above the 2xx range + server = mockHTTPServer(300) +
expectHTTPStatus(t, server.URL, "critical") + server.Close() + + server = mockHTTPServer(400) + expectHTTPStatus(t, server.URL, "critical") + server.Close() + + server = mockHTTPServer(500) + expectHTTPStatus(t, server.URL, "critical") + server.Close() +} + +func TestCheckHTTPPassing(t *testing.T) { + var server *httptest.Server + + server = mockHTTPServer(200) + expectHTTPStatus(t, server.URL, "passing") + server.Close() + + server = mockHTTPServer(201) + expectHTTPStatus(t, server.URL, "passing") + server.Close() + + server = mockHTTPServer(250) + expectHTTPStatus(t, server.URL, "passing") + server.Close() + + server = mockHTTPServer(299) + expectHTTPStatus(t, server.URL, "passing") + server.Close() +} + +func TestCheckHTTPWarning(t *testing.T) { + server := mockHTTPServer(429) + expectHTTPStatus(t, server.URL, "warning") + server.Close() +} diff --git a/website/source/docs/agent/checks.html.markdown b/website/source/docs/agent/checks.html.markdown index 8a31a04620..2102ebb607 100644 --- a/website/source/docs/agent/checks.html.markdown +++ b/website/source/docs/agent/checks.html.markdown @@ -13,13 +13,18 @@ application level health checks. A health check is considered to be application level if it associated with a service. A check is defined in a configuration file, or added at runtime over the HTTP interface. -There are two different kinds of checks: +There are three different kinds of checks: * Script + Interval - These checks depend on invoking an external application that does the health check and exits with an appropriate exit code, potentially generating some output. A script is paired with an invocation interval (e.g. every 30 seconds). This is similar to the Nagios plugin system. + * HTTP + Interval - These checks make an `HTTP GET` request every Interval (e.g. + every 30 seconds) to the specified URL. The status of the service depends on the HTTP response code. + Any `2xx` code is passing, `429 Too Many Requests` is warning, and anything else is critical.
+ This type of check should be preferred over a script that, for example, uses `curl`. + * Time to Live (TTL) - These checks retain their last known state for a given TTL. The state of the check must be updated periodically over the HTTP interface. If an external system fails to update the status within a given TTL, the check is @@ -43,6 +48,19 @@ A check definition that is a script looks like: } ``` +An HTTP-based check looks like: + +```javascript +{ + "check": { + "id": "api", + "name": "HTTP API on port 5000", + "http": "http://localhost:5000/health", + "interval": "10s" + } +} +``` + A TTL based check is very similar: ```javascript @@ -56,7 +74,7 @@ A TTL based check is very similar: } ``` -Both types of definitions must include a `name`, and may optionally +Each type of definition must include a `name`, and may optionally provide an `id` and `notes` field. The `id` is set to the `name` if not provided. It is required that all checks have a unique ID per node, so if names might conflict then unique ID's should be provided. @@ -102,6 +120,12 @@ key in your configuration file. }, { "id": "chk2", + "name": "/health", + "http": "http://localhost:5000/health", + "interval": "15s" + }, + { + "id": "chk3", "name": "cpu", "script": "/bin/check_cpu", "interval": "10s" diff --git a/website/source/docs/agent/http.html.markdown b/website/source/docs/agent/http.html.markdown index 4ffb070a46..5c620d0d6b 100644 --- a/website/source/docs/agent/http.html.markdown +++ b/website/source/docs/agent/http.html.markdown @@ -422,7 +422,7 @@ The endpoint always returns 200. The register endpoint is used to add a new check to the local agent. There is more documentation on checks [here](/docs/agent/checks.html). -Checks are either a script or TTL type. The agent is responsible for managing +Checks are of script, HTTP, or TTL type. The agent is responsible for managing the status of the check and keeping the Catalog in sync. The register endpoint expects a JSON request body to be PUT.
The request @@ -434,20 +434,25 @@ body must look like: "Name": "Memory utilization", "Notes": "Ensure we don't oversubscribe memory", "Script": "/usr/local/bin/check_mem.py", + "HTTP": "http://example.com", "Interval": "10s", "TTL": "15s" } ``` -The `Name` field is mandatory, as is either `Script` and `Interval` -or `TTL`. Only one of `Script` and `Interval` or `TTL` should be provided. +The `Name` field is mandatory, as is one of `Script`, `HTTP` or `TTL` (only one of them should be provided). +`Script` and `HTTP` also require that `Interval` be set. + If an `ID` is not provided, it is set to `Name`. You cannot have duplicate `ID` entries per agent, so it may be necessary to provide an ID. The `Notes` field is not used by Consul, and is meant to be human readable. If a `Script` is provided, the check type is a script, and Consul will -evaluate the script every `Interval` to update the status. If a `TTL` type -is used, then the TTL update APIs must be used to periodically update +evaluate the script every `Interval` to update the status. + +An `HTTP` check will perform an HTTP GET request to the value of `HTTP` (expected to be a URL) every `Interval`. If the response is any `2xx` code, the check is passing; if the response is `429 Too Many Requests`, the check is warning; otherwise the check is critical. + +If a `TTL` type is used, then the TTL update APIs must be used to periodically update the state of the check. The return code is 200 on success. @@ -515,6 +520,7 @@ body must look like: "Port": 8000, "Check": { "Script": "/usr/local/bin/check_redis.py", + "HTTP": "http://localhost:5000/health", "Interval": "10s", "TTL": "15s" } @@ -523,8 +529,10 @@ body must look like: The `Name` field is mandatory, If an `ID` is not provided, it is set to `Name`. You cannot have duplicate `ID` entries per agent, so it may be necessary to provide an ID. -`Tags`, `Address`, `Port` and `Check` are optional. If `Check` is provided, only one of `Script` and `Interval` -or `TTL` should be provided.
There is more information about checks [here](/docs/agent/checks.html). +`Tags`, `Address`, `Port` and `Check` are optional. +If `Check` is provided, only one of `Script`, `HTTP` or `TTL` should be provided. +`Script` and `HTTP` also require `Interval`. +There is more information about checks [here](/docs/agent/checks.html). The `Address` will default to that of the agent if not provided. The created check will be named "service:\". diff --git a/website/source/docs/agent/services.html.markdown b/website/source/docs/agent/services.html.markdown index 442710ec73..230de7594a 100644 --- a/website/source/docs/agent/services.html.markdown +++ b/website/source/docs/agent/services.html.markdown @@ -55,7 +55,8 @@ node has any failing system-level check, the DNS interface will omit that node from any service query. There is more information about [checks here](/docs/agent/checks.html). The -check must be of the script or TTL type. If it is a script type, `script` and +check must be of the script, HTTP or TTL type. If it is a script type, `script` and +`interval` must be provided. If it is an HTTP type, `http` and `interval` must be provided. If it is a TTL type, then only `ttl` must be provided. The check name is automatically generated as "service:".