Merge pull request #592 from nicholascapo/check-http

command/agent: Add simple HTTP check type
pull/604/head
Armon Dadgar 2015-01-13 12:11:50 -08:00
commit 713d30c73e
6 changed files with 292 additions and 17 deletions

View File

@ -51,11 +51,16 @@ type Agent struct {
state localState state localState
// checkMonitors maps the check ID to an associated monitor // checkMonitors maps the check ID to an associated monitor
// checkTTLs maps the check ID to an associated check TTL
// checkLock protects updates to either
checkMonitors map[string]*CheckMonitor checkMonitors map[string]*CheckMonitor
checkTTLs map[string]*CheckTTL
checkLock sync.Mutex // checkHTTPs maps the check ID to an associated HTTP check
checkHTTPs map[string]*CheckHTTP
// checkTTLs maps the check ID to an associated check TTL
checkTTLs map[string]*CheckTTL
// checkLock protects updates to the check* maps
checkLock sync.Mutex
// eventCh is used to receive user events // eventCh is used to receive user events
eventCh chan serf.UserEvent eventCh chan serf.UserEvent
@ -111,6 +116,7 @@ func Create(config *Config, logOutput io.Writer) (*Agent, error) {
logOutput: logOutput, logOutput: logOutput,
checkMonitors: make(map[string]*CheckMonitor), checkMonitors: make(map[string]*CheckMonitor),
checkTTLs: make(map[string]*CheckTTL), checkTTLs: make(map[string]*CheckTTL),
checkHTTPs: make(map[string]*CheckHTTP),
eventCh: make(chan serf.UserEvent, 1024), eventCh: make(chan serf.UserEvent, 1024),
eventBuf: make([]*UserEvent, 256), eventBuf: make([]*UserEvent, 256),
shutdownCh: make(chan struct{}), shutdownCh: make(chan struct{}),
@ -382,6 +388,10 @@ func (a *Agent) Shutdown() error {
chk.Stop() chk.Stop()
} }
for _, chk := range a.checkHTTPs {
chk.Stop()
}
a.logger.Println("[INFO] agent: requesting shutdown") a.logger.Println("[INFO] agent: requesting shutdown")
var err error var err error
if a.server != nil { if a.server != nil {
@ -661,6 +671,26 @@ func (a *Agent) AddCheck(check *structs.HealthCheck, chkType *CheckType, persist
ttl.Start() ttl.Start()
a.checkTTLs[check.CheckID] = ttl a.checkTTLs[check.CheckID] = ttl
} else if chkType.IsHTTP() {
if existing, ok := a.checkHTTPs[check.CheckID]; ok {
existing.Stop()
}
if chkType.Interval < MinInterval {
a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has interval below minimum of %v",
check.CheckID, MinInterval))
chkType.Interval = MinInterval
}
http := &CheckHTTP{
Notify: &a.state,
CheckID: check.CheckID,
HTTP: chkType.HTTP,
Interval: chkType.Interval,
Logger: a.logger,
}
http.Start()
a.checkHTTPs[check.CheckID] = http
} else { } else {
if existing, ok := a.checkMonitors[check.CheckID]; ok { if existing, ok := a.checkMonitors[check.CheckID]; ok {
existing.Stop() existing.Stop()
@ -708,6 +738,10 @@ func (a *Agent) RemoveCheck(checkID string, persist bool) error {
check.Stop() check.Stop()
delete(a.checkMonitors, checkID) delete(a.checkMonitors, checkID)
} }
if check, ok := a.checkHTTPs[checkID]; ok {
check.Stop()
delete(a.checkHTTPs, checkID)
}
if check, ok := a.checkTTLs[checkID]; ok { if check, ok := a.checkTTLs[checkID]; ok {
check.Stop() check.Stop()
delete(a.checkTTLs, checkID) delete(a.checkTTLs, checkID)

View File

@ -4,7 +4,9 @@ import (
"fmt" "fmt"
"github.com/armon/circbuf" "github.com/armon/circbuf"
"github.com/hashicorp/consul/consul/structs" "github.com/hashicorp/consul/consul/structs"
"io/ioutil"
"log" "log"
"net/http"
"os/exec" "os/exec"
"sync" "sync"
"syscall" "syscall"
@ -23,10 +25,14 @@ const (
) )
// CheckType is used to create either the CheckMonitor // CheckType is used to create either the CheckMonitor
// or the CheckTTL. Only one of TTL or Script/Interval // or the CheckTTL.
// needs to be provided // Three types are supported: Script, HTTP, and TTL
// Script and HTTP both require Interval
// Only one of the types needs to be provided
// TTL or Script/Interval or HTTP/Interval
type CheckType struct { type CheckType struct {
Script string Script string
HTTP string
Interval time.Duration Interval time.Duration
TTL time.Duration TTL time.Duration
@ -36,7 +42,7 @@ type CheckType struct {
// Valid checks if the CheckType is valid // Valid checks if the CheckType is valid
func (c *CheckType) Valid() bool { func (c *CheckType) Valid() bool {
return c.IsTTL() || c.IsMonitor() return c.IsTTL() || c.IsMonitor() || c.IsHTTP()
} }
// IsTTL checks if this is a TTL type // IsTTL checks if this is a TTL type
@ -49,6 +55,11 @@ func (c *CheckType) IsMonitor() bool {
return c.Script != "" && c.Interval != 0 return c.Script != "" && c.Interval != 0
} }
// IsHTTP reports whether this CheckType describes an HTTP check,
// i.e. it has both a non-empty HTTP target and a non-zero Interval.
func (c *CheckType) IsHTTP() bool {
	if c.HTTP == "" {
		return false
	}
	return c.Interval != 0
}
// CheckNotifier interface is used by the CheckMonitor // CheckNotifier interface is used by the CheckMonitor
// to notify when a check has a status update. The update // to notify when a check has a status update. The update
// should take care to be idempotent. // should take care to be idempotent.
@ -244,3 +255,106 @@ type persistedCheck struct {
Check *structs.HealthCheck Check *structs.HealthCheck
ChkType *CheckType ChkType *CheckType
} }
// CheckHTTP is used to periodically make an HTTP GET request to
// determine the health of a given check.
// The check is passing if the response code is any 2xx.
// The check is warning if the response code is 429 (Too Many Requests).
// The check is critical if the response code is anything else
// or if the request returns an error.
type CheckHTTP struct {
Notify CheckNotifier // receives an UpdateCheck call with the result of every request
CheckID string // ID passed to Notify.UpdateCheck and used in log messages
HTTP string // URL that is requested with GET
Interval time.Duration // delay between requests; also used as the client timeout when below 10s
Logger *log.Logger // destination for debug and warning messages
httpClient *http.Client // created by Start when nil
stop bool // set by Stop so the stop channel is closed only once
stopCh chan struct{} // closed by Stop to terminate the run loop
stopLock sync.Mutex // held by Start and Stop while they touch the fields above
}
// Start launches the goroutine that performs the HTTP check.
// The check keeps running until Stop is called.
func (c *CheckHTTP) Start() {
	c.stopLock.Lock()
	defer c.stopLock.Unlock()

	if c.httpClient == nil {
		// The request timeout is capped at 10s and shrinks to the interval
		// for shorter intervals, so that a request *should* have returned
		// before the next one is due.
		timeout := 10 * time.Second
		if c.Interval < timeout {
			timeout = c.Interval
		}
		c.httpClient = &http.Client{Timeout: timeout}
	}

	c.stopCh = make(chan struct{})
	c.stop = false
	go c.run()
}
// Stop is used to stop an HTTP check.
// It is safe to call more than once, and safe to call on a check that
// was never started: in that case there is no stop channel yet, and
// closing a nil channel would panic, so the close is skipped.
func (c *CheckHTTP) Stop() {
	c.stopLock.Lock()
	defer c.stopLock.Unlock()

	if c.stop {
		return
	}
	c.stop = true
	if c.stopCh != nil {
		close(c.stopCh)
	}
}
// run is the body of the check goroutine. It waits a randomized initial
// delay, then performs a check every Interval until the stop channel
// is closed.
func (c *CheckHTTP) run() {
	stagger := randomStagger(c.Interval)
	c.Logger.Printf("[DEBUG] agent: pausing %v before first HTTP request of %s", stagger, c.HTTP)

	due := time.After(stagger)
	for {
		select {
		case <-c.stopCh:
			return
		case <-due:
			c.check()
			// The next wait starts only after the current check returned.
			due = time.After(c.Interval)
		}
	}
}
// check is invoked periodically to perform the HTTP check.
// It issues a GET to c.HTTP and reports the result through c.Notify:
//   - request error:  critical, with the error text as output
//   - 2xx response:   passing, with the status line and body as output
//   - 429 response:   warning, with the status line as output
//   - anything else:  critical, with the status line as output
func (c *CheckHTTP) check() {
	resp, err := c.httpClient.Get(c.HTTP)
	if err != nil {
		c.Logger.Printf("[WARN] agent: http request failed '%s': %s", c.HTTP, err)
		c.Notify.UpdateCheck(c.CheckID, structs.HealthCritical, err.Error())
		return
	}
	defer resp.Body.Close()

	// Read the body to EOF for every status, not only for 2xx: a body that
	// is closed without being fully read may prevent the transport from
	// re-using the keep-alive connection for the next check.
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		c.Logger.Printf("[WARN] check '%v': Get error while reading body: %s", c.CheckID, err)
		body = []byte{}
	}

	switch {
	case resp.StatusCode >= 200 && resp.StatusCode <= 299:
		// PASSING (2xx)
		result := fmt.Sprintf("HTTP GET %s: %s Output: %s", c.HTTP, resp.Status, body)
		c.Logger.Printf("[DEBUG] agent: http check '%v' is passing: %s", c.CheckID, result)
		c.Notify.UpdateCheck(c.CheckID, structs.HealthPassing, result)

	case resp.StatusCode == 429:
		// WARNING
		// 429 Too Many Requests (RFC 6585)
		// The user has sent too many requests in a given amount of time.
		c.Logger.Printf("[WARN] check '%v' is now warning", c.CheckID)
		c.Notify.UpdateCheck(c.CheckID, structs.HealthWarning, resp.Status)

	default:
		// CRITICAL
		c.Logger.Printf("[WARN] check '%v' is now critical", c.CheckID)
		c.Notify.UpdateCheck(c.CheckID, structs.HealthCritical, resp.Status)
	}
}

View File

@ -1,8 +1,11 @@
package agent package agent
import ( import (
"fmt"
"github.com/hashicorp/consul/consul/structs" "github.com/hashicorp/consul/consul/structs"
"log" "log"
"net/http"
"net/http/httptest"
"os" "os"
"testing" "testing"
"time" "time"
@ -160,3 +163,94 @@ func TestCheckTTL(t *testing.T) {
t.Fatalf("should be critical %v", mock.state) t.Fatalf("should be critical %v", mock.state)
} }
} }
// mockHTTPServer starts a test server that answers every request with
// responseCode. The caller is responsible for closing it.
func mockHTTPServer(responseCode int) *httptest.Server {
	reply := func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(responseCode)
	}

	router := http.NewServeMux()
	router.HandleFunc("/", reply)
	return httptest.NewServer(router)
}
// expectHTTPStatus runs a CheckHTTP with a 10ms interval against url for
// 50ms and fails the test unless the mock notifier recorded at least two
// updates for the check and its last recorded state equals status.
func expectHTTPStatus(t *testing.T, url string, status string) {
	notifier := &MockNotify{
		state:   make(map[string]string),
		updates: make(map[string]int),
		output:  make(map[string]string),
	}
	logger := log.New(os.Stderr, "", log.LstdFlags)

	chk := &CheckHTTP{
		Notify:   notifier,
		CheckID:  "foo",
		HTTP:     url,
		Interval: 10 * time.Millisecond,
		Logger:   logger,
	}
	chk.Start()
	defer chk.Stop()

	// Give the check enough time to fire more than once.
	time.Sleep(50 * time.Millisecond)

	// Should have at least 2 updates
	if count := notifier.updates["foo"]; count < 2 {
		t.Fatalf("should have 2 updates %v", notifier.updates)
	}

	if got := notifier.state["foo"]; got != status {
		t.Fatalf("should be %v %v", status, notifier.state)
	}
}
// TestCheckHTTPCritical verifies that response codes outside the 2xx range
// (other than the warning code) are reported as critical.
func TestCheckHTTPCritical(t *testing.T) {
	// The first server's URL is printed to ease debugging.
	first := mockHTTPServer(150)
	fmt.Println(first.URL)
	expectHTTPStatus(t, first.URL, "critical")
	first.Close()

	// 199 and 300 sit just outside the 2xx range on either side;
	// 400 and 500 cover ordinary client and server errors.
	for _, code := range []int{199, 300, 400, 500} {
		srv := mockHTTPServer(code)
		expectHTTPStatus(t, srv.URL, "critical")
		srv.Close()
	}
}
func TestCheckHTTPPassing(t *testing.T) {
var server *httptest.Server
server = mockHTTPServer(200)
expectHTTPStatus(t, server.URL, "passing")
server.Close()
server = mockHTTPServer(201)
expectHTTPStatus(t, server.URL, "passing")
server.Close()
server = mockHTTPServer(250)
expectHTTPStatus(t, server.URL, "passing")
server.Close()
server = mockHTTPServer(299)
expectHTTPStatus(t, server.URL, "passing")
server.Close()
}
// TestCheckHTTPWarning verifies that a 429 (Too Many Requests) response
// is reported as warning.
func TestCheckHTTPWarning(t *testing.T) {
	const tooManyRequests = 429

	srv := mockHTTPServer(tooManyRequests)
	expectHTTPStatus(t, srv.URL, "warning")
	srv.Close()
}

View File

@ -13,13 +13,18 @@ application level health checks. A health check is considered to be application
level if it associated with a service. A check is defined in a configuration file, level if it associated with a service. A check is defined in a configuration file,
or added at runtime over the HTTP interface. or added at runtime over the HTTP interface.
There are two different kinds of checks: There are three different kinds of checks:
* Script + Interval - These checks depend on invoking an external application * Script + Interval - These checks depend on invoking an external application
that does the health check and exits with an appropriate exit code, potentially that does the health check and exits with an appropriate exit code, potentially
generating some output. A script is paired with an invocation interval (e.g. generating some output. A script is paired with an invocation interval (e.g.
every 30 seconds). This is similar to the Nagios plugin system. every 30 seconds). This is similar to the Nagios plugin system.
* HTTP + Interval - These checks make an `HTTP GET` request every Interval (e.g.
every 30 seconds) to the specified URL. The status of the service depends on the HTTP Response Code.
Any `2xx` code is passing, `429 Too Many Requests` is warning, and anything else is critical.
This type of check should be preferred over a script that, for example, uses `curl`.
* Time to Live (TTL) - These checks retain their last known state for a given TTL. * Time to Live (TTL) - These checks retain their last known state for a given TTL.
The state of the check must be updated periodically over the HTTP interface. If an The state of the check must be updated periodically over the HTTP interface. If an
external system fails to update the status within a given TTL, the check is external system fails to update the status within a given TTL, the check is
@ -43,6 +48,19 @@ A check definition that is a script looks like:
} }
``` ```
An HTTP based check looks like:
```javascript
{
"check": {
"id": "api",
"name": "HTTP API on port 5000",
"http": "http://localhost:5000/health",
"interval": "10s"
}
}
```
A TTL based check is very similar: A TTL based check is very similar:
```javascript ```javascript
@ -56,7 +74,7 @@ A TTL based check is very similar:
} }
``` ```
Both types of definitions must include a `name`, and may optionally Each type of definition must include a `name`, and may optionally
provide an `id` and `notes` field. The `id` is set to the `name` if not provide an `id` and `notes` field. The `id` is set to the `name` if not
provided. It is required that all checks have a unique ID per node, so if names provided. It is required that all checks have a unique ID per node, so if names
might conflict then unique ID's should be provided. might conflict then unique ID's should be provided.
@ -102,6 +120,12 @@ key in your configuration file.
}, },
{ {
"id": "chk2", "id": "chk2",
"name": "/health",
"http": "http://localhost:5000/health",
"interval": "15s"
},
{
"id": "chk3",
"name": "cpu", "name": "cpu",
"script": "/bin/check_cpu", "script": "/bin/check_cpu",
"interval": "10s" "interval": "10s"

View File

@ -422,7 +422,7 @@ The endpoint always returns 200.
The register endpoint is used to add a new check to the local agent. The register endpoint is used to add a new check to the local agent.
There is more documentation on checks [here](/docs/agent/checks.html). There is more documentation on checks [here](/docs/agent/checks.html).
Checks are either a script or TTL type. The agent is responsible for managing Checks are of script, HTTP, or TTL type. The agent is responsible for managing
the status of the check and keeping the Catalog in sync. the status of the check and keeping the Catalog in sync.
The register endpoint expects a JSON request body to be PUT. The request The register endpoint expects a JSON request body to be PUT. The request
@ -434,20 +434,25 @@ body must look like:
"Name": "Memory utilization", "Name": "Memory utilization",
"Notes": "Ensure we don't oversubscribe memory", "Notes": "Ensure we don't oversubscribe memory",
"Script": "/usr/local/bin/check_mem.py", "Script": "/usr/local/bin/check_mem.py",
"HTTP": "http://example.com",
"Interval": "10s", "Interval": "10s",
"TTL": "15s" "TTL": "15s"
} }
``` ```
The `Name` field is mandatory, as is either `Script` and `Interval` The `Name` field is mandatory, as is one of `Script`, `HTTP` or `TTL`.
or `TTL`. Only one of `Script` and `Interval` or `TTL` should be provided. `Script` and `HTTP` also require that `Interval` be set.
If an `ID` is not provided, it is set to `Name`. You cannot have duplicate If an `ID` is not provided, it is set to `Name`. You cannot have duplicate
`ID` entries per agent, so it may be necessary to provide an ID. The `Notes` `ID` entries per agent, so it may be necessary to provide an ID. The `Notes`
field is not used by Consul, and is meant to be human readable. field is not used by Consul, and is meant to be human readable.
If a `Script` is provided, the check type is a script, and Consul will If a `Script` is provided, the check type is a script, and Consul will
evaluate the script every `Interval` to update the status. If a `TTL` type evaluate the script every `Interval` to update the status.
is used, then the TTL update APIs must be used to periodically update
An `HTTP` check will perform an HTTP GET request to the value of `HTTP` (expected to be a URL) every `Interval`. If the response is any `2xx` code the check is passing, if the response is `429 Too Many Requests` the check is warning, otherwise the check is critical.
If a `TTL` type is used, then the TTL update APIs must be used to periodically update
the state of the check. the state of the check.
The return code is 200 on success. The return code is 200 on success.
@ -515,6 +520,7 @@ body must look like:
"Port": 8000, "Port": 8000,
"Check": { "Check": {
"Script": "/usr/local/bin/check_redis.py", "Script": "/usr/local/bin/check_redis.py",
"HTTP": "http://localhost:5000/health",
"Interval": "10s", "Interval": "10s",
"TTL": "15s" "TTL": "15s"
} }
@ -523,8 +529,10 @@ body must look like:
The `Name` field is mandatory, If an `ID` is not provided, it is set to `Name`. The `Name` field is mandatory, If an `ID` is not provided, it is set to `Name`.
You cannot have duplicate `ID` entries per agent, so it may be necessary to provide an ID. You cannot have duplicate `ID` entries per agent, so it may be necessary to provide an ID.
`Tags`, `Address`, `Port` and `Check` are optional. If `Check` is provided, only one of `Script` and `Interval` `Tags`, `Address`, `Port` and `Check` are optional.
or `TTL` should be provided. There is more information about checks [here](/docs/agent/checks.html). If `Check` is provided, only one of `Script`, `HTTP` or `TTL` should be provided.
`Script` and `HTTP` also require `Interval`.
There is more information about checks [here](/docs/agent/checks.html).
The `Address` will default to that of the agent if not provided. The `Address` will default to that of the agent if not provided.
The created check will be named "service:\<ServiceId\>". The created check will be named "service:\<ServiceId\>".

View File

@ -55,7 +55,8 @@ node has any failing system-level check, the DNS interface will omit that
node from any service query. node from any service query.
There is more information about [checks here](/docs/agent/checks.html). The There is more information about [checks here](/docs/agent/checks.html). The
check must be of the script or TTL type. If it is a script type, `script` and check must be of the script, HTTP or TTL type. If it is a script type, `script` and
`interval` must be provided. If it is an HTTP type, `http` and
`interval` must be provided. If it is a TTL type, then only `ttl` must be `interval` must be provided. If it is a TTL type, then only `ttl` must be
provided. The check name is automatically generated as "service:<service-id>". provided. The check name is automatically generated as "service:<service-id>".