mirror of https://github.com/hashicorp/consul
Merge pull request #592 from nicholascapo/check-http
command/agent: Add simple HTTP check typepull/604/head
commit
713d30c73e
|
@ -51,11 +51,16 @@ type Agent struct {
|
|||
state localState
|
||||
|
||||
// checkMonitors maps the check ID to an associated monitor
|
||||
// checkTTLs maps the check ID to an associated check TTL
|
||||
// checkLock protects updates to either
|
||||
checkMonitors map[string]*CheckMonitor
|
||||
checkTTLs map[string]*CheckTTL
|
||||
checkLock sync.Mutex
|
||||
|
||||
// checkHTTPs maps the check ID to an associated HTTP check
|
||||
checkHTTPs map[string]*CheckHTTP
|
||||
|
||||
// checkTTLs maps the check ID to an associated check TTL
|
||||
checkTTLs map[string]*CheckTTL
|
||||
|
||||
// checkLock protects updates to the check* maps
|
||||
checkLock sync.Mutex
|
||||
|
||||
// eventCh is used to receive user events
|
||||
eventCh chan serf.UserEvent
|
||||
|
@ -111,6 +116,7 @@ func Create(config *Config, logOutput io.Writer) (*Agent, error) {
|
|||
logOutput: logOutput,
|
||||
checkMonitors: make(map[string]*CheckMonitor),
|
||||
checkTTLs: make(map[string]*CheckTTL),
|
||||
checkHTTPs: make(map[string]*CheckHTTP),
|
||||
eventCh: make(chan serf.UserEvent, 1024),
|
||||
eventBuf: make([]*UserEvent, 256),
|
||||
shutdownCh: make(chan struct{}),
|
||||
|
@ -382,6 +388,10 @@ func (a *Agent) Shutdown() error {
|
|||
chk.Stop()
|
||||
}
|
||||
|
||||
for _, chk := range a.checkHTTPs {
|
||||
chk.Stop()
|
||||
}
|
||||
|
||||
a.logger.Println("[INFO] agent: requesting shutdown")
|
||||
var err error
|
||||
if a.server != nil {
|
||||
|
@ -661,6 +671,26 @@ func (a *Agent) AddCheck(check *structs.HealthCheck, chkType *CheckType, persist
|
|||
ttl.Start()
|
||||
a.checkTTLs[check.CheckID] = ttl
|
||||
|
||||
} else if chkType.IsHTTP() {
|
||||
if existing, ok := a.checkHTTPs[check.CheckID]; ok {
|
||||
existing.Stop()
|
||||
}
|
||||
if chkType.Interval < MinInterval {
|
||||
a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has interval below minimum of %v",
|
||||
check.CheckID, MinInterval))
|
||||
chkType.Interval = MinInterval
|
||||
}
|
||||
|
||||
http := &CheckHTTP{
|
||||
Notify: &a.state,
|
||||
CheckID: check.CheckID,
|
||||
HTTP: chkType.HTTP,
|
||||
Interval: chkType.Interval,
|
||||
Logger: a.logger,
|
||||
}
|
||||
http.Start()
|
||||
a.checkHTTPs[check.CheckID] = http
|
||||
|
||||
} else {
|
||||
if existing, ok := a.checkMonitors[check.CheckID]; ok {
|
||||
existing.Stop()
|
||||
|
@ -708,6 +738,10 @@ func (a *Agent) RemoveCheck(checkID string, persist bool) error {
|
|||
check.Stop()
|
||||
delete(a.checkMonitors, checkID)
|
||||
}
|
||||
if check, ok := a.checkHTTPs[checkID]; ok {
|
||||
check.Stop()
|
||||
delete(a.checkHTTPs, checkID)
|
||||
}
|
||||
if check, ok := a.checkTTLs[checkID]; ok {
|
||||
check.Stop()
|
||||
delete(a.checkTTLs, checkID)
|
||||
|
|
|
@ -4,7 +4,9 @@ import (
|
|||
"fmt"
|
||||
"github.com/armon/circbuf"
|
||||
"github.com/hashicorp/consul/consul/structs"
|
||||
"io/ioutil"
|
||||
"log"
|
||||
"net/http"
|
||||
"os/exec"
|
||||
"sync"
|
||||
"syscall"
|
||||
|
@ -23,10 +25,14 @@ const (
|
|||
)
|
||||
|
||||
// CheckType is used to create either the CheckMonitor
|
||||
// or the CheckTTL. Only one of TTL or Script/Interval
|
||||
// needs to be provided
|
||||
// or the CheckTTL.
|
||||
// Three types are supported: Script, HTTP, and TTL
|
||||
// Script and HTTP both require Interval
|
||||
// Only one of the types needs to be provided
|
||||
// TTL or Script/Interval or HTTP/Interval
|
||||
type CheckType struct {
|
||||
Script string
|
||||
HTTP string
|
||||
Interval time.Duration
|
||||
|
||||
TTL time.Duration
|
||||
|
@ -36,7 +42,7 @@ type CheckType struct {
|
|||
|
||||
// Valid checks if the CheckType is valid
|
||||
func (c *CheckType) Valid() bool {
|
||||
return c.IsTTL() || c.IsMonitor()
|
||||
return c.IsTTL() || c.IsMonitor() || c.IsHTTP()
|
||||
}
|
||||
|
||||
// IsTTL checks if this is a TTL type
|
||||
|
@ -49,6 +55,11 @@ func (c *CheckType) IsMonitor() bool {
|
|||
return c.Script != "" && c.Interval != 0
|
||||
}
|
||||
|
||||
// IsHTTP checks if this is a HTTP type
|
||||
func (c *CheckType) IsHTTP() bool {
|
||||
return c.HTTP != "" && c.Interval != 0
|
||||
}
|
||||
|
||||
// CheckNotifier interface is used by the CheckMonitor
|
||||
// to notify when a check has a status update. The update
|
||||
// should take care to be idempotent.
|
||||
|
@ -244,3 +255,106 @@ type persistedCheck struct {
|
|||
Check *structs.HealthCheck
|
||||
ChkType *CheckType
|
||||
}
|
||||
|
||||
// CheckHTTP is used to periodically make an HTTP request to
|
||||
// determine the health of a given check.
|
||||
// The check is passing if the response code is 200.
|
||||
// The check is warning if the response code is 503.
|
||||
// The check is critical if the response code is anything else
|
||||
// or if the request returns an error
|
||||
type CheckHTTP struct {
|
||||
Notify CheckNotifier
|
||||
CheckID string
|
||||
HTTP string
|
||||
Interval time.Duration
|
||||
Logger *log.Logger
|
||||
|
||||
httpClient *http.Client
|
||||
stop bool
|
||||
stopCh chan struct{}
|
||||
stopLock sync.Mutex
|
||||
}
|
||||
|
||||
// Start is used to start an HTTP check.
|
||||
// The check runs until stop is called
|
||||
func (c *CheckHTTP) Start() {
|
||||
c.stopLock.Lock()
|
||||
defer c.stopLock.Unlock()
|
||||
|
||||
if c.httpClient == nil {
|
||||
// For long (>10s) interval checks the http timeout is 10s, otherwise the
|
||||
// timeout is the interval. This means that a check *should* return
|
||||
// before the next check begins.
|
||||
if c.Interval < 10*time.Second {
|
||||
c.httpClient = &http.Client{Timeout: c.Interval}
|
||||
} else {
|
||||
c.httpClient = &http.Client{Timeout: 10 * time.Second}
|
||||
}
|
||||
}
|
||||
|
||||
c.stop = false
|
||||
c.stopCh = make(chan struct{})
|
||||
go c.run()
|
||||
}
|
||||
|
||||
// Stop is used to stop an HTTP check.
|
||||
func (c *CheckHTTP) Stop() {
|
||||
c.stopLock.Lock()
|
||||
defer c.stopLock.Unlock()
|
||||
if !c.stop {
|
||||
c.stop = true
|
||||
close(c.stopCh)
|
||||
}
|
||||
}
|
||||
|
||||
// run is invoked by a goroutine to run until Stop() is called
|
||||
func (c *CheckHTTP) run() {
|
||||
// Get the randomized initial pause time
|
||||
initialPauseTime := randomStagger(c.Interval)
|
||||
c.Logger.Printf("[DEBUG] agent: pausing %v before first HTTP request of %s", initialPauseTime, c.HTTP)
|
||||
next := time.After(initialPauseTime)
|
||||
for {
|
||||
select {
|
||||
case <-next:
|
||||
c.check()
|
||||
next = time.After(c.Interval)
|
||||
case <-c.stopCh:
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// check is invoked periodically to perform the HTTP check
|
||||
func (c *CheckHTTP) check() {
|
||||
resp, err := c.httpClient.Get(c.HTTP)
|
||||
if err != nil {
|
||||
c.Logger.Printf("[WARN] agent: http request failed '%s': %s", c.HTTP, err)
|
||||
c.Notify.UpdateCheck(c.CheckID, structs.HealthCritical, err.Error())
|
||||
return
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode >= 200 && resp.StatusCode <= 299 {
|
||||
// PASSING (2xx)
|
||||
body, err := ioutil.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
c.Logger.Printf("[WARN] check '%v': Get error while reading body: %s", c.CheckID, err)
|
||||
body = []byte{}
|
||||
}
|
||||
result := fmt.Sprintf("HTTP GET %s: %s Output: %s", c.HTTP, resp.Status, body)
|
||||
c.Logger.Printf("[DEBUG] agent: http check '%v' is passing: %s", c.CheckID, result)
|
||||
c.Notify.UpdateCheck(c.CheckID, structs.HealthPassing, result)
|
||||
|
||||
} else if resp.StatusCode == 429 {
|
||||
// WARNING
|
||||
// 429 Too Many Requests (RFC 6585)
|
||||
// The user has sent too many requests in a given amount of time.
|
||||
c.Logger.Printf("[WARN] check '%v' is now warning", c.CheckID)
|
||||
c.Notify.UpdateCheck(c.CheckID, structs.HealthWarning, resp.Status)
|
||||
|
||||
} else {
|
||||
// CRITICAL
|
||||
c.Logger.Printf("[WARN] check '%v' is now critical", c.CheckID)
|
||||
c.Notify.UpdateCheck(c.CheckID, structs.HealthCritical, resp.Status)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,8 +1,11 @@
|
|||
package agent
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/hashicorp/consul/consul/structs"
|
||||
"log"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"testing"
|
||||
"time"
|
||||
|
@ -160,3 +163,94 @@ func TestCheckTTL(t *testing.T) {
|
|||
t.Fatalf("should be critical %v", mock.state)
|
||||
}
|
||||
}
|
||||
|
||||
// mockHTTPServer starts a test server whose handler answers with
// responseCode and writes no body. The caller is responsible for
// closing the returned server.
func mockHTTPServer(responseCode int) *httptest.Server {
	respond := func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(responseCode)
	}

	router := http.NewServeMux()
	router.Handle("/", http.HandlerFunc(respond))
	return httptest.NewServer(router)
}
|
||||
|
||||
func expectHTTPStatus(t *testing.T, url string, status string) {
|
||||
mock := &MockNotify{
|
||||
state: make(map[string]string),
|
||||
updates: make(map[string]int),
|
||||
output: make(map[string]string),
|
||||
}
|
||||
check := &CheckHTTP{
|
||||
Notify: mock,
|
||||
CheckID: "foo",
|
||||
HTTP: url,
|
||||
Interval: 10 * time.Millisecond,
|
||||
Logger: log.New(os.Stderr, "", log.LstdFlags),
|
||||
}
|
||||
check.Start()
|
||||
defer check.Stop()
|
||||
|
||||
time.Sleep(50 * time.Millisecond)
|
||||
|
||||
// Should have at least 2 updates
|
||||
if mock.updates["foo"] < 2 {
|
||||
t.Fatalf("should have 2 updates %v", mock.updates)
|
||||
}
|
||||
|
||||
if mock.state["foo"] != status {
|
||||
t.Fatalf("should be %v %v", status, mock.state)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCheckHTTPCritical(t *testing.T) {
|
||||
// var server *httptest.Server
|
||||
|
||||
server := mockHTTPServer(150)
|
||||
fmt.Println(server.URL)
|
||||
expectHTTPStatus(t, server.URL, "critical")
|
||||
server.Close()
|
||||
|
||||
// 2xx - 1
|
||||
server = mockHTTPServer(199)
|
||||
expectHTTPStatus(t, server.URL, "critical")
|
||||
server.Close()
|
||||
|
||||
// 2xx + 1
|
||||
server = mockHTTPServer(300)
|
||||
expectHTTPStatus(t, server.URL, "critical")
|
||||
server.Close()
|
||||
|
||||
server = mockHTTPServer(400)
|
||||
expectHTTPStatus(t, server.URL, "critical")
|
||||
server.Close()
|
||||
|
||||
server = mockHTTPServer(500)
|
||||
expectHTTPStatus(t, server.URL, "critical")
|
||||
server.Close()
|
||||
}
|
||||
|
||||
func TestCheckHTTPPassing(t *testing.T) {
|
||||
var server *httptest.Server
|
||||
|
||||
server = mockHTTPServer(200)
|
||||
expectHTTPStatus(t, server.URL, "passing")
|
||||
server.Close()
|
||||
|
||||
server = mockHTTPServer(201)
|
||||
expectHTTPStatus(t, server.URL, "passing")
|
||||
server.Close()
|
||||
|
||||
server = mockHTTPServer(250)
|
||||
expectHTTPStatus(t, server.URL, "passing")
|
||||
server.Close()
|
||||
|
||||
server = mockHTTPServer(299)
|
||||
expectHTTPStatus(t, server.URL, "passing")
|
||||
server.Close()
|
||||
}
|
||||
|
||||
func TestCheckHTTPWarning(t *testing.T) {
|
||||
server := mockHTTPServer(429)
|
||||
expectHTTPStatus(t, server.URL, "warning")
|
||||
server.Close()
|
||||
}
|
||||
|
|
|
@ -13,13 +13,18 @@ application level health checks. A health check is considered to be application
|
|||
level if it is associated with a service. A check is defined in a configuration file,
|
||||
or added at runtime over the HTTP interface.
|
||||
|
||||
There are two different kinds of checks:
|
||||
There are three different kinds of checks:
|
||||
|
||||
* Script + Interval - These checks depend on invoking an external application
|
||||
that does the health check and exits with an appropriate exit code, potentially
|
||||
generating some output. A script is paired with an invocation interval (e.g.
|
||||
every 30 seconds). This is similar to the Nagios plugin system.
|
||||
|
||||
* HTTP + Interval - These checks make an `HTTP GET` request every Interval (e.g.
|
||||
every 30 seconds) to the specified URL. The status of the service depends on the HTTP Response Code.
|
||||
Any `2xx` code is passing, `429 Too Many Requests` is warning, and anything else is critical.
|
||||
This type of check should be preferred over a script that, for example, uses `curl`.
|
||||
|
||||
* Time to Live (TTL) - These checks retain their last known state for a given TTL.
|
||||
The state of the check must be updated periodically over the HTTP interface. If an
|
||||
external system fails to update the status within a given TTL, the check is
|
||||
|
@ -43,6 +48,19 @@ A check definition that is a script looks like:
|
|||
}
|
||||
```
|
||||
|
||||
An HTTP based check looks like:
|
||||
|
||||
```javascript
|
||||
{
|
||||
"check": {
|
||||
"id": "api",
|
||||
"name": "HTTP API on port 5000",
|
||||
"http": "http://localhost:5000/health",
|
||||
"interval": "10s"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
A TTL based check is very similar:
|
||||
|
||||
```javascript
|
||||
|
@ -56,7 +74,7 @@ A TTL based check is very similar:
|
|||
}
|
||||
```
|
||||
|
||||
Both types of definitions must include a `name`, and may optionally
|
||||
Each type of definition must include a `name`, and may optionally
|
||||
provide an `id` and `notes` field. The `id` is set to the `name` if not
|
||||
provided. It is required that all checks have a unique ID per node, so if names
|
||||
might conflict, then unique IDs should be provided.
|
||||
|
@ -102,6 +120,12 @@ key in your configuration file.
|
|||
},
|
||||
{
|
||||
"id": "chk2",
|
||||
"name": "/health",
|
||||
"http": "http://localhost:5000/health",
|
||||
"interval": "15s"
|
||||
},
|
||||
{
|
||||
"id": "chk3",
|
||||
"name": "cpu",
|
||||
"script": "/bin/check_cpu",
|
||||
"interval": "10s"
|
||||
|
|
|
@ -422,7 +422,7 @@ The endpoint always returns 200.
|
|||
|
||||
The register endpoint is used to add a new check to the local agent.
|
||||
There is more documentation on checks [here](/docs/agent/checks.html).
|
||||
Checks are either a script or TTL type. The agent is responsible for managing
|
||||
Checks are of script, HTTP, or TTL type. The agent is responsible for managing
|
||||
the status of the check and keeping the Catalog in sync.
|
||||
|
||||
The register endpoint expects a JSON request body to be PUT. The request
|
||||
|
@ -434,20 +434,25 @@ body must look like:
|
|||
"Name": "Memory utilization",
|
||||
"Notes": "Ensure we don't oversubscribe memory",
|
||||
"Script": "/usr/local/bin/check_mem.py",
|
||||
"HTTP": "http://example.com",
|
||||
"Interval": "10s",
|
||||
"TTL": "15s"
|
||||
}
|
||||
```
|
||||
|
||||
The `Name` field is mandatory, as is either `Script` and `Interval`
|
||||
or `TTL`. Only one of `Script` and `Interval` or `TTL` should be provided.
|
||||
The `Name` field is mandatory, as is one of `Script`, `HTTP` or `TTL`.
|
||||
`Script` and `HTTP` also require that `Interval` be set.
|
||||
|
||||
If an `ID` is not provided, it is set to `Name`. You cannot have duplicate
|
||||
`ID` entries per agent, so it may be necessary to provide an ID. The `Notes`
|
||||
field is not used by Consul, and is meant to be human readable.
|
||||
|
||||
If a `Script` is provided, the check type is a script, and Consul will
|
||||
evaluate the script every `Interval` to update the status. If a `TTL` type
|
||||
is used, then the TTL update APIs must be used to periodically update
|
||||
evaluate the script every `Interval` to update the status.
|
||||
|
||||
An `HTTP` check will perform an HTTP GET request to the value of `HTTP` (expected to be a URL) every `Interval`. If the response is any `2xx` code, the check is passing; if the response is `429 Too Many Requests`, the check is warning; otherwise the check is critical.
|
||||
|
||||
If a `TTL` type is used, then the TTL update APIs must be used to periodically update
|
||||
the state of the check.
|
||||
|
||||
The return code is 200 on success.
|
||||
|
@ -515,6 +520,7 @@ body must look like:
|
|||
"Port": 8000,
|
||||
"Check": {
|
||||
"Script": "/usr/local/bin/check_redis.py",
|
||||
"HTTP": "http://localhost:5000/health",
|
||||
"Interval": "10s",
|
||||
"TTL": "15s"
|
||||
}
|
||||
|
@ -523,8 +529,10 @@ body must look like:
|
|||
|
||||
The `Name` field is mandatory. If an `ID` is not provided, it is set to `Name`.
|
||||
You cannot have duplicate `ID` entries per agent, so it may be necessary to provide an ID.
|
||||
`Tags`, `Address`, `Port` and `Check` are optional. If `Check` is provided, only one of `Script` and `Interval`
|
||||
or `TTL` should be provided. There is more information about checks [here](/docs/agent/checks.html).
|
||||
`Tags`, `Address`, `Port` and `Check` are optional.
|
||||
If `Check` is provided, only one of `Script`, `HTTP` or `TTL` should be provided.
|
||||
`Script` and `HTTP` also require `Interval`.
|
||||
There is more information about checks [here](/docs/agent/checks.html).
|
||||
The `Address` will default to that of the agent if not provided.
|
||||
|
||||
The created check will be named "service:\<ServiceId\>".
|
||||
|
|
|
@ -55,7 +55,8 @@ node has any failing system-level check, the DNS interface will omit that
|
|||
node from any service query.
|
||||
|
||||
There is more information about [checks here](/docs/agent/checks.html). The
|
||||
check must be of the script or TTL type. If it is a script type, `script` and
|
||||
check must be of the script, HTTP or TTL type. If it is a script type, `script` and
|
||||
`interval` must be provided. If it is an HTTP type, `http` and
|
||||
`interval` must be provided. If it is a TTL type, then only `ttl` must be
|
||||
provided. The check name is automatically generated as "service:<service-id>".
|
||||
|
||||
|
|
Loading…
Reference in New Issue