mirror of https://github.com/hashicorp/consul
commit
4a9b91f2a2
|
@ -68,6 +68,7 @@ type AgentServiceCheck struct {
|
||||||
Timeout string `json:",omitempty"`
|
Timeout string `json:",omitempty"`
|
||||||
TTL string `json:",omitempty"`
|
TTL string `json:",omitempty"`
|
||||||
HTTP string `json:",omitempty"`
|
HTTP string `json:",omitempty"`
|
||||||
|
TCP string `json:",omitempty"`
|
||||||
Status string `json:",omitempty"`
|
Status string `json:",omitempty"`
|
||||||
}
|
}
|
||||||
type AgentServiceChecks []*AgentServiceCheck
|
type AgentServiceChecks []*AgentServiceCheck
|
||||||
|
|
|
@ -75,6 +75,9 @@ type Agent struct {
|
||||||
// checkHTTPs maps the check ID to an associated HTTP check
|
// checkHTTPs maps the check ID to an associated HTTP check
|
||||||
checkHTTPs map[string]*CheckHTTP
|
checkHTTPs map[string]*CheckHTTP
|
||||||
|
|
||||||
|
// checkTCPs maps the check ID to an associated TCP check
|
||||||
|
checkTCPs map[string]*CheckTCP
|
||||||
|
|
||||||
// checkTTLs maps the check ID to an associated check TTL
|
// checkTTLs maps the check ID to an associated check TTL
|
||||||
checkTTLs map[string]*CheckTTL
|
checkTTLs map[string]*CheckTTL
|
||||||
|
|
||||||
|
@ -145,6 +148,7 @@ func Create(config *Config, logOutput io.Writer) (*Agent, error) {
|
||||||
checkMonitors: make(map[string]*CheckMonitor),
|
checkMonitors: make(map[string]*CheckMonitor),
|
||||||
checkTTLs: make(map[string]*CheckTTL),
|
checkTTLs: make(map[string]*CheckTTL),
|
||||||
checkHTTPs: make(map[string]*CheckHTTP),
|
checkHTTPs: make(map[string]*CheckHTTP),
|
||||||
|
checkTCPs: make(map[string]*CheckTCP),
|
||||||
eventCh: make(chan serf.UserEvent, 1024),
|
eventCh: make(chan serf.UserEvent, 1024),
|
||||||
eventBuf: make([]*UserEvent, 256),
|
eventBuf: make([]*UserEvent, 256),
|
||||||
shutdownCh: make(chan struct{}),
|
shutdownCh: make(chan struct{}),
|
||||||
|
@ -440,6 +444,10 @@ func (a *Agent) Shutdown() error {
|
||||||
chk.Stop()
|
chk.Stop()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for _, chk := range a.checkTCPs {
|
||||||
|
chk.Stop()
|
||||||
|
}
|
||||||
|
|
||||||
a.logger.Println("[INFO] agent: requesting shutdown")
|
a.logger.Println("[INFO] agent: requesting shutdown")
|
||||||
var err error
|
var err error
|
||||||
if a.server != nil {
|
if a.server != nil {
|
||||||
|
@ -801,6 +809,27 @@ func (a *Agent) AddCheck(check *structs.HealthCheck, chkType *CheckType, persist
|
||||||
http.Start()
|
http.Start()
|
||||||
a.checkHTTPs[check.CheckID] = http
|
a.checkHTTPs[check.CheckID] = http
|
||||||
|
|
||||||
|
} else if chkType.IsTCP() {
|
||||||
|
if existing, ok := a.checkTCPs[check.CheckID]; ok {
|
||||||
|
existing.Stop()
|
||||||
|
}
|
||||||
|
if chkType.Interval < MinInterval {
|
||||||
|
a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has interval below minimum of %v",
|
||||||
|
check.CheckID, MinInterval))
|
||||||
|
chkType.Interval = MinInterval
|
||||||
|
}
|
||||||
|
|
||||||
|
tcp := &CheckTCP{
|
||||||
|
Notify: &a.state,
|
||||||
|
CheckID: check.CheckID,
|
||||||
|
TCP: chkType.TCP,
|
||||||
|
Interval: chkType.Interval,
|
||||||
|
Timeout: chkType.Timeout,
|
||||||
|
Logger: a.logger,
|
||||||
|
}
|
||||||
|
tcp.Start()
|
||||||
|
a.checkTCPs[check.CheckID] = tcp
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
if existing, ok := a.checkMonitors[check.CheckID]; ok {
|
if existing, ok := a.checkMonitors[check.CheckID]; ok {
|
||||||
existing.Stop()
|
existing.Stop()
|
||||||
|
@ -857,6 +886,10 @@ func (a *Agent) RemoveCheck(checkID string, persist bool) error {
|
||||||
check.Stop()
|
check.Stop()
|
||||||
delete(a.checkHTTPs, checkID)
|
delete(a.checkHTTPs, checkID)
|
||||||
}
|
}
|
||||||
|
if check, ok := a.checkTCPs[checkID]; ok {
|
||||||
|
check.Stop()
|
||||||
|
delete(a.checkTCPs, checkID)
|
||||||
|
}
|
||||||
if check, ok := a.checkTTLs[checkID]; ok {
|
if check, ok := a.checkTTLs[checkID]; ok {
|
||||||
check.Stop()
|
check.Stop()
|
||||||
delete(a.checkTTLs, checkID)
|
delete(a.checkTTLs, checkID)
|
||||||
|
|
|
@ -4,6 +4,7 @@ import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"io/ioutil"
|
"io/ioutil"
|
||||||
"log"
|
"log"
|
||||||
|
"net"
|
||||||
"net/http"
|
"net/http"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"sync"
|
"sync"
|
||||||
|
@ -31,13 +32,14 @@ const (
|
||||||
|
|
||||||
// CheckType is used to create either the CheckMonitor
|
// CheckType is used to create either the CheckMonitor
|
||||||
// or the CheckTTL.
|
// or the CheckTTL.
|
||||||
// Three types are supported: Script, HTTP, and TTL
|
// Four types are supported: Script, HTTP, TCP and TTL
|
||||||
// Script and HTTP both require Interval
|
// Script, HTTP and TCP all require Interval
|
||||||
// Only one of the types needs to be provided
|
// Only one of the types needs to be provided
|
||||||
// TTL or Script/Interval or HTTP/Interval
|
// TTL or Script/Interval or HTTP/Interval or TCP/Interval
|
||||||
type CheckType struct {
|
type CheckType struct {
|
||||||
Script string
|
Script string
|
||||||
HTTP string
|
HTTP string
|
||||||
|
TCP string
|
||||||
Interval time.Duration
|
Interval time.Duration
|
||||||
|
|
||||||
Timeout time.Duration
|
Timeout time.Duration
|
||||||
|
@ -51,7 +53,7 @@ type CheckTypes []*CheckType
|
||||||
|
|
||||||
// Valid checks if the CheckType is valid
|
// Valid checks if the CheckType is valid
|
||||||
func (c *CheckType) Valid() bool {
|
func (c *CheckType) Valid() bool {
|
||||||
return c.IsTTL() || c.IsMonitor() || c.IsHTTP()
|
return c.IsTTL() || c.IsMonitor() || c.IsHTTP() || c.IsTCP()
|
||||||
}
|
}
|
||||||
|
|
||||||
// IsTTL checks if this is a TTL type
|
// IsTTL checks if this is a TTL type
|
||||||
|
@ -69,6 +71,11 @@ func (c *CheckType) IsHTTP() bool {
|
||||||
return c.HTTP != "" && c.Interval != 0
|
return c.HTTP != "" && c.Interval != 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// IsTCP checks if this is a TCP type
|
||||||
|
func (c *CheckType) IsTCP() bool {
|
||||||
|
return c.TCP != "" && c.Interval != 0
|
||||||
|
}
|
||||||
|
|
||||||
// CheckNotifier interface is used by the CheckMonitor
|
// CheckNotifier interface is used by the CheckMonitor
|
||||||
// to notify when a check has a status update. The update
|
// to notify when a check has a status update. The update
|
||||||
// should take care to be idempotent.
|
// should take care to be idempotent.
|
||||||
|
@ -402,3 +409,86 @@ func (c *CheckHTTP) check() {
|
||||||
c.Notify.UpdateCheck(c.CheckID, structs.HealthCritical, result)
|
c.Notify.UpdateCheck(c.CheckID, structs.HealthCritical, result)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// CheckTCP is used to periodically make a TCP connection to
|
||||||
|
// determine the health of a given check.
|
||||||
|
// The check is passing if the connection succeeds.
|
||||||
|
// The check is critical if the connection returns an error.
|
||||||
|
type CheckTCP struct {
|
||||||
|
Notify CheckNotifier // receives the status update after each check
|
||||||
|
CheckID string // ID reported to Notify with every update
|
||||||
|
TCP string // address passed to the dialer for each connection attempt
|
||||||
|
Interval time.Duration // delay between consecutive checks
|
||||||
|
Timeout time.Duration // dial timeout; applied only when > 0 and < Interval
|
||||||
|
Logger *log.Logger // destination for check log messages
|
||||||
|
|
||||||
|
dialer *net.Dialer // created by Start when nil
|
||||||
|
stop bool // set to true by Stop; reset to false by Start
|
||||||
|
stopCh chan struct{} // closed by Stop to end the run loop
|
||||||
|
stopLock sync.Mutex // held by Start and Stop while they touch stop and stopCh
|
||||||
|
}
|
||||||
|
|
||||||
|
// Start is used to start a TCP check.
|
||||||
|
// The check runs until stop is called
|
||||||
|
func (c *CheckTCP) Start() {
|
||||||
|
c.stopLock.Lock()
|
||||||
|
defer c.stopLock.Unlock()
|
||||||
|
|
||||||
|
if c.dialer == nil {
|
||||||
|
// Create the socket dialer
|
||||||
|
c.dialer = &net.Dialer{DualStack: true}
|
||||||
|
|
||||||
|
// For long (>10s) interval checks the socket timeout is 10s, otherwise
|
||||||
|
// the timeout is the interval. This means that a check *should* return
|
||||||
|
// before the next check begins.
|
||||||
|
if c.Timeout > 0 && c.Timeout < c.Interval {
|
||||||
|
c.dialer.Timeout = c.Timeout
|
||||||
|
} else if c.Interval < 10*time.Second {
|
||||||
|
c.dialer.Timeout = c.Interval
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
c.stop = false
|
||||||
|
c.stopCh = make(chan struct{})
|
||||||
|
go c.run()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Stop is used to stop a TCP check.
|
||||||
|
func (c *CheckTCP) Stop() {
|
||||||
|
c.stopLock.Lock()
|
||||||
|
defer c.stopLock.Unlock()
|
||||||
|
if !c.stop {
|
||||||
|
c.stop = true
|
||||||
|
close(c.stopCh)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// run is invoked by a goroutine to run until Stop() is called
|
||||||
|
func (c *CheckTCP) run() {
|
||||||
|
// Get the randomized initial pause time
|
||||||
|
initialPauseTime := randomStagger(c.Interval)
|
||||||
|
c.Logger.Printf("[DEBUG] agent: pausing %v before first socket connection of %s", initialPauseTime, c.TCP)
|
||||||
|
next := time.After(initialPauseTime)
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-next:
|
||||||
|
c.check()
|
||||||
|
next = time.After(c.Interval)
|
||||||
|
case <-c.stopCh:
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// check is invoked periodically to perform the TCP check
|
||||||
|
func (c *CheckTCP) check() {
|
||||||
|
conn, err := c.dialer.Dial(`tcp`, c.TCP)
|
||||||
|
if err != nil {
|
||||||
|
c.Logger.Printf("[WARN] agent: socket connection failed '%s': %s", c.TCP, err)
|
||||||
|
c.Notify.UpdateCheck(c.CheckID, structs.HealthCritical, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
conn.Close()
|
||||||
|
c.Logger.Printf("[DEBUG] agent: check '%v' is passing", c.CheckID)
|
||||||
|
c.Notify.UpdateCheck(c.CheckID, structs.HealthPassing, fmt.Sprintf("TCP connect %s: Success", c.TCP))
|
||||||
|
}
|
||||||
|
|
|
@ -3,6 +3,7 @@ package agent
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"log"
|
"log"
|
||||||
|
"net"
|
||||||
"net/http"
|
"net/http"
|
||||||
"net/http/httptest"
|
"net/http/httptest"
|
||||||
"os"
|
"os"
|
||||||
|
@ -321,3 +322,74 @@ func TestCheckHTTP_disablesKeepAlives(t *testing.T) {
|
||||||
t.Fatalf("should have disabled keepalives")
|
t.Fatalf("should have disabled keepalives")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// mockTCPServer opens a listener on port 0 of a loopback address for the
// given network: "tcp6" binds to [::1], anything else to 127.0.0.1.
// It panics if the listener cannot be created.
func mockTCPServer(network string) net.Listener {
	addr := `127.0.0.1:0`
	if network == `tcp6` {
		addr = `[::1]:0`
	}

	ln, err := net.Listen(network, addr)
	if err != nil {
		panic(err)
	}
	return ln
}
|
||||||
|
|
||||||
|
func expectTCPStatus(t *testing.T, tcp string, status string) {
|
||||||
|
mock := &MockNotify{
|
||||||
|
state: make(map[string]string),
|
||||||
|
updates: make(map[string]int),
|
||||||
|
output: make(map[string]string),
|
||||||
|
}
|
||||||
|
check := &CheckTCP{
|
||||||
|
Notify: mock,
|
||||||
|
CheckID: "foo",
|
||||||
|
TCP: tcp,
|
||||||
|
Interval: 10 * time.Millisecond,
|
||||||
|
Logger: log.New(os.Stderr, "", log.LstdFlags),
|
||||||
|
}
|
||||||
|
check.Start()
|
||||||
|
defer check.Stop()
|
||||||
|
|
||||||
|
time.Sleep(50 * time.Millisecond)
|
||||||
|
|
||||||
|
// Should have at least 2 updates
|
||||||
|
if mock.updates["foo"] < 2 {
|
||||||
|
t.Fatalf("should have 2 updates %v", mock.updates)
|
||||||
|
}
|
||||||
|
|
||||||
|
if mock.state["foo"] != status {
|
||||||
|
t.Fatalf("should be %v %v", status, mock.state)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestCheckTCPCritical(t *testing.T) {
|
||||||
|
var (
|
||||||
|
tcpServer net.Listener
|
||||||
|
)
|
||||||
|
|
||||||
|
tcpServer = mockTCPServer(`tcp`)
|
||||||
|
expectTCPStatus(t, `127.0.0.1:0`, "critical")
|
||||||
|
tcpServer.Close()
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestCheckTCPPassing(t *testing.T) {
|
||||||
|
var (
|
||||||
|
tcpServer net.Listener
|
||||||
|
)
|
||||||
|
|
||||||
|
tcpServer = mockTCPServer(`tcp`)
|
||||||
|
expectTCPStatus(t, tcpServer.Addr().String(), "passing")
|
||||||
|
tcpServer.Close()
|
||||||
|
|
||||||
|
tcpServer = mockTCPServer(`tcp6`)
|
||||||
|
expectTCPStatus(t, tcpServer.Addr().String(), "passing")
|
||||||
|
tcpServer.Close()
|
||||||
|
}
|
||||||
|
|
|
@ -31,6 +31,20 @@ There are three different kinds of checks:
|
||||||
It is possible to configure a custom HTTP check timeout value by specifying
|
It is possible to configure a custom HTTP check timeout value by specifying
|
||||||
the `timeout` field in the check definition.
|
the `timeout` field in the check definition.
|
||||||
|
|
||||||
|
* TCP + Interval - These checks make a TCP connection attempt every Interval
|
||||||
|
(e.g. every 30 seconds) to the specified IP/hostname and port. The status of
|
||||||
|
the service depends on whether the connection attempt is successful (i.e., the
|
||||||
|
port is currently accepting connections). If the connection is accepted, the
|
||||||
|
status is `passing`, otherwise the status is `critical`. In the case of a
|
||||||
|
hostname that resolves to both IPv4 and IPv6 addresses, an attempt will be
|
||||||
|
made to both addresses, and the first successful connection attempt will
|
||||||
|
result in a successful check. This type of check should be preferred over a
|
||||||
|
script that uses `netcat` or another external process to check a simple socket
|
||||||
|
operation. By default, TCP checks will be configured with a request timeout
|
||||||
|
equal to the check interval, with a max of 10 seconds. It is possible to
|
||||||
|
configure a custom TCP check timeout value by specifying the `timeout` field
|
||||||
|
in the check definition.
|
||||||
|
|
||||||
* <a name="TTL"></a>Time to Live (TTL) - These checks retain their last known state for a given TTL.
|
* <a name="TTL"></a>Time to Live (TTL) - These checks retain their last known state for a given TTL.
|
||||||
The state of the check must be updated periodically over the HTTP interface. If an
|
The state of the check must be updated periodically over the HTTP interface. If an
|
||||||
external system fails to update the status within a given TTL, the check is
|
external system fails to update the status within a given TTL, the check is
|
||||||
|
@ -75,6 +89,20 @@ A HTTP check:
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
A TCP check:
|
||||||
|
|
||||||
|
```javascript
|
||||||
|
{
|
||||||
|
"check": {
|
||||||
|
"id": "ssh",
|
||||||
|
"name": "SSH TCP on port 22",
|
||||||
|
"tcp": "localhost:22",
|
||||||
|
"interval": "10s",
|
||||||
|
"timeout": "1s"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
A TTL check:
|
A TTL check:
|
||||||
|
|
||||||
```javascript
|
```javascript
|
||||||
|
@ -102,7 +130,7 @@ Checks may also contain a `token` field to provide an ACL token. This token is
|
||||||
used for any interaction with the catalog for the check, including
|
used for any interaction with the catalog for the check, including
|
||||||
[anti-entropy syncs](/docs/internals/anti-entropy.html) and deregistration.
|
[anti-entropy syncs](/docs/internals/anti-entropy.html) and deregistration.
|
||||||
|
|
||||||
Both script and HTTP checks must include an `interval` field. This field is
|
Script, TCP and HTTP checks must include an `interval` field. This field is
|
||||||
parsed by Go's `time` package, and has the following
|
parsed by Go's `time` package, and has the following
|
||||||
[formatting specification](http://golang.org/pkg/time/#ParseDuration):
|
[formatting specification](http://golang.org/pkg/time/#ParseDuration):
|
||||||
> A duration string is a possibly signed sequence of decimal numbers, each with
|
> A duration string is a possibly signed sequence of decimal numbers, each with
|
||||||
|
|
|
@ -224,8 +224,8 @@ The endpoint always returns 200.
|
||||||
|
|
||||||
The register endpoint is used to add a new check to the local agent.
|
The register endpoint is used to add a new check to the local agent.
|
||||||
There is more documentation on checks [here](/docs/agent/checks.html).
|
There is more documentation on checks [here](/docs/agent/checks.html).
|
||||||
Checks may be of script, HTTP, or TTL type. The agent is responsible for managing
|
Checks may be of script, HTTP, TCP, or TTL type. The agent is responsible for
|
||||||
the status of the check and keeping the Catalog in sync.
|
managing the status of the check and keeping the Catalog in sync.
|
||||||
|
|
||||||
The register endpoint expects a JSON request body to be PUT. The request
|
The register endpoint expects a JSON request body to be PUT. The request
|
||||||
body must look like:
|
body must look like:
|
||||||
|
@ -237,13 +237,14 @@ body must look like:
|
||||||
"Notes": "Ensure we don't oversubscribe memory",
|
"Notes": "Ensure we don't oversubscribe memory",
|
||||||
"Script": "/usr/local/bin/check_mem.py",
|
"Script": "/usr/local/bin/check_mem.py",
|
||||||
"HTTP": "http://example.com",
|
"HTTP": "http://example.com",
|
||||||
|
"TCP": "example.com:22",
|
||||||
"Interval": "10s",
|
"Interval": "10s",
|
||||||
"TTL": "15s"
|
"TTL": "15s"
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
The `Name` field is mandatory, as is one of `Script`, `HTTP` or `TTL`.
|
The `Name` field is mandatory, as is one of `Script`, `HTTP`, `TCP` or `TTL`.
|
||||||
`Script` and `HTTP` also require that `Interval` be set.
|
`Script`, `TCP` and `HTTP` also require that `Interval` be set.
|
||||||
|
|
||||||
If an `ID` is not provided, it is set to `Name`. You cannot have duplicate
|
If an `ID` is not provided, it is set to `Name`. You cannot have duplicate
|
||||||
`ID` entries per agent, so it may be necessary to provide an `ID`.
|
`ID` entries per agent, so it may be necessary to provide an `ID`.
|
||||||
|
@ -258,6 +259,14 @@ be a URL) every `Interval`. If the response is any `2xx` code, the check is `pas
|
||||||
If the response is `429 Too Many Requests`, the check is `warning`. Otherwise, the check
|
If the response is `429 Too Many Requests`, the check is `warning`. Otherwise, the check
|
||||||
is `critical`.
|
is `critical`.
|
||||||
|
|
||||||
|
A `TCP` check will perform a TCP connection attempt against the value of `TCP`
|
||||||
|
(expected to be an IP/hostname and port combination) every `Interval`. If the
|
||||||
|
connection attempt is successful, the check is `passing`. If the connection
|
||||||
|
attempt is unsuccessful, the check is `critical`. In the case of a hostname
|
||||||
|
that resolves to both IPv4 and IPv6 addresses, an attempt will be made to both
|
||||||
|
addresses, and the first successful connection attempt will result in a
|
||||||
|
successful check.
|
||||||
|
|
||||||
If a `TTL` type is used, then the TTL update endpoint must be used periodically to update
|
If a `TTL` type is used, then the TTL update endpoint must be used periodically to update
|
||||||
the state of the check.
|
the state of the check.
|
||||||
|
|
||||||
|
|
|
@ -62,13 +62,14 @@ the DNS interface as well. If a service is failing its health check or a
|
||||||
node has any failing system-level check, the DNS interface will omit that
|
node has any failing system-level check, the DNS interface will omit that
|
||||||
node from any service query.
|
node from any service query.
|
||||||
|
|
||||||
The check must be of the script, HTTP, or TTL type. If it is a script type, `script`
|
The check must be of the script, HTTP, TCP or TTL type. If it is a script type,
|
||||||
and `interval` must be provided. If it is a HTTP type, `http` and
|
`script` and `interval` must be provided. If it is a HTTP type, `http` and
|
||||||
`interval` must be provided. If it is a TTL type, then only `ttl` must be
|
`interval` must be provided. If it is a TCP type, `tcp` and `interval` must be
|
||||||
provided. The check name is automatically generated as
|
provided. If it is a TTL type, then only `ttl` must be provided. The check name
|
||||||
`service:<service-id>`. If there are multiple service checks registered, the
|
is automatically generated as `service:<service-id>`. If there are multiple
|
||||||
ID will be generated as `service:<service-id>:<num>` where `<num>` is an
|
service checks registered, the ID will be generated as
|
||||||
incrementing number starting from `1`.
|
`service:<service-id>:<num>` where `<num>` is an incrementing number starting
|
||||||
|
from `1`.
|
||||||
|
|
||||||
Note: there is more information about [checks here](/docs/agent/checks.html).
|
Note: there is more information about [checks here](/docs/agent/checks.html).
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue