mirror of https://github.com/hashicorp/consul
Merge pull request #1935 from hashicorp/f-reap-time
Makes reap time configurable for LAN and WAN.
pull/1970/head
commit
c11e1506c5
|
@ -299,6 +299,12 @@ func (a *Agent) consulConfig() *consul.Config {
|
|||
base.SerfWANConfig.MemberlistConfig.AdvertiseAddr = a.config.AdvertiseAddrs.SerfWan.IP.String()
|
||||
base.SerfWANConfig.MemberlistConfig.AdvertisePort = a.config.AdvertiseAddrs.SerfWan.Port
|
||||
}
|
||||
if a.config.ReconnectTimeoutLan != 0 {
|
||||
base.SerfLANConfig.ReconnectTimeout = a.config.ReconnectTimeoutLan
|
||||
}
|
||||
if a.config.ReconnectTimeoutWan != 0 {
|
||||
base.SerfWANConfig.ReconnectTimeout = a.config.ReconnectTimeoutWan
|
||||
}
|
||||
if a.config.AdvertiseAddrs.RPC != nil {
|
||||
base.RPCAdvertise = a.config.AdvertiseAddrs.RPC
|
||||
}
|
||||
|
|
|
@ -176,6 +176,43 @@ func TestAgent_CheckAdvertiseAddrsSettings(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestAgent_ReconnectConfigSettings(t *testing.T) {
|
||||
c := nextConfig()
|
||||
func() {
|
||||
dir, agent := makeAgent(t, c)
|
||||
defer os.RemoveAll(dir)
|
||||
defer agent.Shutdown()
|
||||
|
||||
lan := agent.consulConfig().SerfLANConfig.ReconnectTimeout
|
||||
if lan != 3*24*time.Hour {
|
||||
t.Fatalf("bad: %s", lan.String())
|
||||
}
|
||||
|
||||
wan := agent.consulConfig().SerfWANConfig.ReconnectTimeout
|
||||
if wan != 3*24*time.Hour {
|
||||
t.Fatalf("bad: %s", wan.String())
|
||||
}
|
||||
}()
|
||||
|
||||
c.ReconnectTimeoutLan = 24 * time.Hour
|
||||
c.ReconnectTimeoutWan = 36 * time.Hour
|
||||
func() {
|
||||
dir, agent := makeAgent(t, c)
|
||||
defer os.RemoveAll(dir)
|
||||
defer agent.Shutdown()
|
||||
|
||||
lan := agent.consulConfig().SerfLANConfig.ReconnectTimeout
|
||||
if lan != 24*time.Hour {
|
||||
t.Fatalf("bad: %s", lan.String())
|
||||
}
|
||||
|
||||
wan := agent.consulConfig().SerfWANConfig.ReconnectTimeout
|
||||
if wan != 36*time.Hour {
|
||||
t.Fatalf("bad: %s", wan.String())
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
func TestAgent_AddService(t *testing.T) {
|
||||
dir, agent := makeAgent(t, nextConfig())
|
||||
defer os.RemoveAll(dir)
|
||||
|
|
|
@ -312,6 +312,14 @@ type Config struct {
|
|||
RetryIntervalWan time.Duration `mapstructure:"-" json:"-"`
|
||||
RetryIntervalWanRaw string `mapstructure:"retry_interval_wan"`
|
||||
|
||||
// ReconnectTimeout* specify the amount of time to wait to reconnect with
|
||||
// another agent before deciding it's permanently gone. This can be used to
|
||||
// control the time it takes to reap failed nodes from the cluster.
|
||||
ReconnectTimeoutLan time.Duration `mapstructure:"-"`
|
||||
ReconnectTimeoutLanRaw string `mapstructure:"reconnect_timeout"`
|
||||
ReconnectTimeoutWan time.Duration `mapstructure:"-"`
|
||||
ReconnectTimeoutWanRaw string `mapstructure:"reconnect_timeout_wan"`
|
||||
|
||||
// EnableUi enables the statically-compiled assets for the Consul web UI and
|
||||
// serves them at the default /ui/ endpoint automatically.
|
||||
EnableUi bool `mapstructure:"ui"`
|
||||
|
@ -778,6 +786,28 @@ func DecodeConfig(r io.Reader) (*Config, error) {
|
|||
result.RetryIntervalWan = dur
|
||||
}
|
||||
|
||||
const reconnectTimeoutMin = 8 * time.Hour
|
||||
if raw := result.ReconnectTimeoutLanRaw; raw != "" {
|
||||
dur, err := time.ParseDuration(raw)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("ReconnectTimeoutLan invalid: %v", err)
|
||||
}
|
||||
if dur < reconnectTimeoutMin {
|
||||
return nil, fmt.Errorf("ReconnectTimeoutLan must be >= %s", reconnectTimeoutMin.String())
|
||||
}
|
||||
result.ReconnectTimeoutLan = dur
|
||||
}
|
||||
if raw := result.ReconnectTimeoutWanRaw; raw != "" {
|
||||
dur, err := time.ParseDuration(raw)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("ReconnectTimeoutWan invalid: %v", err)
|
||||
}
|
||||
if dur < reconnectTimeoutMin {
|
||||
return nil, fmt.Errorf("ReconnectTimeoutWan must be >= %s", reconnectTimeoutMin.String())
|
||||
}
|
||||
result.ReconnectTimeoutWan = dur
|
||||
}
|
||||
|
||||
// Merge the single recursor
|
||||
if result.DNSRecursor != "" {
|
||||
result.DNSRecursors = append(result.DNSRecursors, result.DNSRecursor)
|
||||
|
@ -1131,6 +1161,14 @@ func MergeConfig(a, b *Config) *Config {
|
|||
if b.RetryIntervalWan != 0 {
|
||||
result.RetryIntervalWan = b.RetryIntervalWan
|
||||
}
|
||||
if b.ReconnectTimeoutLan != 0 {
|
||||
result.ReconnectTimeoutLan = b.ReconnectTimeoutLan
|
||||
result.ReconnectTimeoutLanRaw = b.ReconnectTimeoutLanRaw
|
||||
}
|
||||
if b.ReconnectTimeoutWan != 0 {
|
||||
result.ReconnectTimeoutWan = b.ReconnectTimeoutWan
|
||||
result.ReconnectTimeoutWanRaw = b.ReconnectTimeoutWanRaw
|
||||
}
|
||||
if b.DNSConfig.NodeTTL != 0 {
|
||||
result.DNSConfig.NodeTTL = b.DNSConfig.NodeTTL
|
||||
}
|
||||
|
|
|
@ -462,6 +462,29 @@ func TestDecodeConfig(t *testing.T) {
|
|||
t.Fatalf("bad: %#v", config)
|
||||
}
|
||||
|
||||
// Reconnect timeout LAN and WAN
|
||||
input = `{"reconnect_timeout": "8h", "reconnect_timeout_wan": "10h"}`
|
||||
config, err = DecodeConfig(bytes.NewReader([]byte(input)))
|
||||
if err != nil {
|
||||
t.Fatalf("err: %s", err)
|
||||
}
|
||||
if config.ReconnectTimeoutLanRaw != "8h" ||
|
||||
config.ReconnectTimeoutLan.String() != "8h0m0s" ||
|
||||
config.ReconnectTimeoutWanRaw != "10h" ||
|
||||
config.ReconnectTimeoutWan.String() != "10h0m0s" {
|
||||
t.Fatalf("bad: %#v", config)
|
||||
}
|
||||
input = `{"reconnect_timeout": "7h"}`
|
||||
config, err = DecodeConfig(bytes.NewReader([]byte(input)))
|
||||
if err == nil {
|
||||
t.Fatalf("decode should have failed")
|
||||
}
|
||||
input = `{"reconnect_timeout_wan": "7h"}`
|
||||
config, err = DecodeConfig(bytes.NewReader([]byte(input)))
|
||||
if err == nil {
|
||||
t.Fatalf("decode should have failed")
|
||||
}
|
||||
|
||||
// Static UI server
|
||||
input = `{"ui": true}`
|
||||
config, err = DecodeConfig(bytes.NewReader([]byte(input)))
|
||||
|
@ -1351,6 +1374,10 @@ func TestMergeConfig(t *testing.T) {
|
|||
RetryJoinWan: []string{"1.1.1.1"},
|
||||
RetryIntervalWanRaw: "10s",
|
||||
RetryIntervalWan: 10 * time.Second,
|
||||
ReconnectTimeoutLanRaw: "24h",
|
||||
ReconnectTimeoutLan: 24 * time.Hour,
|
||||
ReconnectTimeoutWanRaw: "36h",
|
||||
ReconnectTimeoutWan: 36 * time.Hour,
|
||||
CheckUpdateInterval: 8 * time.Minute,
|
||||
CheckUpdateIntervalRaw: "8m",
|
||||
ACLToken: "1234",
|
||||
|
|
|
@ -137,5 +137,5 @@ a server, replication to it will stop.
|
|||
|
||||
To prevent an accumulation of dead nodes (nodes in either _failed_ or _left_ states),
|
||||
Consul will automatically remove dead nodes out of the catalog. This process is
|
||||
called _reaping_. This is currently done on a non-configurable interval of 72 hours.
|
||||
called _reaping_. This is currently done on a configurable interval, which defaults to 72 hours.
|
||||
Reaping is similar to leaving, causing all associated services to be deregistered.
|
||||
|
|
|
@ -580,6 +580,19 @@ Consul will not enable TLS for the HTTP API unless the `https` port has been ass
|
|||
automatically reap child processes if it detects it is running as PID 1. If this is set to true or false, then
|
||||
it controls reaping regardless of Consul's PID (forces reaping on or off, respectively).
|
||||
|
||||
* <a name="reconnect_timeout"></a><a href="#reconnect_timeout">`reconnect_timeout`</a> This controls
|
||||
how long it takes for a failed node to be completely removed from the cluster. This defaults to
|
||||
72 hours and it is recommended that this is set to at least double the maximum expected recoverable
|
||||
outage time for a node or network partition. WARNING: Setting this time too low could cause Consul
|
||||
servers to be removed from quorum during an extended node failure or partition, which could complicate
|
||||
recovery of the cluster. The value is a time with a unit suffix, which can be "s", "m", "h" for seconds,
|
||||
minutes, or hours. The value must be >= 8 hours.
|
||||
|
||||
* <a name="reconnect_timeout_wan"></a><a href="#reconnect_timeout_wan">`reconnect_timeout_wan`</a> This
|
||||
is the WAN equivalent of the <a href="#reconnect_timeout">`reconnect_timeout`</a> parameter, which
|
||||
controls how long it takes for a failed server to be completely removed from the WAN pool. This also
|
||||
defaults to 72 hours, and must be >= 8 hours.
|
||||
|
||||
* <a name="recursor"></a><a href="#recursor">`recursor`</a> Provides a single recursor address.
|
||||
This has been deprecated, and the value is appended to the [`recursors`](#recursors) list for
|
||||
backwards compatibility.
|
||||
|
|
|
@ -61,7 +61,7 @@ the current state of the catalog can lag behind until the state is reconciled.
|
|||
|
||||
To prevent an accumulation of dead nodes (nodes in either _failed_ or _left_ states),
|
||||
Consul will automatically remove dead nodes out of the catalog. This process is
|
||||
called _reaping_. This is currently done on a non-configurable interval of 72 hours.
|
||||
called _reaping_. This is currently done on a configurable interval, which defaults to 72 hours.
|
||||
Reaping is similar to leaving, causing all associated services to be deregistered.
|
||||
|
||||
## Q: Does Consul support delta updates for watchers or blocking queries?
|
||||
|
|
Loading…
Reference in New Issue