From eedeba682b8a31794be2cea9d105da4fd13c4de3 Mon Sep 17 00:00:00 2001 From: James Phillips Date: Sun, 10 Apr 2016 22:46:07 -0700 Subject: [PATCH 1/3] Makes reap time configurable for LAN and WAN. --- command/agent/agent.go | 6 +++ command/agent/agent_test.go | 37 +++++++++++++++++++ command/agent/config.go | 32 ++++++++++++++++ command/agent/config_test.go | 17 +++++++++ .../source/docs/agent/options.html.markdown | 11 ++++++ 5 files changed, 103 insertions(+) diff --git a/command/agent/agent.go b/command/agent/agent.go index fe495fcf91..0fe316c41e 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -299,6 +299,12 @@ func (a *Agent) consulConfig() *consul.Config { base.SerfWANConfig.MemberlistConfig.AdvertiseAddr = a.config.AdvertiseAddrs.SerfWan.IP.String() base.SerfWANConfig.MemberlistConfig.AdvertisePort = a.config.AdvertiseAddrs.SerfWan.Port } + if a.config.ReconnectTimeoutLan != 0 { + base.SerfLANConfig.ReconnectTimeout = a.config.ReconnectTimeoutLan + } + if a.config.ReconnectTimeoutWan != 0 { + base.SerfWANConfig.ReconnectTimeout = a.config.ReconnectTimeoutWan + } if a.config.AdvertiseAddrs.RPC != nil { base.RPCAdvertise = a.config.AdvertiseAddrs.RPC } diff --git a/command/agent/agent_test.go b/command/agent/agent_test.go index fb28fa3df5..32e5fc4a53 100644 --- a/command/agent/agent_test.go +++ b/command/agent/agent_test.go @@ -176,6 +176,43 @@ func TestAgent_CheckAdvertiseAddrsSettings(t *testing.T) { } } +func TestAgent_ReconnectConfigSettings(t *testing.T) { + c := nextConfig() + func() { + dir, agent := makeAgent(t, c) + defer os.RemoveAll(dir) + defer agent.Shutdown() + + lan := agent.consulConfig().SerfLANConfig.ReconnectTimeout + if lan != 3*24*time.Hour { + t.Fatalf("bad: %s", lan.String()) + } + + wan := agent.consulConfig().SerfWANConfig.ReconnectTimeout + if wan != 3*24*time.Hour { + t.Fatalf("bad: %s", wan.String()) + } + }() + + c.ReconnectTimeoutLan = 2 * time.Hour + c.ReconnectTimeoutWan = 3 * time.Hour + func() { + dir, agent := makeAgent(t, c) + defer os.RemoveAll(dir) + defer agent.Shutdown() + + lan := agent.consulConfig().SerfLANConfig.ReconnectTimeout + if lan != 2*time.Hour { + t.Fatalf("bad: %s", lan.String()) + } + + wan := agent.consulConfig().SerfWANConfig.ReconnectTimeout + if wan != 3*time.Hour { + t.Fatalf("bad: %s", wan.String()) + } + }() +} + func TestAgent_AddService(t *testing.T) { dir, agent := makeAgent(t, nextConfig()) defer os.RemoveAll(dir) diff --git a/command/agent/config.go b/command/agent/config.go index fd65d1b424..ce8287ce95 100644 --- a/command/agent/config.go +++ b/command/agent/config.go @@ -312,6 +312,14 @@ type Config struct { RetryIntervalWan time.Duration `mapstructure:"-" json:"-"` RetryIntervalWanRaw string `mapstructure:"retry_interval_wan"` + // ReconnectTimeout* specify the amount of time to wait to reconnect with + // another agent before deciding it's permanently gone. This can be used to + // control the time it takes to reap failed nodes from the cluster. + ReconnectTimeoutLan time.Duration `mapstructure:"-"` + ReconnectTimeoutLanRaw string `mapstructure:"reconnect_timeout"` + ReconnectTimeoutWan time.Duration `mapstructure:"-"` + ReconnectTimeoutWanRaw string `mapstructure:"reconnect_timeout_wan"` + // EnableUi enables the statically-compiled assets for the Consul web UI and // serves them at the default /ui/ endpoint automatically. EnableUi bool `mapstructure:"ui"` @@ -778,6 +786,22 @@ func DecodeConfig(r io.Reader) (*Config, error) { result.RetryIntervalWan = dur } + if raw := result.ReconnectTimeoutLanRaw; raw != "" { + dur, err := time.ParseDuration(raw) + if err != nil { + return nil, fmt.Errorf("ReconnectTimeoutLan invalid: %v", err) + } + result.ReconnectTimeoutLan = dur + } + + if raw := result.ReconnectTimeoutWanRaw; raw != "" { + dur, err := time.ParseDuration(raw) + if err != nil { + return nil, fmt.Errorf("ReconnectTimeoutWan invalid: %v", err) + } + result.ReconnectTimeoutWan = dur + } + // Merge the single recursor if result.DNSRecursor != "" { result.DNSRecursors = append(result.DNSRecursors, result.DNSRecursor) @@ -1131,6 +1155,14 @@ func MergeConfig(a, b *Config) *Config { if b.RetryIntervalWan != 0 { result.RetryIntervalWan = b.RetryIntervalWan } + if b.ReconnectTimeoutLan != 0 { + result.ReconnectTimeoutLan = b.ReconnectTimeoutLan + result.ReconnectTimeoutLanRaw = b.ReconnectTimeoutLanRaw + } + if b.ReconnectTimeoutWan != 0 { + result.ReconnectTimeoutWan = b.ReconnectTimeoutWan + result.ReconnectTimeoutWanRaw = b.ReconnectTimeoutWanRaw + } if b.DNSConfig.NodeTTL != 0 { result.DNSConfig.NodeTTL = b.DNSConfig.NodeTTL } diff --git a/command/agent/config_test.go b/command/agent/config_test.go index 5674413804..09dad36b8b 100644 --- a/command/agent/config_test.go +++ b/command/agent/config_test.go @@ -462,6 +462,19 @@ func TestDecodeConfig(t *testing.T) { t.Fatalf("bad: %#v", config) } + // Reconnect timeout LAN and WAN + input = `{"reconnect_timeout": "1m", "reconnect_timeout_wan": "2m"}` + config, err = DecodeConfig(bytes.NewReader([]byte(input))) + if err != nil { + t.Fatalf("err: %s", err) + } + if config.ReconnectTimeoutLanRaw != "1m" || + config.ReconnectTimeoutLan.String() != "1m0s" || + config.ReconnectTimeoutWanRaw != "2m" || + config.ReconnectTimeoutWan.String() != "2m0s" { + t.Fatalf("bad: %#v", config) + } + // Static UI server input = `{"ui": true}` config, err = DecodeConfig(bytes.NewReader([]byte(input))) @@ -1351,6 +1364,10 @@ func TestMergeConfig(t *testing.T) { RetryJoinWan: []string{"1.1.1.1"}, RetryIntervalWanRaw: "10s", RetryIntervalWan: 10 * time.Second, + ReconnectTimeoutLanRaw: "1s", + ReconnectTimeoutLan: 1 * time.Second, + ReconnectTimeoutWanRaw: "2s", + ReconnectTimeoutWan: 2 * time.Second, CheckUpdateInterval: 8 * time.Minute, CheckUpdateIntervalRaw: "8m", ACLToken: "1234", diff --git a/website/source/docs/agent/options.html.markdown b/website/source/docs/agent/options.html.markdown index df90858970..fc21a7e2e8 100644 --- a/website/source/docs/agent/options.html.markdown +++ b/website/source/docs/agent/options.html.markdown @@ -580,6 +580,17 @@ Consul will not enable TLS for the HTTP API unless the `https` port has been ass automatically reap child processes if it detects it is running as PID 1. If this is set to true or false, then it controls reaping regardless of Consul's PID (forces reaping on or off, respectively). +* `reconnect_timeout` This controls + how long it takes for a failed node to be completely removed from the cluster. This defaults to + 72 hours and it is recommended that this is set to at least double the maximum expected recoverable + outage time for a node or network partition. The value is a time with a unit suffix, which can be + "s", "m", "h" for seconds, minutes, or hours. + +* `reconnect_timeout_wan` This + is the WAN equivalent of the `reconnect_timeout` parameter, which + controls how long it takes for a failed server to be completely removed from the WAN pool. This also + defaults to 72 hours. + * `recursor` Provides a single recursor address. This has been deprecated, and the value is appended to the [`recursors`](#recursors) list for backwards compatibility. From 7f55f9825f68be1ad2d516d2ec70d649f6c25bd5 Mon Sep 17 00:00:00 2001 From: James Phillips Date: Sun, 10 Apr 2016 23:40:54 -0700 Subject: [PATCH 2/3] Updates some docs that say reaping is not configurable. --- website/source/docs/agent/basics.html.markdown | 2 +- website/source/docs/faq.html.markdown | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/website/source/docs/agent/basics.html.markdown b/website/source/docs/agent/basics.html.markdown index b045a3abac..f610cf69eb 100644 --- a/website/source/docs/agent/basics.html.markdown +++ b/website/source/docs/agent/basics.html.markdown @@ -137,5 +137,5 @@ a server, replication to it will stop. To prevent an accumulation of dead nodes (nodes in either _failed_ or _left_ states), Consul will automatically remove dead nodes out of the catalog. This process is -called _reaping_. This is currently done on a non-configurable interval of 72 hours. +called _reaping_. This is currently done on a configurable interval of 72 hours. Reaping is similar to leaving, causing all associated services to be deregistered. diff --git a/website/source/docs/faq.html.markdown b/website/source/docs/faq.html.markdown index e993188c97..60281b52c5 100644 --- a/website/source/docs/faq.html.markdown +++ b/website/source/docs/faq.html.markdown @@ -61,7 +61,7 @@ the current state of the catalog can lag behind until the state is reconciled. To prevent an accumulation of dead nodes (nodes in either _failed_ or _left_ states), Consul will automatically remove dead nodes out of the catalog. This process is -called _reaping_. This is currently done on a non-configurable interval of 72 hours. +called _reaping_. This is currently done on a configurable interval of 72 hours. Reaping is similar to leaving, causing all associated services to be deregistered. ## Q: Does Consul support delta updates for watchers or blocking queries? From cf00c11221b5f4938a8fcf40825812a8582234de Mon Sep 17 00:00:00 2001 From: James Phillips Date: Sun, 10 Apr 2016 23:31:16 -0700 Subject: [PATCH 3/3] Sets an anti-footgun floor for the configurable reap time. --- command/agent/agent_test.go | 8 +++--- command/agent/config.go | 8 +++++- command/agent/config_test.go | 28 +++++++++++++------ .../source/docs/agent/options.html.markdown | 8 ++++-- 4 files changed, 35 insertions(+), 17 deletions(-) diff --git a/command/agent/agent_test.go b/command/agent/agent_test.go index 32e5fc4a53..baf5c5dc53 100644 --- a/command/agent/agent_test.go +++ b/command/agent/agent_test.go @@ -194,20 +194,20 @@ func TestAgent_ReconnectConfigSettings(t *testing.T) { } }() - c.ReconnectTimeoutLan = 2 * time.Hour - c.ReconnectTimeoutWan = 3 * time.Hour + c.ReconnectTimeoutLan = 24 * time.Hour + c.ReconnectTimeoutWan = 36 * time.Hour func() { dir, agent := makeAgent(t, c) defer os.RemoveAll(dir) defer agent.Shutdown() lan := agent.consulConfig().SerfLANConfig.ReconnectTimeout - if lan != 2*time.Hour { + if lan != 24*time.Hour { t.Fatalf("bad: %s", lan.String()) } wan := agent.consulConfig().SerfWANConfig.ReconnectTimeout - if wan != 3*time.Hour { + if wan != 36*time.Hour { t.Fatalf("bad: %s", wan.String()) } }() diff --git a/command/agent/config.go b/command/agent/config.go index ce8287ce95..21a3fa454b 100644 --- a/command/agent/config.go +++ b/command/agent/config.go @@ -786,19 +786,25 @@ func DecodeConfig(r io.Reader) (*Config, error) { result.RetryIntervalWan = dur } + const reconnectTimeoutMin = 8 * time.Hour if raw := result.ReconnectTimeoutLanRaw; raw != "" { dur, err := time.ParseDuration(raw) if err != nil { return nil, fmt.Errorf("ReconnectTimeoutLan invalid: %v", err) } + if dur < reconnectTimeoutMin { + return nil, fmt.Errorf("ReconnectTimeoutLan must be >= %s", reconnectTimeoutMin.String()) + } result.ReconnectTimeoutLan = dur } - if raw := result.ReconnectTimeoutWanRaw; raw != "" { dur, err := time.ParseDuration(raw) if err != nil { return nil, fmt.Errorf("ReconnectTimeoutWan invalid: %v", err) } + if dur < reconnectTimeoutMin { + return nil, fmt.Errorf("ReconnectTimeoutWan must be >= %s", reconnectTimeoutMin.String()) + } result.ReconnectTimeoutWan = dur } diff --git a/command/agent/config_test.go b/command/agent/config_test.go index 09dad36b8b..3fc0a6880d 100644 --- a/command/agent/config_test.go +++ b/command/agent/config_test.go @@ -463,17 +463,27 @@ func TestDecodeConfig(t *testing.T) { } // Reconnect timeout LAN and WAN - input = `{"reconnect_timeout": "1m", "reconnect_timeout_wan": "2m"}` + input = `{"reconnect_timeout": "8h", "reconnect_timeout_wan": "10h"}` config, err = DecodeConfig(bytes.NewReader([]byte(input))) if err != nil { t.Fatalf("err: %s", err) } - if config.ReconnectTimeoutLanRaw != "1m" || - config.ReconnectTimeoutLan.String() != "1m0s" || - config.ReconnectTimeoutWanRaw != "2m" || - config.ReconnectTimeoutWan.String() != "2m0s" { + if config.ReconnectTimeoutLanRaw != "8h" || + config.ReconnectTimeoutLan.String() != "8h0m0s" || + config.ReconnectTimeoutWanRaw != "10h" || + config.ReconnectTimeoutWan.String() != "10h0m0s" { t.Fatalf("bad: %#v", config) } + input = `{"reconnect_timeout": "7h"}` + config, err = DecodeConfig(bytes.NewReader([]byte(input))) + if err == nil { + t.Fatalf("decode should have failed") + } + input = `{"reconnect_timeout_wan": "7h"}` + config, err = DecodeConfig(bytes.NewReader([]byte(input))) + if err == nil { + t.Fatalf("decode should have failed") + } // Static UI server input = `{"ui": true}` @@ -1364,10 +1374,10 @@ func TestMergeConfig(t *testing.T) { RetryJoinWan: []string{"1.1.1.1"}, RetryIntervalWanRaw: "10s", RetryIntervalWan: 10 * time.Second, - ReconnectTimeoutLanRaw: "1s", - ReconnectTimeoutLan: 1 * time.Second, - ReconnectTimeoutWanRaw: "2s", - ReconnectTimeoutWan: 2 * time.Second, + ReconnectTimeoutLanRaw: "24h", + ReconnectTimeoutLan: 24 * time.Hour, + ReconnectTimeoutWanRaw: "36h", + ReconnectTimeoutWan: 36 * time.Hour, CheckUpdateInterval: 8 * time.Minute, CheckUpdateIntervalRaw: "8m", ACLToken: "1234", diff --git a/website/source/docs/agent/options.html.markdown b/website/source/docs/agent/options.html.markdown index fc21a7e2e8..06c1e404d0 100644 --- a/website/source/docs/agent/options.html.markdown +++ b/website/source/docs/agent/options.html.markdown @@ -583,13 +583,15 @@ Consul will not enable TLS for the HTTP API unless the `https` port has been ass * `reconnect_timeout` This controls how long it takes for a failed node to be completely removed from the cluster. This defaults to 72 hours and it is recommended that this is set to at least double the maximum expected recoverable - outage time for a node or network partition. The value is a time with a unit suffix, which can be - "s", "m", "h" for seconds, minutes, or hours. + outage time for a node or network partition. WARNING: Setting this time too low could cause Consul + servers to be removed from quorum during an extended node failure or partition, which could complicate + recovery of the cluster. The value is a time with a unit suffix, which can be "s", "m", "h" for seconds, + minutes, or hours. The value must be >= 8 hours. * `reconnect_timeout_wan` This is the WAN equivalent of the `reconnect_timeout` parameter, which controls how long it takes for a failed server to be completely removed from the WAN pool. This also - defaults to 72 hours. + defaults to 72 hours, and must be >= 8 hours. * `recursor` Provides a single recursor address. This has been deprecated, and the value is appended to the [`recursors`](#recursors) list for