From 95f462d5f10571494760b7e64478846dc8741806 Mon Sep 17 00:00:00 2001 From: Dan Bond Date: Mon, 15 May 2023 04:05:47 -0700 Subject: [PATCH] agent: prevent very old servers re-joining a cluster with stale data (#17171) * agent: configure server lastseen timestamp Signed-off-by: Dan Bond * use correct config Signed-off-by: Dan Bond * add comments Signed-off-by: Dan Bond * use default age in test golden data Signed-off-by: Dan Bond * add changelog Signed-off-by: Dan Bond * fix runtime test Signed-off-by: Dan Bond * agent: add server_metadata Signed-off-by: Dan Bond * update comments Signed-off-by: Dan Bond * correctly check if metadata file does not exist Signed-off-by: Dan Bond * follow instructions for adding new config Signed-off-by: Dan Bond * add comments Signed-off-by: Dan Bond * update comments Signed-off-by: Dan Bond * Update agent/agent.go Co-authored-by: Dan Upton * agent/config: add validation for duration with min Signed-off-by: Dan Bond * docs: add new server_rejoin_age_max config definition Signed-off-by: Dan Bond * agent: add unit test for checking server last seen Signed-off-by: Dan Bond * agent: log continually for 60s before erroring Signed-off-by: Dan Bond * pr comments Signed-off-by: Dan Bond * remove unneeded todo * agent: fix error message Signed-off-by: Dan Bond --------- Signed-off-by: Dan Bond Co-authored-by: Dan Upton --- .changelog/17171.txt | 3 + agent/agent.go | 96 +++++++++++++++++-- agent/agent_test.go | 65 +++++++++++++ agent/config/builder.go | 14 ++- agent/config/builder_test.go | 15 +++ agent/config/config.go | 1 + agent/config/default.go | 1 + agent/config/runtime.go | 12 +++ agent/config/runtime_test.go | 7 +- .../TestRuntimeConfig_Sanitize.golden | 1 + agent/config/testdata/full-config.hcl | 1 + agent/config/testdata/full-config.json | 1 + agent/consul/config.go | 6 ++ agent/consul/server_metadata.go | 71 ++++++++++++++ agent/consul/server_metadata_test.go | 68 +++++++++++++ .../docs/agent/config/config-files.mdx | 69 
++++++------- 16 files changed, 385 insertions(+), 46 deletions(-) create mode 100644 .changelog/17171.txt create mode 100644 agent/consul/server_metadata.go create mode 100644 agent/consul/server_metadata_test.go diff --git a/.changelog/17171.txt b/.changelog/17171.txt new file mode 100644 index 0000000000..882b635879 --- /dev/null +++ b/.changelog/17171.txt @@ -0,0 +1,3 @@ +```release-note:improvement +agent: add a configurable maximum age (default: 7 days) to prevent servers re-joining a cluster with stale data +``` diff --git a/agent/agent.go b/agent/agent.go index 65467dc7ea..73906f853f 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -7,6 +7,7 @@ import ( "context" "crypto/tls" "encoding/json" + "errors" "fmt" "io" "net" @@ -22,8 +23,6 @@ import ( "github.com/armon/go-metrics" "github.com/armon/go-metrics/prometheus" - "github.com/hashicorp/consul/agent/rpcclient" - "github.com/hashicorp/consul/agent/rpcclient/configentry" "github.com/hashicorp/go-connlimit" "github.com/hashicorp/go-hclog" "github.com/hashicorp/go-memdb" @@ -50,12 +49,13 @@ import ( grpcDNS "github.com/hashicorp/consul/agent/grpc-external/services/dns" middleware "github.com/hashicorp/consul/agent/grpc-middleware" "github.com/hashicorp/consul/agent/hcp/scada" - libscada "github.com/hashicorp/consul/agent/hcp/scada" "github.com/hashicorp/consul/agent/local" "github.com/hashicorp/consul/agent/proxycfg" proxycfgglue "github.com/hashicorp/consul/agent/proxycfg-glue" catalogproxycfg "github.com/hashicorp/consul/agent/proxycfg-sources/catalog" localproxycfg "github.com/hashicorp/consul/agent/proxycfg-sources/local" + "github.com/hashicorp/consul/agent/rpcclient" + "github.com/hashicorp/consul/agent/rpcclient/configentry" "github.com/hashicorp/consul/agent/rpcclient/health" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/agent/systemd" @@ -575,11 +575,11 @@ func (a *Agent) Start(ctx context.Context) error { return err } - // copy over the existing node id, this cannot
be - // changed while running anyways but this prevents - // breaking some existing behavior. then overwrite - // the configuration + // Copy over the existing node id. This cannot be + // changed while running, but this prevents + // breaking some existing behavior. c.NodeID = a.config.NodeID + // Overwrite the configuration. a.config = c if err := a.tlsConfigurator.Update(a.config.TLS); err != nil { @@ -625,6 +625,20 @@ func (a *Agent) Start(ctx context.Context) error { if c.ServerMode { serverLogger := a.baseDeps.Logger.NamedIntercept(logging.ConsulServer) + // Check for a last seen timestamp and exit if deemed stale before attempting to join + // Serf/Raft or listen for requests. + if err := a.checkServerLastSeen(consul.ReadServerMetadata); err != nil { + deadline := time.Now().Add(time.Minute) + for time.Now().Before(deadline) { + a.logger.Error("startup error", "error", err) + time.Sleep(10 * time.Second) + } + return err + } + + // periodically write server metadata to disk. + go a.persistServerMetadata() + incomingRPCLimiter := consul.ConfiguredIncomingRPCLimiter( &lib.StopChannelContext{StopCh: a.shutdownCh}, serverLogger, @@ -661,7 +675,6 @@ func (a *Agent) Start(ctx context.Context) error { return fmt.Errorf("failed to start server cert manager: %w", err) } } - } else { a.externalGRPCServer = external.NewServer( a.logger.Named("grpc.external"), @@ -1094,7 +1107,7 @@ func (a *Agent) listenHTTP() ([]apiServer, error) { MaxHeaderBytes: a.config.HTTPMaxHeaderBytes, } - if libscada.IsCapability(l.Addr()) { + if scada.IsCapability(l.Addr()) { // wrap in http2 server handler httpServer.Handler = h2c.NewHandler(srv.handler(a.config.EnableDebug), &http2.Server{}) } @@ -1521,6 +1534,8 @@ func newConsulConfig(runtimeCfg *config.RuntimeConfig, logger hclog.Logger) (*co cfg.Reporting.License.Enabled = runtimeCfg.Reporting.License.Enabled + cfg.ServerRejoinAgeMax = runtimeCfg.ServerRejoinAgeMax + enterpriseConsulConfig(cfg, runtimeCfg) return cfg, nil @@ -4529,7 
+4544,70 @@ func (a *Agent) proxyDataSources() proxycfg.DataSources { a.fillEnterpriseProxyDataSources(&sources) return sources +} + +// persistServerMetadata periodically writes a server's metadata to a file +// in the configured data directory. +func (a *Agent) persistServerMetadata() { + file := filepath.Join(a.config.DataDir, consul.ServerMetadataFile) + + // Create a timer with no initial tick to allow metadata to be written immediately. + t := time.NewTimer(0) + defer t.Stop() + + for { + select { + case <-t.C: + // Reset the timer to the larger periodic interval. + t.Reset(1 * time.Hour) + + f, err := consul.OpenServerMetadata(file) + if err != nil { + a.logger.Error("failed to open existing server metadata: %w", err) + continue + } + + if err := consul.WriteServerMetadata(f); err != nil { + f.Close() + a.logger.Error("failed to write server metadata: %w", err) + continue + } + + f.Close() + case <-a.shutdownCh: + return + } + } +} +// checkServerLastSeen is a safety check that only occurs once on startup to prevent old servers +// with stale data from rejoining an existing cluster. +// +// It attempts to read a server's metadata file and check the last seen Unix timestamp against a +// configurable max age. If the metadata file does not exist, we treat this as an initial startup +// and return no error. +// +// Example: if the server recorded a last seen timestamp of now-7d, and we configure a max age +// of 3d, then we should prevent the server from rejoining. +func (a *Agent) checkServerLastSeen(readFn consul.ServerMetadataReadFunc) error { + filename := filepath.Join(a.config.DataDir, consul.ServerMetadataFile) + + // Read server metadata file. + md, err := readFn(filename) + if err != nil { + // Return early if it doesn't exist as this likely indicates the server is starting for the first time.
+ if errors.Is(err, os.ErrNotExist) { + return nil + } + return fmt.Errorf("error reading server metadata: %w", err) + } + + maxAge := a.config.ServerRejoinAgeMax + if md.IsLastSeenStale(maxAge) { + return fmt.Errorf("refusing to rejoin cluster because server has been offline for more than the configured server_rejoin_age_max (%s) - consider wiping your data dir", maxAge) + } + + return nil } func listenerPortKey(svcID structs.ServiceID, checkID structs.CheckID) string { diff --git a/agent/agent_test.go b/agent/agent_test.go index a0f688f5f5..b2f0ea32e3 100644 --- a/agent/agent_test.go +++ b/agent/agent_test.go @@ -12,6 +12,7 @@ import ( "crypto/x509" "encoding/base64" "encoding/json" + "errors" "fmt" mathrand "math/rand" "net" @@ -6204,6 +6205,70 @@ cloud { require.NoError(t, err) } +func TestAgent_checkServerLastSeen(t *testing.T) { + bd := BaseDeps{ + Deps: consul.Deps{ + Logger: hclog.NewInterceptLogger(nil), + Tokens: new(token.Store), + GRPCConnPool: &fakeGRPCConnPool{}, + }, + RuntimeConfig: &config.RuntimeConfig{}, + Cache: cache.New(cache.Options{}), + } + agent, err := New(bd) + require.NoError(t, err) + + // Test that an ErrNotExist OS error is treated as ok. + t.Run("TestReadErrNotExist", func(t *testing.T) { + readFn := func(filename string) (*consul.ServerMetadata, error) { + return nil, os.ErrNotExist + } + + err := agent.checkServerLastSeen(readFn) + require.NoError(t, err) + }) + + // Test that an error reading server metadata is treated as an error. + t.Run("TestReadErr", func(t *testing.T) { + expected := errors.New("read error") + readFn := func(filename string) (*consul.ServerMetadata, error) { + return nil, expected + } + + err := agent.checkServerLastSeen(readFn) + require.ErrorIs(t, err, expected) + }) + + // Test that a server with a 7d old last seen timestamp is treated as an error. 
+ t.Run("TestIsLastSeenStaleErr", func(t *testing.T) { + agent.config.ServerRejoinAgeMax = time.Hour + + readFn := func(filename string) (*consul.ServerMetadata, error) { + return &consul.ServerMetadata{ + LastSeenUnix: time.Now().Add(-24 * 7 * time.Hour).Unix(), + }, nil + } + + err := agent.checkServerLastSeen(readFn) + require.Error(t, err) + require.ErrorContains(t, err, "refusing to rejoin cluster because server has been offline for more than the configured server_rejoin_age_max") + }) + + // Test that a server with a 6h old last seen timestamp is not treated as an error. + t.Run("TestNoErr", func(t *testing.T) { + agent.config.ServerRejoinAgeMax = 24 * 7 * time.Hour + + readFn := func(filename string) (*consul.ServerMetadata, error) { + return &consul.ServerMetadata{ + LastSeenUnix: time.Now().Add(-6 * time.Hour).Unix(), + }, nil + } + + err := agent.checkServerLastSeen(readFn) + require.NoError(t, err) + }) +} + func getExpectedCaPoolByFile(t *testing.T) *x509.CertPool { pool := x509.NewCertPool() data, err := os.ReadFile("../test/ca/root.cer") diff --git a/agent/config/builder.go b/agent/config/builder.go index 513d5931c7..87ee229406 100644 --- a/agent/config/builder.go +++ b/agent/config/builder.go @@ -28,8 +28,6 @@ import ( "github.com/hashicorp/memberlist" "golang.org/x/time/rate" - hcpconfig "github.com/hashicorp/consul/agent/hcp/config" - "github.com/hashicorp/consul/agent/cache" "github.com/hashicorp/consul/agent/checks" "github.com/hashicorp/consul/agent/connect/ca" @@ -37,6 +35,7 @@ import ( "github.com/hashicorp/consul/agent/consul/authmethod/ssoauth" consulrate "github.com/hashicorp/consul/agent/consul/rate" "github.com/hashicorp/consul/agent/dns" + hcpconfig "github.com/hashicorp/consul/agent/hcp/config" "github.com/hashicorp/consul/agent/rpc/middleware" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/agent/token" @@ -1090,6 +1089,7 @@ func (b *builder) build() (rt RuntimeConfig, err error) { ServerMode: serverMode, 
ServerName: stringVal(c.ServerName), ServerPort: serverPort, + ServerRejoinAgeMax: b.durationValWithDefaultMin("server_rejoin_age_max", c.ServerRejoinAgeMax, 24*7*time.Hour, 6*time.Hour), Services: services, SessionTTLMin: b.durationVal("session_ttl_min", c.SessionTTLMin), SkipLeaveOnInt: skipLeaveOnInt, @@ -1952,6 +1952,16 @@ func (b *builder) durationValWithDefault(name string, v *string, defaultVal time return d } +// durationValWithDefaultMin is equivalent to durationValWithDefault, but enforces a minimum duration. +func (b *builder) durationValWithDefaultMin(name string, v *string, defaultVal, minVal time.Duration) (d time.Duration) { + d = b.durationValWithDefault(name, v, defaultVal) + if d < minVal { + b.err = multierror.Append(b.err, fmt.Errorf("%s: duration '%s' cannot be less than: %s", name, *v, minVal)) + } + + return d +} + func (b *builder) durationVal(name string, v *string) (d time.Duration) { return b.durationValWithDefault(name, v, 0) } diff --git a/agent/config/builder_test.go b/agent/config/builder_test.go index 3fe6573ba0..28d5b2972c 100644 --- a/agent/config/builder_test.go +++ b/agent/config/builder_test.go @@ -311,6 +311,21 @@ func TestBuilder_DurationVal_InvalidDuration(t *testing.T) { require.Contains(t, b.err.Error(), badDuration2) } +func TestBuilder_DurationValWithDefaultMin(t *testing.T) { + b := builder{} + + // Attempt to validate that a duration of 10 hours will not error when the min val is 1 hour. + dur := "10h0m0s" + b.durationValWithDefaultMin("field2", &dur, 24*7*time.Hour, time.Hour) + require.NoError(t, b.err) + + // Attempt to validate that a duration of 1 min will error when the min val is 1 hour. 
+ dur = "0h1m0s" + b.durationValWithDefaultMin("field1", &dur, 24*7*time.Hour, time.Hour) + require.Error(t, b.err) + require.Contains(t, b.err.Error(), "1 error") +} + func TestBuilder_ServiceVal_MultiError(t *testing.T) { b := builder{} b.serviceVal(&ServiceDefinition{ diff --git a/agent/config/config.go b/agent/config/config.go index 4d41815550..a1f4145292 100644 --- a/agent/config/config.go +++ b/agent/config/config.go @@ -228,6 +228,7 @@ type Config struct { SerfBindAddrWAN *string `mapstructure:"serf_wan" json:"serf_wan,omitempty"` ServerMode *bool `mapstructure:"server" json:"server,omitempty"` ServerName *string `mapstructure:"server_name" json:"server_name,omitempty"` + ServerRejoinAgeMax *string `mapstructure:"server_rejoin_age_max" json:"server_rejoin_age_max,omitempty"` Service *ServiceDefinition `mapstructure:"service" json:"-"` Services []ServiceDefinition `mapstructure:"services" json:"-"` SessionTTLMin *string `mapstructure:"session_ttl_min" json:"session_ttl_min,omitempty"` diff --git a/agent/config/default.go b/agent/config/default.go index 4c88c2ac3d..3af8d0867d 100644 --- a/agent/config/default.go +++ b/agent/config/default.go @@ -58,6 +58,7 @@ func DefaultSource() Source { segment_limit = 64 server = false + server_rejoin_age_max = "168h" syslog_facility = "LOCAL0" tls = { diff --git a/agent/config/runtime.go b/agent/config/runtime.go index a8d6c62ebd..dca9abe0e7 100644 --- a/agent/config/runtime.go +++ b/agent/config/runtime.go @@ -1358,6 +1358,18 @@ type RuntimeConfig struct { // hcl: ports { server = int } ServerPort int + // ServerRejoinAgeMax is used to specify the duration of time a server + // is allowed to be down/offline before a startup operation is refused. + // + // For example: if a server has been offline for 5 days, and this option + // is configured to 3 days, then any subsequent startup operation will fail + // and require an operator to manually intervene. 
+ // + // The default is: 7 days + // + // hcl: server_rejoin_age_max = "duration" + ServerRejoinAgeMax time.Duration + // Services contains the provided service definitions: // // hcl: services = [ diff --git a/agent/config/runtime_test.go b/agent/config/runtime_test.go index 8b8ee4f86a..1064829cd3 100644 --- a/agent/config/runtime_test.go +++ b/agent/config/runtime_test.go @@ -25,13 +25,12 @@ import ( "github.com/stretchr/testify/require" "golang.org/x/time/rate" - hcpconfig "github.com/hashicorp/consul/agent/hcp/config" - "github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/agent/cache" "github.com/hashicorp/consul/agent/checks" "github.com/hashicorp/consul/agent/consul" consulrate "github.com/hashicorp/consul/agent/consul/rate" + hcpconfig "github.com/hashicorp/consul/agent/hcp/config" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/agent/token" "github.com/hashicorp/consul/lib" @@ -6419,6 +6418,7 @@ func TestLoad_FullConfig(t *testing.T) { SerfPortWAN: 8302, ServerMode: true, ServerName: "Oerr9n1G", + ServerRejoinAgeMax: 604800 * time.Second, ServerPort: 3757, Services: []*structs.ServiceDefinition{ { @@ -7163,7 +7163,8 @@ func TestRuntimeConfig_Sanitize(t *testing.T) { }, }, }, - Locality: &Locality{Region: strPtr("us-west-1"), Zone: strPtr("us-west-1a")}, + Locality: &Locality{Region: strPtr("us-west-1"), Zone: strPtr("us-west-1a")}, + ServerRejoinAgeMax: 24 * 7 * time.Hour, } b, err := json.MarshalIndent(rt.Sanitized(), "", " ") diff --git a/agent/config/testdata/TestRuntimeConfig_Sanitize.golden b/agent/config/testdata/TestRuntimeConfig_Sanitize.golden index f2bd8f2f3c..c17636eef7 100644 --- a/agent/config/testdata/TestRuntimeConfig_Sanitize.golden +++ b/agent/config/testdata/TestRuntimeConfig_Sanitize.golden @@ -332,6 +332,7 @@ "ServerMode": false, "ServerName": "", "ServerPort": 0, + "ServerRejoinAgeMax": "168h0m0s", "Services": [ { "Address": "", diff --git a/agent/config/testdata/full-config.hcl 
b/agent/config/testdata/full-config.hcl index 718c879b29..c29c334b95 100644 --- a/agent/config/testdata/full-config.hcl +++ b/agent/config/testdata/full-config.hcl @@ -394,6 +394,7 @@ serf_lan = "99.43.63.15" serf_wan = "67.88.33.19" server = true server_name = "Oerr9n1G" +server_rejoin_age_max = "604800s" service = { id = "dLOXpSCI" name = "o1ynPkp0" diff --git a/agent/config/testdata/full-config.json b/agent/config/testdata/full-config.json index f98bfe4dab..7640394a4f 100644 --- a/agent/config/testdata/full-config.json +++ b/agent/config/testdata/full-config.json @@ -453,6 +453,7 @@ "serf_wan": "67.88.33.19", "server": true, "server_name": "Oerr9n1G", + "server_rejoin_age_max": "604800s", "service": { "id": "dLOXpSCI", "name": "o1ynPkp0", diff --git a/agent/consul/config.go b/agent/consul/config.go index e35fceb556..eef4bc4376 100644 --- a/agent/consul/config.go +++ b/agent/consul/config.go @@ -447,6 +447,10 @@ type Config struct { // Embedded Consul Enterprise specific configuration *EnterpriseConfig + + // ServerRejoinAgeMax is used to specify the duration of time a server + // is allowed to be down/offline before a startup operation is refused. + ServerRejoinAgeMax time.Duration } func (c *Config) InPrimaryDatacenter() bool { @@ -574,6 +578,8 @@ func DefaultConfig() *Config { PeeringTestAllowPeerRegistrations: false, EnterpriseConfig: DefaultEnterpriseConfig(), + + ServerRejoinAgeMax: 24 * 7 * time.Hour, } // Increase our reap interval to 3 days instead of 24h. diff --git a/agent/consul/server_metadata.go b/agent/consul/server_metadata.go new file mode 100644 index 0000000000..742391e0b6 --- /dev/null +++ b/agent/consul/server_metadata.go @@ -0,0 +1,71 @@ +// Copyright (c) HashiCorp, Inc. +// SPDX-License-Identifier: MPL-2.0 + +package consul + +import ( + "encoding/json" + "io" + "os" + "time" +) + +// ServerMetadataFile is the name of the file on disk that server metadata +// should be written to. 
+const ServerMetadataFile = "server_metadata.json" + +// ServerMetadata represents specific metadata about a running server. +type ServerMetadata struct { + // LastSeenUnix is the timestamp a server was last seen, in Unix format. + LastSeenUnix int64 `json:"last_seen_unix"` +} + +// IsLastSeenStale checks whether the last seen timestamp is older than a given duration. +func (md *ServerMetadata) IsLastSeenStale(d time.Duration) bool { + lastSeen := time.Unix(md.LastSeenUnix, 0) + maxAge := time.Now().Add(-d) + + return lastSeen.Before(maxAge) +} + +// OpenServerMetadata is a helper function for opening the server metadata file +// with the correct permissions. +func OpenServerMetadata(filename string) (io.WriteCloser, error) { + return os.OpenFile(filename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0600) +} + +type ServerMetadataReadFunc func(filename string) (*ServerMetadata, error) + +// ReadServerMetadata is a helper function for reading the contents of a server +// metadata file and unmarshaling the data from JSON. +func ReadServerMetadata(filename string) (*ServerMetadata, error) { + b, err := os.ReadFile(filename) + if err != nil { + return nil, err + } + + var md ServerMetadata + if err := json.Unmarshal(b, &md); err != nil { + return nil, err + } + + return &md, nil +} + +// WriteServerMetadata writes server metadata to a file in JSON format. +func WriteServerMetadata(w io.Writer) error { + md := &ServerMetadata{ + LastSeenUnix: time.Now().Unix(), + } + + b, err := json.Marshal(md) + if err != nil { + return err + } + + if _, err := w.Write(b); err != nil { + return err + } + + return nil +} diff --git a/agent/consul/server_metadata_test.go b/agent/consul/server_metadata_test.go new file mode 100644 index 0000000000..d091bfdf36 --- /dev/null +++ b/agent/consul/server_metadata_test.go @@ -0,0 +1,68 @@ +// Copyright (c) HashiCorp, Inc. 
+// SPDX-License-Identifier: MPL-2.0 + +package consul + +import ( + "bytes" + "errors" + "testing" + "time" + + "github.com/stretchr/testify/assert" +) + +type mockServerMetadataWriter struct { + writeErr error +} + +func (m *mockServerMetadataWriter) Write(p []byte) (n int, err error) { + if m.writeErr != nil { + return 0, m.writeErr + } + + return 1, nil +} + +func TestServerMetadata(t *testing.T) { + now := time.Now() + + t.Run("TestIsLastSeenStaleTrue", func(t *testing.T) { + // Create a server that is 48 hours old. + md := &ServerMetadata{ + LastSeenUnix: now.Add(-48 * time.Hour).Unix(), + } + + stale := md.IsLastSeenStale(24 * time.Hour) + assert.True(t, stale) + }) + + t.Run("TestIsLastSeenStaleFalse", func(t *testing.T) { + // Create a server that is 1 hour old. + md := &ServerMetadata{ + LastSeenUnix: now.Add(-1 * time.Hour).Unix(), + } + + stale := md.IsLastSeenStale(24 * time.Hour) + assert.False(t, stale) + }) +} + +func TestWriteServerMetadata(t *testing.T) { + t.Run("TestWriteError", func(t *testing.T) { + m := &mockServerMetadataWriter{ + writeErr: errors.New("write error"), + } + + err := WriteServerMetadata(m) + assert.Error(t, err) + }) + + t.Run("TestOK", func(t *testing.T) { + b := new(bytes.Buffer) + + err := WriteServerMetadata(b) + assert.NoError(t, err) + assert.True(t, b.Len() > 0) + }) +} diff --git a/website/content/docs/agent/config/config-files.mdx b/website/content/docs/agent/config/config-files.mdx index b19991fe04..ee5e273be9 100644 --- a/website/content/docs/agent/config/config-files.mdx +++ b/website/content/docs/agent/config/config-files.mdx @@ -7,7 +7,7 @@ description: >- # Agents Configuration File Reference ((#configuration_files)) -This topic describes the parameters for configuring Consul agents. For information about how to start Consul agents, refer to [Starting the Consul Agent](/consul/docs/agent#starting-the-consul-agent). +This topic describes the parameters for configuring Consul agents. 
For information about how to start Consul agents, refer to [Starting the Consul Agent](/consul/docs/agent#starting-the-consul-agent). ## Overview @@ -63,25 +63,25 @@ telemetry { -### Time-to-live values +### Time-to-live values Consul uses the Go `time` package to parse all time-to-live (TTL) values used in Consul agent configuration files. Specify integer and float values as a string and include one or more of the following units of time: - `ns` -- `us` -- `µs` +- `us` +- `µs` - `ms` -- `s` +- `s` - `m` - `h` -Examples: +Examples: -- `'300ms'` -- `'1.5h'` +- `'300ms'` +- `'1.5h'` - `'2h45m'` -Refer to the [formatting specification](https://golang.org/pkg/time/#ParseDuration) for additional information. +Refer to the [formatting specification](https://golang.org/pkg/time/#ParseDuration) for additional information. ## General parameters @@ -549,9 +549,9 @@ Refer to the [formatting specification](https://golang.org/pkg/time/#ParseDurati - `https_handshake_timeout` - Configures the limit for how long the HTTPS server in both client and server agents will wait for a client to complete a TLS handshake. This should be kept conservative as it limits how many connections an unauthenticated attacker can open if `verify_incoming` is being using to authenticate clients (strongly recommended in production). Default value is `5s`. - `request_limits` - This object specifies configurations that limit the rate of RPC and gRPC requests on the Consul server. Limiting the rate of gRPC and RPC requests also limits HTTP requests to the Consul server. - `mode` - String value that specifies an action to take if the rate of requests exceeds the limit. You can specify the following values: - - `permissive`: The server continues to allow requests and records an error in the logs. - - `enforcing`: The server stops accepting requests and records an error in the logs. - - `disabled`: Limits are not enforced or tracked. This is the default value for `mode`. 
+ - `permissive`: The server continues to allow requests and records an error in the logs. + - `enforcing`: The server stops accepting requests and records an error in the logs. + - `disabled`: Limits are not enforced or tracked. This is the default value for `mode`. - `read_rate` - Integer value that specifies the number of read requests per second. Default is `100`. - `write_rate` - Integer value that specifies the number of write requests per second. Default is `100`. - `rpc_handshake_timeout` - Configures the limit for how long servers will wait after a client TCP connection is established before they complete the connection handshake. When TLS is used, the same timeout applies to the TLS handshake separately from the initial protocol negotiation. All Consul clients should perform this immediately on establishing a new connection. This should be kept conservative as it limits how many connections an unauthenticated attacker can open if `verify_incoming` is being using to authenticate clients (strongly recommended in production). When `verify_incoming` is true on servers, this limits how long the connection socket and associated goroutines will be held open before the client successfully authenticates. Default value is `5s`. @@ -736,6 +736,11 @@ Refer to the [formatting specification](https://golang.org/pkg/time/#ParseDurati - `server` Equivalent to the [`-server` command-line flag](/consul/docs/agent/config/cli-flags#_server). +- `server_rejoin_age_max` - Controls the allowed maximum age of a stale server attempting to rejoin a cluster. + If a server has been offline for longer than this period, then it will refuse to start up again until an operator intervenes. This is to protect + clusters from instability caused by decommissioned servers accidentally being started again. + Note: the default value is `168h` (equal to 7d) and the minimum value is `6h`. + - `non_voting_server` - **This field is deprecated in Consul 1.9.1.
See the [`read_replica`](#read_replica) field instead.** - `read_replica` - Equivalent to the [`-read-replica` command-line flag](/consul/docs/agent/config/cli-flags#_read_replica). @@ -924,7 +929,7 @@ Refer to the [formatting specification](https://golang.org/pkg/time/#ParseDurati [`acl.tokens.agent_recovery`](#acl_tokens_agent_recovery).** - `config_file_service_registration` ((#acl_tokens_config_file_service_registration)) - Specifies the ACL - token the agent uses to register services and checks from [service](/consul/docs/services/usage/define-services) and [check](/consul/docs/services/usage/checks) definitions + token the agent uses to register services and checks from [service](/consul/docs/services/usage/define-services) and [check](/consul/docs/services/usage/checks) definitions specified in configuration files or fragments passed to the agent using the `-hcl` flag. @@ -1626,12 +1631,12 @@ subsystem that provides Consul's service mesh capabilities. - `backend` ((#raft_logstore_backend)) Specifies which storage engine to use to persist logs. Valid options are `boltdb` or `wal`. Default - is `boltdb`. The `wal` option specifies an experimental backend that - should be used with caution. Refer to - [Experimental WAL LogStore backend](/consul/docs/agent/wal-logstore) + is `boltdb`. The `wal` option specifies an experimental backend that + should be used with caution. Refer to + [Experimental WAL LogStore backend](/consul/docs/agent/wal-logstore) for more information. - - `disable_log_cache` ((#raft_logstore_disable_log_cache)) Disables the in-memory cache for recent logs. We recommend using it for performance testing purposes, as no significant improvement has been measured when the cache is disabled. While the in-memory log cache theoretically prevents disk reads for recent logs, recent logs are also stored in the OS page cache, which does not slow either the `boltdb` or `wal` backend's ability to read them. 
+ - `disable_log_cache` ((#raft_logstore_disable_log_cache)) Disables the in-memory cache for recent logs. We recommend using it for performance testing purposes, as no significant improvement has been measured when the cache is disabled. While the in-memory log cache theoretically prevents disk reads for recent logs, recent logs are also stored in the OS page cache, which does not slow either the `boltdb` or `wal` backend's ability to read them. - `verification` ((#raft_logstore_verification)) This is a nested object that allows configuring the online verification of the LogStore. Verification @@ -1652,43 +1657,43 @@ subsystem that provides Consul's service mesh capabilities. on that server. The only correct response is to stop the server, remove its data directory, and restart so it can be caught back up with a correct server again. Please report verification failures including details about - your hardware and workload via GitHub issues. Refer to - [Experimental WAL LogStore backend](/consul/docs/agent/wal-logstore) + your hardware and workload via GitHub issues. Refer to + [Experimental WAL LogStore backend](/consul/docs/agent/wal-logstore) for more information. - `enabled` ((#raft_logstore_verification_enabled)) - Set to `true` to - allow this Consul server to write and verify log verification checkpoints + allow this Consul server to write and verify log verification checkpoints when elected leader. - - `interval` ((#raft_logstore_verification_interval)) - Specifies the time - interval between checkpoints. There is no default value. You must - configure the `interval` and set [`enabled`](#raft_logstore_verification_enabled) - to `true` to correctly enable intervals. We recommend using an interval - between `30s` and `5m`. The performance overhead is insignificant when the + - `interval` ((#raft_logstore_verification_interval)) - Specifies the time + interval between checkpoints. There is no default value. 
You must + configure the `interval` and set [`enabled`](#raft_logstore_verification_enabled) + to `true` to correctly enable intervals. We recommend using an interval + between `30s` and `5m`. The performance overhead is insignificant when the interval is set to `5m` or less. - - `boltdb` ((#raft_logstore_boltdb)) - Object that configures options for + - `boltdb` ((#raft_logstore_boltdb)) - Object that configures options for Raft's `boltdb` backend. It has no effect if the `backend` is not `boltdb`. - - `no_freelist_sync` ((#raft_logstore_boltdb_no_freelist_sync)) - Set to + - `no_freelist_sync` ((#raft_logstore_boltdb_no_freelist_sync)) - Set to `true` to disable storing BoltDB's freelist to disk within the `raft.db` file. Disabling freelist syncs reduces the disk IO required for write operations, but could potentially increase start up time because Consul must scan the database to find free space within the file. - - - `wal` ((#raft_logstore_wal)) - Object that configures the `wal` backend. - Refer to [Experimental WAL LogStore backend](/consul/docs/agent/wal-logstore) + - - `wal` ((#raft_logstore_wal)) - Object that configures the `wal` backend. + Refer to [Experimental WAL LogStore backend](/consul/docs/agent/wal-logstore) for more information. - - `segment_size_mb` ((#raft_logstore_wal_segment_size_mb)) - Integer value + - `segment_size_mb` ((#raft_logstore_wal_segment_size_mb)) - Integer value that represents the target size in MB for each segment file before rolling to a new segment. The default value is `64` and is suitable for - most deployments. While a smaller value may use less disk space because you + most deployments. While a smaller value may use less disk space because you can reclaim space by deleting old segments sooner, the smaller segment that results may affect performance because safely rotating to a new file more frequently can impact tail latencies. Larger values are unlikely - to improve performance significantly. 
We recommend using this + to improve performance significantly. We recommend using this configuration for performance testing purposes. - `raft_protocol` ((#raft_protocol)) Equivalent to the [`-raft-protocol`