Browse Source

agent: prevent very old servers re-joining a cluster with stale data (#17171)

* agent: configure server lastseen timestamp

Signed-off-by: Dan Bond <danbond@protonmail.com>

* use correct config

Signed-off-by: Dan Bond <danbond@protonmail.com>

* add comments

Signed-off-by: Dan Bond <danbond@protonmail.com>

* use default age in test golden data

Signed-off-by: Dan Bond <danbond@protonmail.com>

* add changelog

Signed-off-by: Dan Bond <danbond@protonmail.com>

* fix runtime test

Signed-off-by: Dan Bond <danbond@protonmail.com>

* agent: add server_metadata

Signed-off-by: Dan Bond <danbond@protonmail.com>

* update comments

Signed-off-by: Dan Bond <danbond@protonmail.com>

* correctly check if metadata file does not exist

Signed-off-by: Dan Bond <danbond@protonmail.com>

* follow instructions for adding new config

Signed-off-by: Dan Bond <danbond@protonmail.com>

* add comments

Signed-off-by: Dan Bond <danbond@protonmail.com>

* update comments

Signed-off-by: Dan Bond <danbond@protonmail.com>

* Update agent/agent.go

Co-authored-by: Dan Upton <daniel@floppy.co>

* agent/config: add validation for duration with min

Signed-off-by: Dan Bond <danbond@protonmail.com>

* docs: add new server_rejoin_age_max config definition

Signed-off-by: Dan Bond <danbond@protonmail.com>

* agent: add unit test for checking server last seen

Signed-off-by: Dan Bond <danbond@protonmail.com>

* agent: log continually for 60s before erroring

Signed-off-by: Dan Bond <danbond@protonmail.com>

* pr comments

Signed-off-by: Dan Bond <danbond@protonmail.com>

* remove unneeded todo

* agent: fix error message

Signed-off-by: Dan Bond <danbond@protonmail.com>

---------

Signed-off-by: Dan Bond <danbond@protonmail.com>
Co-authored-by: Dan Upton <daniel@floppy.co>
pull/17215/head
Dan Bond 2 years ago committed by GitHub
parent
commit
95f462d5f1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 3
      .changelog/17171.txt
  2. 96
      agent/agent.go
  3. 65
      agent/agent_test.go
  4. 14
      agent/config/builder.go
  5. 15
      agent/config/builder_test.go
  6. 1
      agent/config/config.go
  7. 1
      agent/config/default.go
  8. 12
      agent/config/runtime.go
  9. 7
      agent/config/runtime_test.go
  10. 1
      agent/config/testdata/TestRuntimeConfig_Sanitize.golden
  11. 1
      agent/config/testdata/full-config.hcl
  12. 1
      agent/config/testdata/full-config.json
  13. 6
      agent/consul/config.go
  14. 71
      agent/consul/server_metadata.go
  15. 68
      agent/consul/server_metadata_test.go
  16. 69
      website/content/docs/agent/config/config-files.mdx

3
.changelog/17171.txt

@ -0,0 +1,3 @@
```release-note:improvement
agent: add a configurable maximum age (default: 7 days) to prevent servers re-joining a cluster with stale data
```

96
agent/agent.go

@ -7,6 +7,7 @@ import (
"context" "context"
"crypto/tls" "crypto/tls"
"encoding/json" "encoding/json"
"errors"
"fmt" "fmt"
"io" "io"
"net" "net"
@ -22,8 +23,6 @@ import (
"github.com/armon/go-metrics" "github.com/armon/go-metrics"
"github.com/armon/go-metrics/prometheus" "github.com/armon/go-metrics/prometheus"
"github.com/hashicorp/consul/agent/rpcclient"
"github.com/hashicorp/consul/agent/rpcclient/configentry"
"github.com/hashicorp/go-connlimit" "github.com/hashicorp/go-connlimit"
"github.com/hashicorp/go-hclog" "github.com/hashicorp/go-hclog"
"github.com/hashicorp/go-memdb" "github.com/hashicorp/go-memdb"
@ -50,12 +49,13 @@ import (
grpcDNS "github.com/hashicorp/consul/agent/grpc-external/services/dns" grpcDNS "github.com/hashicorp/consul/agent/grpc-external/services/dns"
middleware "github.com/hashicorp/consul/agent/grpc-middleware" middleware "github.com/hashicorp/consul/agent/grpc-middleware"
"github.com/hashicorp/consul/agent/hcp/scada" "github.com/hashicorp/consul/agent/hcp/scada"
libscada "github.com/hashicorp/consul/agent/hcp/scada"
"github.com/hashicorp/consul/agent/local" "github.com/hashicorp/consul/agent/local"
"github.com/hashicorp/consul/agent/proxycfg" "github.com/hashicorp/consul/agent/proxycfg"
proxycfgglue "github.com/hashicorp/consul/agent/proxycfg-glue" proxycfgglue "github.com/hashicorp/consul/agent/proxycfg-glue"
catalogproxycfg "github.com/hashicorp/consul/agent/proxycfg-sources/catalog" catalogproxycfg "github.com/hashicorp/consul/agent/proxycfg-sources/catalog"
localproxycfg "github.com/hashicorp/consul/agent/proxycfg-sources/local" localproxycfg "github.com/hashicorp/consul/agent/proxycfg-sources/local"
"github.com/hashicorp/consul/agent/rpcclient"
"github.com/hashicorp/consul/agent/rpcclient/configentry"
"github.com/hashicorp/consul/agent/rpcclient/health" "github.com/hashicorp/consul/agent/rpcclient/health"
"github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/agent/systemd" "github.com/hashicorp/consul/agent/systemd"
@ -575,11 +575,11 @@ func (a *Agent) Start(ctx context.Context) error {
return err return err
} }
// copy over the existing node id, this cannot be // Copy over the existing node id. This cannot be
// changed while running anyways but this prevents // changed while running, but this prevents
// breaking some existing behavior. then overwrite // breaking some existing behavior.
// the configuration
c.NodeID = a.config.NodeID c.NodeID = a.config.NodeID
// Overwrite the configuration.
a.config = c a.config = c
if err := a.tlsConfigurator.Update(a.config.TLS); err != nil { if err := a.tlsConfigurator.Update(a.config.TLS); err != nil {
@ -625,6 +625,20 @@ func (a *Agent) Start(ctx context.Context) error {
if c.ServerMode { if c.ServerMode {
serverLogger := a.baseDeps.Logger.NamedIntercept(logging.ConsulServer) serverLogger := a.baseDeps.Logger.NamedIntercept(logging.ConsulServer)
// Check for a last seen timestamp and exit if deemed stale before attempting to join
// Serf/Raft or listen for requests.
if err := a.checkServerLastSeen(consul.ReadServerMetadata); err != nil {
deadline := time.Now().Add(time.Minute)
for time.Now().Before(deadline) {
a.logger.Error("startup error", "error", err)
time.Sleep(10 * time.Second)
}
return err
}
// periodically write server metadata to disk.
go a.persistServerMetadata()
incomingRPCLimiter := consul.ConfiguredIncomingRPCLimiter( incomingRPCLimiter := consul.ConfiguredIncomingRPCLimiter(
&lib.StopChannelContext{StopCh: a.shutdownCh}, &lib.StopChannelContext{StopCh: a.shutdownCh},
serverLogger, serverLogger,
@ -661,7 +675,6 @@ func (a *Agent) Start(ctx context.Context) error {
return fmt.Errorf("failed to start server cert manager: %w", err) return fmt.Errorf("failed to start server cert manager: %w", err)
} }
} }
} else { } else {
a.externalGRPCServer = external.NewServer( a.externalGRPCServer = external.NewServer(
a.logger.Named("grpc.external"), a.logger.Named("grpc.external"),
@ -1094,7 +1107,7 @@ func (a *Agent) listenHTTP() ([]apiServer, error) {
MaxHeaderBytes: a.config.HTTPMaxHeaderBytes, MaxHeaderBytes: a.config.HTTPMaxHeaderBytes,
} }
if libscada.IsCapability(l.Addr()) { if scada.IsCapability(l.Addr()) {
// wrap in http2 server handler // wrap in http2 server handler
httpServer.Handler = h2c.NewHandler(srv.handler(a.config.EnableDebug), &http2.Server{}) httpServer.Handler = h2c.NewHandler(srv.handler(a.config.EnableDebug), &http2.Server{})
} }
@ -1521,6 +1534,8 @@ func newConsulConfig(runtimeCfg *config.RuntimeConfig, logger hclog.Logger) (*co
cfg.Reporting.License.Enabled = runtimeCfg.Reporting.License.Enabled cfg.Reporting.License.Enabled = runtimeCfg.Reporting.License.Enabled
cfg.ServerRejoinAgeMax = runtimeCfg.ServerRejoinAgeMax
enterpriseConsulConfig(cfg, runtimeCfg) enterpriseConsulConfig(cfg, runtimeCfg)
return cfg, nil return cfg, nil
@ -4529,7 +4544,70 @@ func (a *Agent) proxyDataSources() proxycfg.DataSources {
a.fillEnterpriseProxyDataSources(&sources) a.fillEnterpriseProxyDataSources(&sources)
return sources return sources
}
// persistServerMetadata periodically writes a server's metadata to a file
// in the configured data directory. The first write happens immediately and
// subsequent writes happen once per hour, until the agent's shutdown channel
// is closed.
//
// Failures are logged rather than returned; the next tick simply tries again.
func (a *Agent) persistServerMetadata() {
	file := filepath.Join(a.config.DataDir, consul.ServerMetadataFile)

	// Create a timer with no initial tick to allow metadata to be written immediately.
	t := time.NewTimer(0)
	defer t.Stop()

	for {
		select {
		case <-t.C:
			// Reset the timer to the larger periodic interval.
			t.Reset(1 * time.Hour)

			f, err := consul.OpenServerMetadata(file)
			if err != nil {
				// The logger takes key/value pairs, not a format string, so the
				// error is attached under the "error" key.
				a.logger.Error("failed to open existing server metadata", "error", err)
				continue
			}

			// Always close the file, then report whichever step failed.
			writeErr := consul.WriteServerMetadata(f)
			closeErr := f.Close()
			if writeErr != nil {
				a.logger.Error("failed to write server metadata", "error", writeErr)
				continue
			}
			if closeErr != nil {
				a.logger.Error("failed to close server metadata file", "error", closeErr)
			}
		case <-a.shutdownCh:
			return
		}
	}
}
// checkServerLastSeen is a safety check that only occurs once on startup to prevent old servers
// with stale data from rejoining an existing cluster.
//
// It attempts to read a server's metadata file and check the last seen Unix timestamp against a
// configurable max age. If the metadata file does not exist, we treat this as an initial startup
// and return no error.
//
// Example: if the server recorded a last seen timestamp of now-7d, and we configure a max age
// of 3d, then we should prevent the server from rejoining.
//
// readFn is injected so that tests can substitute a stub for consul.ReadServerMetadata.
func (a *Agent) checkServerLastSeen(readFn consul.ServerMetadataReadFunc) error {
filename := filepath.Join(a.config.DataDir, consul.ServerMetadataFile)
// Read server metadata file.
md, err := readFn(filename)
if err != nil {
// Return early if it doesn't exist as this likely indicates the server is starting for the first time.
if errors.Is(err, os.ErrNotExist) {
return nil
}
// Any other read failure is wrapped and surfaced to the caller.
return fmt.Errorf("error reading server metadata: %w", err)
}
maxAge := a.config.ServerRejoinAgeMax
if md.IsLastSeenStale(maxAge) {
return fmt.Errorf("refusing to rejoin cluster because server has been offline for more than the configured server_rejoin_age_max (%s) - consider wiping your data dir", maxAge)
}
return nil
} }
func listenerPortKey(svcID structs.ServiceID, checkID structs.CheckID) string { func listenerPortKey(svcID structs.ServiceID, checkID structs.CheckID) string {

65
agent/agent_test.go

@ -12,6 +12,7 @@ import (
"crypto/x509" "crypto/x509"
"encoding/base64" "encoding/base64"
"encoding/json" "encoding/json"
"errors"
"fmt" "fmt"
mathrand "math/rand" mathrand "math/rand"
"net" "net"
@ -6204,6 +6205,70 @@ cloud {
require.NoError(t, err) require.NoError(t, err)
} }
// TestAgent_checkServerLastSeen drives checkServerLastSeen with stubbed
// metadata readers covering a missing file, a generic read failure, a stale
// last seen timestamp and a recent last seen timestamp.
func TestAgent_checkServerLastSeen(t *testing.T) {
	deps := BaseDeps{
		Deps: consul.Deps{
			Logger:       hclog.NewInterceptLogger(nil),
			Tokens:       new(token.Store),
			GRPCConnPool: &fakeGRPCConnPool{},
		},
		RuntimeConfig: &config.RuntimeConfig{},
		Cache:         cache.New(cache.Options{}),
	}
	agent, err := New(deps)
	require.NoError(t, err)

	// failingRead builds a reader that always fails with readErr.
	failingRead := func(readErr error) consul.ServerMetadataReadFunc {
		return func(string) (*consul.ServerMetadata, error) {
			return nil, readErr
		}
	}
	// lastSeenAgo builds a reader reporting a last seen timestamp that is
	// age in the past at the moment it is called.
	lastSeenAgo := func(age time.Duration) consul.ServerMetadataReadFunc {
		return func(string) (*consul.ServerMetadata, error) {
			return &consul.ServerMetadata{
				LastSeenUnix: time.Now().Add(-age).Unix(),
			}, nil
		}
	}
	noError := func(t *testing.T, err error) {
		require.NoError(t, err)
	}
	errRead := errors.New("read error")

	cases := []struct {
		name   string
		maxAge time.Duration
		readFn consul.ServerMetadataReadFunc
		verify func(t *testing.T, err error)
	}{
		{
			// An ErrNotExist OS error is treated as ok.
			name:   "TestReadErrNotExist",
			readFn: failingRead(os.ErrNotExist),
			verify: noError,
		},
		{
			// Any other error reading server metadata is treated as an error.
			name:   "TestReadErr",
			readFn: failingRead(errRead),
			verify: func(t *testing.T, err error) {
				require.ErrorIs(t, err, errRead)
			},
		},
		{
			// A server with a 7d old last seen timestamp is treated as an error.
			name:   "TestIsLastSeenStaleErr",
			maxAge: time.Hour,
			readFn: lastSeenAgo(24 * 7 * time.Hour),
			verify: func(t *testing.T, err error) {
				require.Error(t, err)
				require.ErrorContains(t, err, "refusing to rejoin cluster because server has been offline for more than the configured server_rejoin_age_max")
			},
		},
		{
			// A server with a 6h old last seen timestamp is not treated as an error.
			name:   "TestNoErr",
			maxAge: 24 * 7 * time.Hour,
			readFn: lastSeenAgo(6 * time.Hour),
			verify: noError,
		},
	}

	for _, tc := range cases {
		tc := tc
		t.Run(tc.name, func(t *testing.T) {
			agent.config.ServerRejoinAgeMax = tc.maxAge
			tc.verify(t, agent.checkServerLastSeen(tc.readFn))
		})
	}
}
func getExpectedCaPoolByFile(t *testing.T) *x509.CertPool { func getExpectedCaPoolByFile(t *testing.T) *x509.CertPool {
pool := x509.NewCertPool() pool := x509.NewCertPool()
data, err := os.ReadFile("../test/ca/root.cer") data, err := os.ReadFile("../test/ca/root.cer")

14
agent/config/builder.go

@ -28,8 +28,6 @@ import (
"github.com/hashicorp/memberlist" "github.com/hashicorp/memberlist"
"golang.org/x/time/rate" "golang.org/x/time/rate"
hcpconfig "github.com/hashicorp/consul/agent/hcp/config"
"github.com/hashicorp/consul/agent/cache" "github.com/hashicorp/consul/agent/cache"
"github.com/hashicorp/consul/agent/checks" "github.com/hashicorp/consul/agent/checks"
"github.com/hashicorp/consul/agent/connect/ca" "github.com/hashicorp/consul/agent/connect/ca"
@ -37,6 +35,7 @@ import (
"github.com/hashicorp/consul/agent/consul/authmethod/ssoauth" "github.com/hashicorp/consul/agent/consul/authmethod/ssoauth"
consulrate "github.com/hashicorp/consul/agent/consul/rate" consulrate "github.com/hashicorp/consul/agent/consul/rate"
"github.com/hashicorp/consul/agent/dns" "github.com/hashicorp/consul/agent/dns"
hcpconfig "github.com/hashicorp/consul/agent/hcp/config"
"github.com/hashicorp/consul/agent/rpc/middleware" "github.com/hashicorp/consul/agent/rpc/middleware"
"github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/agent/token" "github.com/hashicorp/consul/agent/token"
@ -1090,6 +1089,7 @@ func (b *builder) build() (rt RuntimeConfig, err error) {
ServerMode: serverMode, ServerMode: serverMode,
ServerName: stringVal(c.ServerName), ServerName: stringVal(c.ServerName),
ServerPort: serverPort, ServerPort: serverPort,
ServerRejoinAgeMax: b.durationValWithDefaultMin("server_rejoin_age_max", c.ServerRejoinAgeMax, 24*7*time.Hour, 6*time.Hour),
Services: services, Services: services,
SessionTTLMin: b.durationVal("session_ttl_min", c.SessionTTLMin), SessionTTLMin: b.durationVal("session_ttl_min", c.SessionTTLMin),
SkipLeaveOnInt: skipLeaveOnInt, SkipLeaveOnInt: skipLeaveOnInt,
@ -1952,6 +1952,16 @@ func (b *builder) durationValWithDefault(name string, v *string, defaultVal time
return d return d
} }
// durationValWithDefaultMin is equivalent to durationValWithDefault, but enforces a minimum duration.
//
// When the duration produced by durationValWithDefault is less than minVal, an
// error naming the option is appended to b.err. The duration is returned
// either way so the caller can still populate the runtime config.
func (b *builder) durationValWithDefaultMin(name string, v *string, defaultVal, minVal time.Duration) (d time.Duration) {
	d = b.durationValWithDefault(name, v, defaultVal)
	if d < minVal {
		// v may be nil (option not set) when defaultVal itself is below minVal;
		// report the resolved duration instead of dereferencing a nil pointer.
		given := d.String()
		if v != nil {
			given = *v
		}
		b.err = multierror.Append(b.err, fmt.Errorf("%s: duration '%s' cannot be less than: %s", name, given, minVal))
	}
	return d
}
func (b *builder) durationVal(name string, v *string) (d time.Duration) { func (b *builder) durationVal(name string, v *string) (d time.Duration) {
return b.durationValWithDefault(name, v, 0) return b.durationValWithDefault(name, v, 0)
} }

15
agent/config/builder_test.go

@ -311,6 +311,21 @@ func TestBuilder_DurationVal_InvalidDuration(t *testing.T) {
require.Contains(t, b.err.Error(), badDuration2) require.Contains(t, b.err.Error(), badDuration2)
} }
// TestBuilder_DurationValWithDefaultMin checks that durationValWithDefaultMin
// accepts a duration above the minimum and records exactly one error for a
// duration below it.
func TestBuilder_DurationValWithDefaultMin(t *testing.T) {
	const (
		defaultAge = 24 * 7 * time.Hour
		minAge     = time.Hour
	)
	var b builder

	// 10 hours is above the 1 hour minimum, so no error should be recorded.
	aboveMin := "10h0m0s"
	b.durationValWithDefaultMin("field2", &aboveMin, defaultAge, minAge)
	require.NoError(t, b.err)

	// 1 minute is below the 1 hour minimum, so an error should be recorded.
	belowMin := "0h1m0s"
	b.durationValWithDefaultMin("field1", &belowMin, defaultAge, minAge)
	require.Error(t, b.err)
	require.Contains(t, b.err.Error(), "1 error")
}
func TestBuilder_ServiceVal_MultiError(t *testing.T) { func TestBuilder_ServiceVal_MultiError(t *testing.T) {
b := builder{} b := builder{}
b.serviceVal(&ServiceDefinition{ b.serviceVal(&ServiceDefinition{

1
agent/config/config.go

@ -228,6 +228,7 @@ type Config struct {
SerfBindAddrWAN *string `mapstructure:"serf_wan" json:"serf_wan,omitempty"` SerfBindAddrWAN *string `mapstructure:"serf_wan" json:"serf_wan,omitempty"`
ServerMode *bool `mapstructure:"server" json:"server,omitempty"` ServerMode *bool `mapstructure:"server" json:"server,omitempty"`
ServerName *string `mapstructure:"server_name" json:"server_name,omitempty"` ServerName *string `mapstructure:"server_name" json:"server_name,omitempty"`
ServerRejoinAgeMax *string `mapstructure:"server_rejoin_age_max" json:"server_rejoin_age_max,omitempty"`
Service *ServiceDefinition `mapstructure:"service" json:"-"` Service *ServiceDefinition `mapstructure:"service" json:"-"`
Services []ServiceDefinition `mapstructure:"services" json:"-"` Services []ServiceDefinition `mapstructure:"services" json:"-"`
SessionTTLMin *string `mapstructure:"session_ttl_min" json:"session_ttl_min,omitempty"` SessionTTLMin *string `mapstructure:"session_ttl_min" json:"session_ttl_min,omitempty"`

1
agent/config/default.go

@ -58,6 +58,7 @@ func DefaultSource() Source {
segment_limit = 64 segment_limit = 64
server = false server = false
server_rejoin_age_max = "168h"
syslog_facility = "LOCAL0" syslog_facility = "LOCAL0"
tls = { tls = {

12
agent/config/runtime.go

@ -1358,6 +1358,18 @@ type RuntimeConfig struct {
// hcl: ports { server = int } // hcl: ports { server = int }
ServerPort int ServerPort int
// ServerRejoinAgeMax is used to specify the duration of time a server
// is allowed to be down/offline before a startup operation is refused.
//
// For example: if a server has been offline for 5 days, and this option
// is configured to 3 days, then any subsequent startup operation will fail
// and require an operator to manually intervene.
//
// The default is: 7 days
//
// hcl: server_rejoin_age_max = "duration"
ServerRejoinAgeMax time.Duration
// Services contains the provided service definitions: // Services contains the provided service definitions:
// //
// hcl: services = [ // hcl: services = [

7
agent/config/runtime_test.go

@ -25,13 +25,12 @@ import (
"github.com/stretchr/testify/require" "github.com/stretchr/testify/require"
"golang.org/x/time/rate" "golang.org/x/time/rate"
hcpconfig "github.com/hashicorp/consul/agent/hcp/config"
"github.com/hashicorp/consul/acl" "github.com/hashicorp/consul/acl"
"github.com/hashicorp/consul/agent/cache" "github.com/hashicorp/consul/agent/cache"
"github.com/hashicorp/consul/agent/checks" "github.com/hashicorp/consul/agent/checks"
"github.com/hashicorp/consul/agent/consul" "github.com/hashicorp/consul/agent/consul"
consulrate "github.com/hashicorp/consul/agent/consul/rate" consulrate "github.com/hashicorp/consul/agent/consul/rate"
hcpconfig "github.com/hashicorp/consul/agent/hcp/config"
"github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/agent/token" "github.com/hashicorp/consul/agent/token"
"github.com/hashicorp/consul/lib" "github.com/hashicorp/consul/lib"
@ -6419,6 +6418,7 @@ func TestLoad_FullConfig(t *testing.T) {
SerfPortWAN: 8302, SerfPortWAN: 8302,
ServerMode: true, ServerMode: true,
ServerName: "Oerr9n1G", ServerName: "Oerr9n1G",
ServerRejoinAgeMax: 604800 * time.Second,
ServerPort: 3757, ServerPort: 3757,
Services: []*structs.ServiceDefinition{ Services: []*structs.ServiceDefinition{
{ {
@ -7163,7 +7163,8 @@ func TestRuntimeConfig_Sanitize(t *testing.T) {
}, },
}, },
}, },
Locality: &Locality{Region: strPtr("us-west-1"), Zone: strPtr("us-west-1a")}, Locality: &Locality{Region: strPtr("us-west-1"), Zone: strPtr("us-west-1a")},
ServerRejoinAgeMax: 24 * 7 * time.Hour,
} }
b, err := json.MarshalIndent(rt.Sanitized(), "", " ") b, err := json.MarshalIndent(rt.Sanitized(), "", " ")

1
agent/config/testdata/TestRuntimeConfig_Sanitize.golden vendored

@ -332,6 +332,7 @@
"ServerMode": false, "ServerMode": false,
"ServerName": "", "ServerName": "",
"ServerPort": 0, "ServerPort": 0,
"ServerRejoinAgeMax": "168h0m0s",
"Services": [ "Services": [
{ {
"Address": "", "Address": "",

1
agent/config/testdata/full-config.hcl vendored

@ -394,6 +394,7 @@ serf_lan = "99.43.63.15"
serf_wan = "67.88.33.19" serf_wan = "67.88.33.19"
server = true server = true
server_name = "Oerr9n1G" server_name = "Oerr9n1G"
server_rejoin_age_max = "604800s"
service = { service = {
id = "dLOXpSCI" id = "dLOXpSCI"
name = "o1ynPkp0" name = "o1ynPkp0"

1
agent/config/testdata/full-config.json vendored

@ -453,6 +453,7 @@
"serf_wan": "67.88.33.19", "serf_wan": "67.88.33.19",
"server": true, "server": true,
"server_name": "Oerr9n1G", "server_name": "Oerr9n1G",
"server_rejoin_age_max": "604800s",
"service": { "service": {
"id": "dLOXpSCI", "id": "dLOXpSCI",
"name": "o1ynPkp0", "name": "o1ynPkp0",

6
agent/consul/config.go

@ -447,6 +447,10 @@ type Config struct {
// Embedded Consul Enterprise specific configuration // Embedded Consul Enterprise specific configuration
*EnterpriseConfig *EnterpriseConfig
// ServerRejoinAgeMax is used to specify the duration of time a server
// is allowed to be down/offline before a startup operation is refused.
ServerRejoinAgeMax time.Duration
} }
func (c *Config) InPrimaryDatacenter() bool { func (c *Config) InPrimaryDatacenter() bool {
@ -574,6 +578,8 @@ func DefaultConfig() *Config {
PeeringTestAllowPeerRegistrations: false, PeeringTestAllowPeerRegistrations: false,
EnterpriseConfig: DefaultEnterpriseConfig(), EnterpriseConfig: DefaultEnterpriseConfig(),
ServerRejoinAgeMax: 24 * 7 * time.Hour,
} }
// Increase our reap interval to 3 days instead of 24h. // Increase our reap interval to 3 days instead of 24h.

71
agent/consul/server_metadata.go

@ -0,0 +1,71 @@
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0
package consul
import (
"encoding/json"
"io"
"os"
"time"
)
// ServerMetadataFile is the name of the file on disk that server metadata
// should be written to. The agent joins it with its configured data directory
// to build the full path.
const ServerMetadataFile = "server_metadata.json"
// ServerMetadata holds metadata about a running server that is persisted
// as JSON.
type ServerMetadata struct {
	// LastSeenUnix records when the server was last seen, in seconds since
	// the Unix epoch.
	LastSeenUnix int64 `json:"last_seen_unix"`
}

// IsLastSeenStale reports whether the last seen timestamp lies more than d
// in the past, measured against the current wall-clock time.
func (md *ServerMetadata) IsLastSeenStale(d time.Duration) bool {
	// Anything seen before this instant is too old.
	cutoff := time.Now().Add(-d)
	return cutoff.After(time.Unix(md.LastSeenUnix, 0))
}
// OpenServerMetadata is a helper function for opening the server metadata file
// with the correct permissions.
//
// The file is opened write-only, created with mode 0600 if it does not exist,
// and truncated if it does. The caller is responsible for closing the returned
// writer. On failure the returned writer is nil.
func OpenServerMetadata(filename string) (io.WriteCloser, error) {
	f, err := os.OpenFile(filename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0600)
	if err != nil {
		// Return an untyped nil: passing the nil *os.File through would yield
		// a non-nil io.WriteCloser wrapping a nil pointer.
		return nil, err
	}
	return f, nil
}
// ServerMetadataReadFunc is the signature of a function that loads server
// metadata from the file at filename. It exists so that callers can accept
// either ReadServerMetadata or a stub supplied by a test.
type ServerMetadataReadFunc func(filename string) (*ServerMetadata, error)
// ReadServerMetadata loads the server metadata file at filename and decodes
// its JSON contents. Errors from reading the file or from decoding are
// returned unmodified, so callers can still test for os.ErrNotExist.
func ReadServerMetadata(filename string) (*ServerMetadata, error) {
	raw, err := os.ReadFile(filename)
	if err != nil {
		return nil, err
	}

	md := new(ServerMetadata)
	if err = json.Unmarshal(raw, md); err != nil {
		return nil, err
	}
	return md, nil
}
// WriteServerMetadata encodes a ServerMetadata stamped with the current time
// as JSON and writes it to w in a single Write call.
func WriteServerMetadata(w io.Writer) error {
	payload, err := json.Marshal(&ServerMetadata{LastSeenUnix: time.Now().Unix()})
	if err != nil {
		return err
	}

	_, err = w.Write(payload)
	return err
}

68
agent/consul/server_metadata_test.go

@ -0,0 +1,68 @@
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0
package consul
import (
"bytes"
"errors"
"testing"
"time"
"github.com/stretchr/testify/assert"
)
// mockServerMetadataWriter is a test double for io.Writer that discards its
// input and can be configured to fail every write.
type mockServerMetadataWriter struct {
	// writeErr, when non-nil, is returned from every call to Write.
	writeErr error
}

// Write returns writeErr with zero bytes written when it is set. Otherwise it
// reports the whole of p as written, as the io.Writer contract requires when
// the returned error is nil.
func (m *mockServerMetadataWriter) Write(p []byte) (n int, err error) {
	if m.writeErr != nil {
		return 0, m.writeErr
	}
	return len(p), nil
}
// TestServerMetadata verifies IsLastSeenStale against a 24 hour maximum age
// for timestamps on either side of that limit.
func TestServerMetadata(t *testing.T) {
	const maxAge = 24 * time.Hour
	now := time.Now()

	cases := []struct {
		name  string
		age   time.Duration
		stale bool
	}{
		// A server last seen 48 hours ago is stale.
		{name: "TestIsLastSeenStaleTrue", age: 48 * time.Hour, stale: true},
		// A server last seen 1 hour ago is not stale.
		{name: "TestIsLastSeenStaleFalse", age: 1 * time.Hour, stale: false},
	}

	for _, tc := range cases {
		tc := tc
		t.Run(tc.name, func(t *testing.T) {
			md := &ServerMetadata{LastSeenUnix: now.Add(-tc.age).Unix()}
			assert.Equal(t, tc.stale, md.IsLastSeenStale(maxAge))
		})
	}
}
// TestWriteServerMetadata checks that WriteServerMetadata propagates a write
// failure and emits a non-empty payload on success.
func TestWriteServerMetadata(t *testing.T) {
	t.Run("TestWriteError", func(t *testing.T) {
		failing := &mockServerMetadataWriter{writeErr: errors.New("write error")}
		assert.Error(t, WriteServerMetadata(failing))
	})

	t.Run("TestOK", func(t *testing.T) {
		var buf bytes.Buffer
		assert.NoError(t, WriteServerMetadata(&buf))
		assert.True(t, buf.Len() > 0)
	})
}

69
website/content/docs/agent/config/config-files.mdx

@ -7,7 +7,7 @@ description: >-
# Agents Configuration File Reference ((#configuration_files)) # Agents Configuration File Reference ((#configuration_files))
This topic describes the parameters for configuring Consul agents. For information about how to start Consul agents, refer to [Starting the Consul Agent](/consul/docs/agent#starting-the-consul-agent). This topic describes the parameters for configuring Consul agents. For information about how to start Consul agents, refer to [Starting the Consul Agent](/consul/docs/agent#starting-the-consul-agent).
## Overview ## Overview
@ -63,25 +63,25 @@ telemetry {
</CodeTabs> </CodeTabs>
### Time-to-live values ### Time-to-live values
Consul uses the Go `time` package to parse all time-to-live (TTL) values used in Consul agent configuration files. Specify integer and float values as a string and include one or more of the following units of time: Consul uses the Go `time` package to parse all time-to-live (TTL) values used in Consul agent configuration files. Specify integer and float values as a string and include one or more of the following units of time:
- `ns` - `ns`
- `us` - `us`
- `µs` - `µs`
- `ms` - `ms`
- `s` - `s`
- `m` - `m`
- `h` - `h`
Examples: Examples:
- `'300ms'` - `'300ms'`
- `'1.5h'` - `'1.5h'`
- `'2h45m'` - `'2h45m'`
Refer to the [formatting specification](https://golang.org/pkg/time/#ParseDuration) for additional information. Refer to the [formatting specification](https://golang.org/pkg/time/#ParseDuration) for additional information.
## General parameters ## General parameters
@ -549,9 +549,9 @@ Refer to the [formatting specification](https://golang.org/pkg/time/#ParseDurati
- `https_handshake_timeout` - Configures the limit for how long the HTTPS server in both client and server agents will wait for a client to complete a TLS handshake. This should be kept conservative as it limits how many connections an unauthenticated attacker can open if `verify_incoming` is being using to authenticate clients (strongly recommended in production). Default value is `5s`. - `https_handshake_timeout` - Configures the limit for how long the HTTPS server in both client and server agents will wait for a client to complete a TLS handshake. This should be kept conservative as it limits how many connections an unauthenticated attacker can open if `verify_incoming` is being using to authenticate clients (strongly recommended in production). Default value is `5s`.
- `request_limits` - This object specifies configurations that limit the rate of RPC and gRPC requests on the Consul server. Limiting the rate of gRPC and RPC requests also limits HTTP requests to the Consul server. - `request_limits` - This object specifies configurations that limit the rate of RPC and gRPC requests on the Consul server. Limiting the rate of gRPC and RPC requests also limits HTTP requests to the Consul server.
- `mode` - String value that specifies an action to take if the rate of requests exceeds the limit. You can specify the following values: - `mode` - String value that specifies an action to take if the rate of requests exceeds the limit. You can specify the following values:
- `permissive`: The server continues to allow requests and records an error in the logs. - `permissive`: The server continues to allow requests and records an error in the logs.
- `enforcing`: The server stops accepting requests and records an error in the logs. - `enforcing`: The server stops accepting requests and records an error in the logs.
- `disabled`: Limits are not enforced or tracked. This is the default value for `mode`. - `disabled`: Limits are not enforced or tracked. This is the default value for `mode`.
- `read_rate` - Integer value that specifies the number of read requests per second. Default is `100`. - `read_rate` - Integer value that specifies the number of read requests per second. Default is `100`.
- `write_rate` - Integer value that specifies the number of write requests per second. Default is `100`. - `write_rate` - Integer value that specifies the number of write requests per second. Default is `100`.
- `rpc_handshake_timeout` - Configures the limit for how long servers will wait after a client TCP connection is established before they complete the connection handshake. When TLS is used, the same timeout applies to the TLS handshake separately from the initial protocol negotiation. All Consul clients should perform this immediately on establishing a new connection. This should be kept conservative as it limits how many connections an unauthenticated attacker can open if `verify_incoming` is being using to authenticate clients (strongly recommended in production). When `verify_incoming` is true on servers, this limits how long the connection socket and associated goroutines will be held open before the client successfully authenticates. Default value is `5s`. - `rpc_handshake_timeout` - Configures the limit for how long servers will wait after a client TCP connection is established before they complete the connection handshake. When TLS is used, the same timeout applies to the TLS handshake separately from the initial protocol negotiation. All Consul clients should perform this immediately on establishing a new connection. This should be kept conservative as it limits how many connections an unauthenticated attacker can open if `verify_incoming` is being using to authenticate clients (strongly recommended in production). When `verify_incoming` is true on servers, this limits how long the connection socket and associated goroutines will be held open before the client successfully authenticates. Default value is `5s`.
@ -736,6 +736,11 @@ Refer to the [formatting specification](https://golang.org/pkg/time/#ParseDurati
- `server` Equivalent to the [`-server` command-line flag](/consul/docs/agent/config/cli-flags#_server). - `server` Equivalent to the [`-server` command-line flag](/consul/docs/agent/config/cli-flags#_server).
- `server_rejoin_age_max` - Controls the maximum length of time a server may be offline before it is no longer allowed to rejoin a cluster.
If a server has not been running for longer than this period, it refuses to start up again until an operator intervenes (for example, by wiping its data directory). This protects
clusters from instability caused by decommissioned servers accidentally being started again.
Note: the default value is `168h` (7 days) and the minimum value is `6h`.
- `non_voting_server` - **This field is deprecated in Consul 1.9.1. See the [`read_replica`](#read_replica) field instead.** - `non_voting_server` - **This field is deprecated in Consul 1.9.1. See the [`read_replica`](#read_replica) field instead.**
- `read_replica` - Equivalent to the [`-read-replica` command-line flag](/consul/docs/agent/config/cli-flags#_read_replica). - `read_replica` - Equivalent to the [`-read-replica` command-line flag](/consul/docs/agent/config/cli-flags#_read_replica).
@ -924,7 +929,7 @@ Refer to the [formatting specification](https://golang.org/pkg/time/#ParseDurati
[`acl.tokens.agent_recovery`](#acl_tokens_agent_recovery).** [`acl.tokens.agent_recovery`](#acl_tokens_agent_recovery).**
- `config_file_service_registration` ((#acl_tokens_config_file_service_registration)) - Specifies the ACL - `config_file_service_registration` ((#acl_tokens_config_file_service_registration)) - Specifies the ACL
token the agent uses to register services and checks from [service](/consul/docs/services/usage/define-services) and [check](/consul/docs/services/usage/checks) definitions token the agent uses to register services and checks from [service](/consul/docs/services/usage/define-services) and [check](/consul/docs/services/usage/checks) definitions
specified in configuration files or fragments passed to the agent using the `-hcl` specified in configuration files or fragments passed to the agent using the `-hcl`
flag. flag.
@ -1626,12 +1631,12 @@ subsystem that provides Consul's service mesh capabilities.
- `backend` ((#raft_logstore_backend)) Specifies which storage - `backend` ((#raft_logstore_backend)) Specifies which storage
engine to use to persist logs. Valid options are `boltdb` or `wal`. Default engine to use to persist logs. Valid options are `boltdb` or `wal`. Default
is `boltdb`. The `wal` option specifies an experimental backend that is `boltdb`. The `wal` option specifies an experimental backend that
should be used with caution. Refer to should be used with caution. Refer to
[Experimental WAL LogStore backend](/consul/docs/agent/wal-logstore) [Experimental WAL LogStore backend](/consul/docs/agent/wal-logstore)
for more information. for more information.
- `disable_log_cache` ((#raft_logstore_disable_log_cache)) Disables the in-memory cache for recent logs. We recommend using it for performance testing purposes, as no significant improvement has been measured when the cache is disabled. While the in-memory log cache theoretically prevents disk reads for recent logs, recent logs are also stored in the OS page cache, which does not slow either the `boltdb` or `wal` backend's ability to read them. - `disable_log_cache` ((#raft_logstore_disable_log_cache)) Disables the in-memory cache for recent logs. We recommend using it for performance testing purposes, as no significant improvement has been measured when the cache is disabled. While the in-memory log cache theoretically prevents disk reads for recent logs, recent logs are also stored in the OS page cache, which does not slow either the `boltdb` or `wal` backend's ability to read them.
- `verification` ((#raft_logstore_verification)) This is a nested object that - `verification` ((#raft_logstore_verification)) This is a nested object that
allows configuring the online verification of the LogStore. Verification allows configuring the online verification of the LogStore. Verification
@ -1652,43 +1657,43 @@ subsystem that provides Consul's service mesh capabilities.
on that server. The only correct response is to stop the server, remove its on that server. The only correct response is to stop the server, remove its
data directory, and restart so it can be caught back up with a correct data directory, and restart so it can be caught back up with a correct
server again. Please report verification failures including details about server again. Please report verification failures including details about
your hardware and workload via GitHub issues. Refer to your hardware and workload via GitHub issues. Refer to
[Experimental WAL LogStore backend](/consul/docs/agent/wal-logstore) [Experimental WAL LogStore backend](/consul/docs/agent/wal-logstore)
for more information. for more information.
- `enabled` ((#raft_logstore_verification_enabled)) - Set to `true` to - `enabled` ((#raft_logstore_verification_enabled)) - Set to `true` to
allow this Consul server to write and verify log verification checkpoints allow this Consul server to write and verify log verification checkpoints
when elected leader. when elected leader.
- `interval` ((#raft_logstore_verification_interval)) - Specifies the time - `interval` ((#raft_logstore_verification_interval)) - Specifies the time
interval between checkpoints. There is no default value. You must interval between checkpoints. There is no default value. You must
configure the `interval` and set [`enabled`](#raft_logstore_verification_enabled) configure the `interval` and set [`enabled`](#raft_logstore_verification_enabled)
to `true` to correctly enable intervals. We recommend using an interval to `true` to correctly enable intervals. We recommend using an interval
between `30s` and `5m`. The performance overhead is insignificant when the between `30s` and `5m`. The performance overhead is insignificant when the
interval is set to `5m` or less. interval is set to `5m` or less.
- `boltdb` ((#raft_logstore_boltdb)) - Object that configures options for - `boltdb` ((#raft_logstore_boltdb)) - Object that configures options for
Raft's `boltdb` backend. It has no effect if the `backend` is not `boltdb`. Raft's `boltdb` backend. It has no effect if the `backend` is not `boltdb`.
- `no_freelist_sync` ((#raft_logstore_boltdb_no_freelist_sync)) - Set to - `no_freelist_sync` ((#raft_logstore_boltdb_no_freelist_sync)) - Set to
`true` to disable storing BoltDB's freelist to disk within the `true` to disable storing BoltDB's freelist to disk within the
`raft.db` file. Disabling freelist syncs reduces the disk IO required `raft.db` file. Disabling freelist syncs reduces the disk IO required
for write operations, but could potentially increase start up time for write operations, but could potentially increase start up time
because Consul must scan the database to find free space because Consul must scan the database to find free space
within the file. within the file.
- `wal` ((#raft_logstore_wal)) - Object that configures the `wal` backend. - `wal` ((#raft_logstore_wal)) - Object that configures the `wal` backend.
Refer to [Experimental WAL LogStore backend](/consul/docs/agent/wal-logstore) Refer to [Experimental WAL LogStore backend](/consul/docs/agent/wal-logstore)
for more information. for more information.
- `segment_size_mb` ((#raft_logstore_wal_segment_size_mb)) - Integer value - `segment_size_mb` ((#raft_logstore_wal_segment_size_mb)) - Integer value
that represents the target size in MB for each segment file before that represents the target size in MB for each segment file before
rolling to a new segment. The default value is `64` and is suitable for rolling to a new segment. The default value is `64` and is suitable for
most deployments. While a smaller value may use less disk space because you most deployments. While a smaller value may use less disk space because you
can reclaim space by deleting old segments sooner, the smaller segment that results can reclaim space by deleting old segments sooner, the smaller segment that results
may affect performance because safely rotating to a new file more may affect performance because safely rotating to a new file more
frequently can impact tail latencies. Larger values are unlikely frequently can impact tail latencies. Larger values are unlikely
to improve performance significantly. We recommend using this to improve performance significantly. We recommend using this
configuration for performance testing purposes. configuration for performance testing purposes.
- `raft_protocol` ((#raft_protocol)) Equivalent to the [`-raft-protocol` - `raft_protocol` ((#raft_protocol)) Equivalent to the [`-raft-protocol`

Loading…
Cancel
Save