diff --git a/command/agent/command.go b/command/agent/command.go index 8058bfb0ab..cc1ccf57a8 100644 --- a/command/agent/command.go +++ b/command/agent/command.go @@ -52,6 +52,7 @@ type Command struct { func (c *Command) readConfig() *Config { var cmdConfig Config var configFiles []string + var retryInterval string cmdFlags := flag.NewFlagSet("agent", flag.ContinueOnError) cmdFlags.Usage = func() { c.Ui.Output(c.Help()) } @@ -82,11 +83,26 @@ func (c *Command) readConfig() *Config { "enable re-joining after a previous leave") cmdFlags.Var((*AppendSliceValue)(&cmdConfig.StartJoin), "join", "address of agent to join on startup") + cmdFlags.Var((*AppendSliceValue)(&cmdConfig.RetryJoin), "retry-join", + "address of agent to join on startup with retry") + cmdFlags.IntVar(&cmdConfig.RetryMaxAttempts, "retry-max", 0, + "number of retries for joining") + cmdFlags.StringVar(&retryInterval, "retry-interval", "", + "interval between join attempts") if err := cmdFlags.Parse(c.args); err != nil { return nil } + if retryInterval != "" { + dur, err := time.ParseDuration(retryInterval) + if err != nil { + c.Ui.Error(fmt.Sprintf("Error: %s", err)) + return nil + } + cmdConfig.RetryInterval = dur + } + config := DefaultConfig() if len(configFiles) > 0 { fileConfig, err := ReadConfigPaths(configFiles) @@ -353,6 +369,35 @@ func (c *Command) startupJoin(config *Config) error { return nil } +func (c *Command) retryJoin(config *Config, errCh chan<- struct{}) { + if len(config.RetryJoin) == 0 { + return + } + + logger := c.agent.logger + logger.Printf("[INFO] agent: Joining cluster...") + + attempt := 0 + for { + n, err := c.agent.JoinLAN(config.RetryJoin) + if err == nil { + logger.Printf("[INFO] agent: Join completed. Synced with %d initial agents", n) + return + } + + attempt++ + if config.RetryMaxAttempts > 0 && attempt > config.RetryMaxAttempts { + logger.Printf("[ERROR] agent: max join retry exhausted, exiting") + close(errCh) + return + } + + logger.Printf("[WARN] agent: Join failed: %v, retrying in %v", err, + config.RetryInterval) + time.Sleep(config.RetryInterval) + } +} + func (c *Command) Run(args []string) int { c.Ui = &cli.PrefixedUi{ OutputPrefix: "==> ", @@ -492,12 +537,16 @@ func (c *Command) Run(args []string) int { c.Ui.Output("Log data will now stream in as it occurs:\n") logGate.Flush() + // Start retry join process + errCh := make(chan struct{}) + go c.retryJoin(config, errCh) + // Wait for exit - return c.handleSignals(config) + return c.handleSignals(config, errCh) } // handleSignals blocks until we get an exit-causing signal -func (c *Command) handleSignals(config *Config) int { +func (c *Command) handleSignals(config *Config, retryJoin <-chan struct{}) int { signalCh := make(chan os.Signal, 4) signal.Notify(signalCh, os.Interrupt, syscall.SIGTERM, syscall.SIGHUP) @@ -511,6 +560,8 @@ WAIT: sig = syscall.SIGHUP case <-c.ShutdownCh: sig = os.Interrupt + case <-retryJoin: + return 1 case <-c.agent.ShutdownCh(): // Agent is already shutdown! return 0 diff --git a/command/agent/config.go b/command/agent/config.go index c9a47fa2d6..419fa90914 100644 --- a/command/agent/config.go +++ b/command/agent/config.go @@ -182,6 +182,20 @@ type Config struct { // addresses, then the agent will error and exit. StartJoin []string `mapstructure:"start_join"` + // RetryJoin is a list of addresses to join with retry enabled. + RetryJoin []string `mapstructure:"retry_join"` + + // RetryMaxAttempts specifies the maximum number of times to retry joining a + // host on startup. This is useful for cases where we know the node will be + // online eventually. + RetryMaxAttempts int `mapstructure:"retry_max"` + + // RetryInterval specifies the amount of time to wait in between join + // attempts on agent start. The minimum allowed value is 1 second and + // the default is 30s. + RetryInterval time.Duration `mapstructure:"-" json:"-"` + RetryIntervalRaw string `mapstructure:"retry_interval"` + // UiDir is the directory containing the Web UI resources. // If provided, the UI endpoints will be enabled. UiDir string `mapstructure:"ui_dir"` @@ -321,6 +335,7 @@ func DefaultConfig() *Config { ACLTTL: 30 * time.Second, ACLDownPolicy: "extend-cache", ACLDefaultPolicy: "allow", + RetryInterval: 30 * time.Second, } } @@ -443,6 +458,14 @@ func DecodeConfig(r io.Reader) (*Config, error) { result.ACLTTL = dur } + if raw := result.RetryIntervalRaw; raw != "" { + dur, err := time.ParseDuration(raw) + if err != nil { + return nil, fmt.Errorf("RetryInterval invalid: %v", err) + } + result.RetryInterval = dur + } + return &result, nil } @@ -674,6 +697,9 @@ func MergeConfig(a, b *Config) *Config { if b.RejoinAfterLeave { result.RejoinAfterLeave = true } + if b.RetryMaxAttempts != 0 { + result.RetryMaxAttempts = b.RetryMaxAttempts + } if b.DNSConfig.NodeTTL != 0 { result.DNSConfig.NodeTTL = b.DNSConfig.NodeTTL } @@ -737,6 +763,11 @@ func MergeConfig(a, b *Config) *Config { result.StartJoin = append(result.StartJoin, a.StartJoin...) result.StartJoin = append(result.StartJoin, b.StartJoin...) + // Copy the retry join addresses + result.RetryJoin = make([]string, 0, len(a.RetryJoin)+len(b.RetryJoin)) + result.RetryJoin = append(result.RetryJoin, a.RetryJoin...) + result.RetryJoin = append(result.RetryJoin, b.RetryJoin...) + return &result }