mirror of https://github.com/hashicorp/consul
agent: first pass at join retry
parent
55f0f8d0ba
commit
7c91c08457
|
@ -52,6 +52,7 @@ type Command struct {
|
|||
func (c *Command) readConfig() *Config {
|
||||
var cmdConfig Config
|
||||
var configFiles []string
|
||||
var retryInterval string
|
||||
cmdFlags := flag.NewFlagSet("agent", flag.ContinueOnError)
|
||||
cmdFlags.Usage = func() { c.Ui.Output(c.Help()) }
|
||||
|
||||
|
@ -82,11 +83,26 @@ func (c *Command) readConfig() *Config {
|
|||
"enable re-joining after a previous leave")
|
||||
cmdFlags.Var((*AppendSliceValue)(&cmdConfig.StartJoin), "join",
|
||||
"address of agent to join on startup")
|
||||
cmdFlags.Var((*AppendSliceValue)(&cmdConfig.RetryJoin), "retry-join",
|
||||
"address of agent to join on startup with retry")
|
||||
cmdFlags.IntVar(&cmdConfig.RetryMaxAttempts, "retry-max", 0,
|
||||
"number of retries for joining")
|
||||
cmdFlags.StringVar(&retryInterval, "retry-interval", "",
|
||||
"interval between join attempts")
|
||||
|
||||
if err := cmdFlags.Parse(c.args); err != nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
if retryInterval != "" {
|
||||
dur, err := time.ParseDuration(retryInterval)
|
||||
if err != nil {
|
||||
c.Ui.Error(fmt.Sprintf("Error: %s", err))
|
||||
return nil
|
||||
}
|
||||
cmdConfig.RetryInterval = dur
|
||||
}
|
||||
|
||||
config := DefaultConfig()
|
||||
if len(configFiles) > 0 {
|
||||
fileConfig, err := ReadConfigPaths(configFiles)
|
||||
|
@ -353,6 +369,35 @@ func (c *Command) startupJoin(config *Config) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
func (c *Command) retryJoin(config *Config, errCh chan<- struct{}) {
|
||||
if len(config.RetryJoin) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
logger := c.agent.logger
|
||||
logger.Printf("[INFO] agent: Joining cluster...")
|
||||
|
||||
attempt := 0
|
||||
for {
|
||||
n, err := c.agent.JoinLAN(config.RetryJoin)
|
||||
if err == nil {
|
||||
logger.Printf("[INFO] agent: Join completed. Synced with %d initial agents", n)
|
||||
return
|
||||
}
|
||||
|
||||
attempt++
|
||||
if config.RetryMaxAttempts > 0 && attempt > config.RetryMaxAttempts {
|
||||
logger.Printf("[ERROR] agent: max join retry exhausted, exiting")
|
||||
close(errCh)
|
||||
return
|
||||
}
|
||||
|
||||
logger.Printf("[WARN] agent: Join failed: %v, retrying in %v", err,
|
||||
config.RetryInterval)
|
||||
time.Sleep(config.RetryInterval)
|
||||
}
|
||||
}
|
||||
|
||||
func (c *Command) Run(args []string) int {
|
||||
c.Ui = &cli.PrefixedUi{
|
||||
OutputPrefix: "==> ",
|
||||
|
@ -492,12 +537,16 @@ func (c *Command) Run(args []string) int {
|
|||
c.Ui.Output("Log data will now stream in as it occurs:\n")
|
||||
logGate.Flush()
|
||||
|
||||
// Start retry join process
|
||||
errCh := make(chan struct{})
|
||||
go c.retryJoin(config, errCh)
|
||||
|
||||
// Wait for exit
|
||||
return c.handleSignals(config)
|
||||
return c.handleSignals(config, errCh)
|
||||
}
|
||||
|
||||
// handleSignals blocks until we get an exit-causing signal
|
||||
func (c *Command) handleSignals(config *Config) int {
|
||||
func (c *Command) handleSignals(config *Config, retryJoin <-chan struct{}) int {
|
||||
signalCh := make(chan os.Signal, 4)
|
||||
signal.Notify(signalCh, os.Interrupt, syscall.SIGTERM, syscall.SIGHUP)
|
||||
|
||||
|
@ -511,6 +560,8 @@ WAIT:
|
|||
sig = syscall.SIGHUP
|
||||
case <-c.ShutdownCh:
|
||||
sig = os.Interrupt
|
||||
case <-retryJoin:
|
||||
return 1
|
||||
case <-c.agent.ShutdownCh():
|
||||
// Agent is already shutdown!
|
||||
return 0
|
||||
|
|
|
@ -182,6 +182,20 @@ type Config struct {
|
|||
// addresses, then the agent will error and exit.
|
||||
StartJoin []string `mapstructure:"start_join"`
|
||||
|
||||
// RetryJoin is a list of addresses to join with retry enabled.
|
||||
RetryJoin []string `mapstructure:"retry_join"`
|
||||
|
||||
// RetryMaxAttempts specifies the maximum number of times to retry joining a
|
||||
// host on startup. This is useful for cases where we know the node will be
|
||||
// online eventually.
|
||||
RetryMaxAttempts int `mapstructure:"retry_max"`
|
||||
|
||||
// RetryInterval specifies the amount of time to wait in between join
|
||||
// attempts on agent start. The minimum allowed value is 1 second and
|
||||
// the default is 30s.
|
||||
RetryInterval time.Duration `mapstructure:"-" json:"-"`
|
||||
RetryIntervalRaw string `mapstructure:"retry_interval"`
|
||||
|
||||
// UiDir is the directory containing the Web UI resources.
|
||||
// If provided, the UI endpoints will be enabled.
|
||||
UiDir string `mapstructure:"ui_dir"`
|
||||
|
@ -321,6 +335,7 @@ func DefaultConfig() *Config {
|
|||
ACLTTL: 30 * time.Second,
|
||||
ACLDownPolicy: "extend-cache",
|
||||
ACLDefaultPolicy: "allow",
|
||||
RetryInterval: 30 * time.Second,
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -443,6 +458,14 @@ func DecodeConfig(r io.Reader) (*Config, error) {
|
|||
result.ACLTTL = dur
|
||||
}
|
||||
|
||||
if raw := result.RetryIntervalRaw; raw != "" {
|
||||
dur, err := time.ParseDuration(raw)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("RetryInterval invalid: %v", err)
|
||||
}
|
||||
result.RetryInterval = dur
|
||||
}
|
||||
|
||||
return &result, nil
|
||||
}
|
||||
|
||||
|
@ -674,6 +697,9 @@ func MergeConfig(a, b *Config) *Config {
|
|||
if b.RejoinAfterLeave {
|
||||
result.RejoinAfterLeave = true
|
||||
}
|
||||
if b.RetryMaxAttempts != 0 {
|
||||
result.RetryMaxAttempts = b.RetryMaxAttempts
|
||||
}
|
||||
if b.DNSConfig.NodeTTL != 0 {
|
||||
result.DNSConfig.NodeTTL = b.DNSConfig.NodeTTL
|
||||
}
|
||||
|
@ -737,6 +763,11 @@ func MergeConfig(a, b *Config) *Config {
|
|||
result.StartJoin = append(result.StartJoin, a.StartJoin...)
|
||||
result.StartJoin = append(result.StartJoin, b.StartJoin...)
|
||||
|
||||
// Copy the retry join addresses
|
||||
result.RetryJoin = make([]string, 0, len(a.RetryJoin)+len(b.RetryJoin))
|
||||
result.RetryJoin = append(result.RetryJoin, a.RetryJoin...)
|
||||
result.RetryJoin = append(result.RetryJoin, b.RetryJoin...)
|
||||
|
||||
return &result
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue