@ -457,7 +457,7 @@ func (a *Agent) Start() error {
if err := a . loadProxies ( c ) ; err != nil {
return err
}
if err := a . loadChecks ( c ); err != nil {
if err := a . loadChecks ( c , nil ); err != nil {
return err
}
if err := a . loadMetadata ( c ) ; err != nil {
@ -2109,10 +2109,10 @@ func (a *Agent) addServiceInternal(service *structs.NodeService, chkTypes []*str
a . PauseSync ( )
defer a . ResumeSync ( )
// Take a snapshot of the current state of checks (if any), and
// restore them before resuming anti-entropy.
// Take a snapshot of the current state of checks (if any) so that, when a
// check that already existed is added again, its previous status and output
// are carried over before anti-entropy resumes.
snap := a . snapshotCheckState ( )
defer a . restoreCheckState ( snap )
var checks [ ] * structs . HealthCheck
@ -2143,6 +2143,13 @@ func (a *Agent) addServiceInternal(service *structs.NodeService, chkTypes []*str
check . Status = chkType . Status
}
// Restore the fields from the snapshot.
prev , ok := snap [ check . CheckID ]
if ok {
check . Output = prev . Output
check . Status = prev . Status
}
checks = append ( checks , check )
}
@ -3346,10 +3353,17 @@ func (a *Agent) unloadServices() error {
// loadChecks loads check definitions and/or persisted check definitions from
// disk and re-registers them with the local agent.
func ( a * Agent ) loadChecks ( conf * config . RuntimeConfig ) error {
func ( a * Agent ) loadChecks ( conf * config . RuntimeConfig , snap map [ types . CheckID ] * structs . HealthCheck ) error {
// Register the checks from config
for _ , check := range conf . Checks {
health := check . HealthCheck ( conf . NodeName )
// Restore the fields from the snapshot.
if prev , ok := snap [ health . CheckID ] ; ok {
health . Output = prev . Output
health . Status = prev . Status
}
chkType := check . CheckType ( )
if err := a . addCheckLocked ( health , chkType , false , check . Token , ConfigSourceLocal ) ; err != nil {
return fmt . Errorf ( "Failed to register check '%s': %v %v" , check . Name , err , check )
@ -3406,6 +3420,12 @@ func (a *Agent) loadChecks(conf *config.RuntimeConfig) error {
// services into the active pool
p . Check . Status = api . HealthCritical
// Restore the fields from the snapshot.
if prev , ok := snap [ p . Check . CheckID ] ; ok {
p . Check . Output = prev . Output
p . Check . Status = prev . Status
}
if err := a . addCheckLocked ( p . Check , p . ChkType , false , p . Token , ConfigSourceLocal ) ; err != nil {
// Purge the check if it is unable to be restored.
a . logger . Printf ( "[WARN] agent: Failed to restore check %q: %s" ,
@ -3634,15 +3654,6 @@ func (a *Agent) snapshotCheckState() map[types.CheckID]*structs.HealthCheck {
return a . State . Checks ( )
}
// restoreCheckState replays a previously captured snapshot of check health
// onto the agent's local state: for every entry in snap, the saved status and
// output are written back under that entry's check ID. It is intended to run
// once a reload has finished, so that health state does not flap needlessly
// and sessions are not invalidated as a side effect.
func (a *Agent) restoreCheckState(snap map[types.CheckID]*structs.HealthCheck) {
	for checkID := range snap {
		saved := snap[checkID]
		a.State.UpdateCheck(checkID, saved.Status, saved.Output)
	}
}
// loadMetadata loads node metadata fields from the agent config and
// updates them on the local agent.
func ( a * Agent ) loadMetadata ( conf * config . RuntimeConfig ) error {
@ -3765,9 +3776,9 @@ func (a *Agent) ReloadConfig(newCfg *config.RuntimeConfig) error {
a . stateLock . Lock ( )
defer a . stateLock . Unlock ( )
// Snapshot the current state, and restore it afterwards
// Snapshot the current state, and use that to initialize the checks when
// they are recreated.
snap := a . snapshotCheckState ( )
defer a . restoreCheckState ( snap )
// First unload all checks, services, and metadata. This lets us begin the reload
// with a clean slate.
@ -3798,7 +3809,7 @@ func (a *Agent) ReloadConfig(newCfg *config.RuntimeConfig) error {
if err := a . loadProxies ( newCfg ) ; err != nil {
return fmt . Errorf ( "Failed reloading proxies: %s" , err )
}
if err := a . loadChecks ( newCfg ); err != nil {
if err := a . loadChecks ( newCfg , snap ); err != nil {
return fmt . Errorf ( "Failed reloading checks: %s" , err )
}
if err := a . loadMetadata ( newCfg ) ; err != nil {