agent: ensure node info sync and full sync. (#7189)

This fixes #7020.

There are two problems this PR solves:
  * If the node info changes, service and check registrations are likely to fail with permission errors unless their service tokens also have node:write, a permission the services you register hopefully do not have.
  * The timer for a full sync is reset on every partial sync, so frequent partial syncs can prevent a full sync from ever happening (see the sketch after this list).
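
A minimal, runnable sketch of the second problem, using illustrative names and timings rather than the real Consul code: when the full-sync timeout is created with time.After inside the event loop's select, a fresh timer is created every time the loop handles a partial-sync event, so the deadline keeps moving and the full sync may never fire. Keeping the timer channel across iterations and resetting it only after a full sync, which is what the nextFullSyncCh/resetNextFullSyncCh changes below do, fixes that.

package main

import (
    "fmt"
    "time"
)

func main() {
    partial := make(chan struct{})
    go func() {
        // Partial-sync triggers arrive faster than the full-sync interval.
        for {
            time.Sleep(30 * time.Millisecond)
            partial <- struct{}{}
        }
    }()

    const fullSyncInterval = 100 * time.Millisecond

    // Buggy pattern (what the old code effectively did): the timer is
    // recreated on every wait, so it never gets a chance to fire.
    //
    //   select {
    //   case <-partial:                      // handle partial sync
    //   case <-time.After(fullSyncInterval): // new timer each iteration!
    //   }
    //
    // Fixed pattern: keep the channel across iterations and reset it only
    // after a full sync has actually been performed.
    nextFullSync := time.After(fullSyncInterval)
    for i := 0; i < 20; i++ {
        select {
        case <-partial:
            fmt.Println("partial sync")
        case <-nextFullSync:
            fmt.Println("full sync")
            nextFullSync = time.After(fullSyncInterval)
        }
    }
}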

Instead of syncing node info last, after services and checks (which could save one RPC, since node info is included in every service sync), I am now syncing node info first. It is only ever a single RPC, and it is only sent when the node info has actually changed. This way node info is guaranteed to be synced even when something goes wrong with the services or checks, which is more likely simply because there are many more of those syncs. A simplified sketch of the resulting order follows.
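
The resulting order, as a self-contained sketch: node info is pushed first as its own RPC when it is out of sync, and the subsequent service (and check) registrations set SkipNodeUpdate so they neither resend nor clobber node-level data. All types and helpers below (state, registerRequest, pendingServices, and so on) are illustrative stand-ins, not the agent/local.State code in this diff.

package main

import "fmt"

type registerRequest struct {
    Node           string
    Service        string
    SkipNodeUpdate bool
}

type state struct {
    nodeInfoInSync  bool
    pendingServices []string
}

// syncNodeInfo stands in for the single node-level register RPC, sent with
// the agent/node token (the one expected to have node:write).
func (s *state) syncNodeInfo() error {
    fmt.Println("register node info (agent/node token, needs node:write)")
    s.nodeInfoInSync = true
    return nil
}

// syncService stands in for a service register RPC sent with the service
// token; SkipNodeUpdate keeps it from touching node-level data.
func (s *state) syncService(name string) error {
    req := registerRequest{Node: "node1", Service: name, SkipNodeUpdate: s.nodeInfoInSync}
    fmt.Printf("register service %s (SkipNodeUpdate=%v)\n", req.Service, req.SkipNodeUpdate)
    return nil
}

func (s *state) syncChanges() error {
    // Node info first: one RPC, only when it is out of sync.
    if !s.nodeInfoInSync {
        if err := s.syncNodeInfo(); err != nil {
            return err
        }
    }
    // Services (and checks) afterwards.
    for _, name := range s.pendingServices {
        if err := s.syncService(name); err != nil {
            return err
        }
    }
    return nil
}

func main() {
    s := &state{pendingServices: []string{"web", "api"}}
    if err := s.syncChanges(); err != nil {
        fmt.Println("sync failed:", err)
    }
}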
Hans Hasselberg, 5 years ago (committed by GitHub)
parent d364a64f9a
commit 6a18f01b42

@@ -96,6 +96,10 @@ type StateSyncer struct {
     // syncChangesEvent generates an event based on multiple conditions
     // when the state machine is performing partial state syncs.
     syncChangesEvent func() event
+
+    // nextFullSyncCh is a chan that receives a time.Time when the next
+    // full sync should occur.
+    nextFullSyncCh <-chan time.Time
 }
 
 const (
@@ -148,6 +152,7 @@ func (s *StateSyncer) Run() {
     if s.ClusterSize == nil {
         panic("ClusterSize not set")
     }
+    s.resetNextFullSyncCh()
     s.runFSM(fullSyncState, s.nextFSMState)
 }
@@ -245,8 +250,9 @@ func (s *StateSyncer) retrySyncFullEventFn() event {
         }
 
     // retry full sync after some time
-    // todo(fs): why don't we use s.Interval here?
+    // it is using retryFailInterval because it is retrying the sync
     case <-time.After(s.retryFailInterval + s.stagger(s.retryFailInterval)):
+        s.resetNextFullSyncCh()
         return syncFullTimerEvent
 
     case <-s.ShutdownCh:
@@ -266,13 +272,15 @@ func (s *StateSyncer) syncChangesEventFn() event {
     case <-s.SyncFull.Notif():
         select {
         case <-time.After(s.stagger(s.serverUpInterval)):
+            s.resetNextFullSyncCh()
             return syncFullNotifEvent
         case <-s.ShutdownCh:
             return shutdownEvent
         }
 
     // time for a full sync again
-    case <-time.After(s.Interval + s.stagger(s.Interval)):
+    case <-s.nextFullSyncCh:
+        s.resetNextFullSyncCh()
         return syncFullTimerEvent
 
     // do partial syncs on demand
@@ -284,6 +292,16 @@ func (s *StateSyncer) syncChangesEventFn() event {
     }
 }
 
+// resetNextFullSyncCh resets nextFullSyncCh and sets it to interval+stagger.
+// Call this function every time a full sync is performed.
+func (s *StateSyncer) resetNextFullSyncCh() {
+    if s.stagger != nil {
+        s.nextFullSyncCh = time.After(s.Interval + s.stagger(s.Interval))
+    } else {
+        s.nextFullSyncCh = time.After(s.Interval)
+    }
+}
+
 // stubbed out for testing
 var libRandomStagger = lib.RandomStagger

@@ -400,5 +400,6 @@ func testSyncer(t *testing.T) *StateSyncer {
     l := NewStateSyncer(nil, time.Second, nil, logger)
     l.stagger = func(d time.Duration) time.Duration { return d }
     l.ClusterSize = func() int { return 1 }
+    l.resetNextFullSyncCh()
     return l
 }

@@ -1023,6 +1023,15 @@ func (l *State) SyncChanges() error {
     l.Lock()
     defer l.Unlock()
 
+    // Sync the node level info if we need to.
+    if l.nodeInfoInSync {
+        l.logger.Debug("Node info in sync")
+    } else {
+        if err := l.syncNodeInfo(); err != nil {
+            return err
+        }
+    }
+
     // We will do node-level info syncing at the end, since it will get
     // updated by a service or check sync anyway, given how the register
     // API works.
@@ -1064,14 +1073,7 @@ func (l *State) SyncChanges() error {
             return err
         }
     }
-
-    // Now sync the node level info if we need to, and didn't do any of
-    // the other sync operations.
-    if l.nodeInfoInSync {
-        l.logger.Debug("Node info in sync")
-        return nil
-    }
-    return l.syncNodeInfo()
+    return nil
 }
 
 // deleteService is used to delete a service from the server
@@ -1204,6 +1206,7 @@ func (l *State) syncService(key structs.ServiceID) error {
         Service: l.services[key].Service,
         EnterpriseMeta: key.EnterpriseMeta,
         WriteRequest: structs.WriteRequest{Token: st},
+        SkipNodeUpdate: l.nodeInfoInSync,
     }
 
     // Backwards-compatibility for Consul < 0.5
@@ -1266,6 +1269,7 @@ func (l *State) syncCheck(key structs.CheckID) error {
         Check: c.Check,
         EnterpriseMeta: c.Check.EnterpriseMeta,
         WriteRequest: structs.WriteRequest{Token: ct},
+        SkipNodeUpdate: l.nodeInfoInSync,
     }
 
     var serviceKey structs.ServiceID

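For context, SkipNodeUpdate is a pre-existing field on the catalog register request: when the node is already registered and the flag is set, the server applies only the service/check part of the request and leaves the node-level data alone. The following is a self-contained, simplified sketch of that behaviour, not the real catalog endpoint; every type and field name besides SkipNodeUpdate is illustrative.

package main

import "fmt"

// Illustrative types only; the real request is structs.RegisterRequest.
type node struct {
    Name    string
    Address string
}

type registerRequest struct {
    Node           node
    ServiceName    string
    SkipNodeUpdate bool // leave node-level data alone if the node exists
}

type catalog struct {
    nodes    map[string]node
    services map[string][]string // node name -> registered services
}

// register sketches the server-side handling: node data is only written when
// the node is new or SkipNodeUpdate is false.
func (c *catalog) register(req registerRequest) {
    if _, exists := c.nodes[req.Node.Name]; !exists || !req.SkipNodeUpdate {
        c.nodes[req.Node.Name] = req.Node
    }
    if req.ServiceName != "" {
        c.services[req.Node.Name] = append(c.services[req.Node.Name], req.ServiceName)
    }
}

func main() {
    c := &catalog{nodes: map[string]node{}, services: map[string][]string{}}

    // Node registration carrying the current node data.
    c.register(registerRequest{Node: node{Name: "node1", Address: "10.0.0.1"}})

    // A service sync with SkipNodeUpdate set does not clobber the address,
    // even though its copy of the node data is stale.
    c.register(registerRequest{
        Node:           node{Name: "node1", Address: "0.0.0.0"},
        ServiceName:    "web",
        SkipNodeUpdate: true,
    })

    fmt.Println(c.nodes["node1"].Address) // prints 10.0.0.1
}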