mirror of https://github.com/hashicorp/consul
Implement Leader Routine Management (#6580)
* Implement leader routine manager

Switch over the following to use it for go routine management:

• Config entry Replication
• ACL replication - tokens, policies, roles and legacy tokens
• ACL legacy token upgrade
• ACL token reaping
• Intention Replication
• Secondary CA Roots Watching
• CA Root Pruning

Also added the StopAll call into the Server Shutdown method to ensure all leader routines get killed off when shutting down. This should be mostly unnecessary, as `revokeLeadership` should manually stop each one, but just in case we really want these to go away (eventually).
parent 28221f66f2
commit d65bbbfd4e
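The change described in the commit message is one pattern applied everywhere: each leader background task becomes a `func(ctx context.Context) error` that blocks until its context is canceled, and a name-keyed manager owns the contexts. The sketch below is only an illustration of that pattern, not the Consul code itself; the manager type, routine name, and timings here are made up, and the real `LeaderRoutineManager` added by this commit appears in the diff further down.

// Sketch only: a stripped-down illustration of the leader-routine pattern
// (name-keyed goroutines managed via context cancellation). Not Consul code.
package main

import (
	"context"
	"fmt"
	"sync"
	"time"
)

// LeaderRoutine mirrors the signature used by the manager: a routine blocks
// until the supplied context is canceled, then returns.
type LeaderRoutine func(ctx context.Context) error

type routineManager struct {
	mu       sync.Mutex
	routines map[string]context.CancelFunc
}

func newRoutineManager() *routineManager {
	return &routineManager{routines: make(map[string]context.CancelFunc)}
}

// Start launches the named routine unless it is already running.
func (m *routineManager) Start(name string, fn LeaderRoutine) {
	m.mu.Lock()
	defer m.mu.Unlock()
	if _, running := m.routines[name]; running {
		return
	}
	ctx, cancel := context.WithCancel(context.Background())
	m.routines[name] = cancel
	go func() {
		if err := fn(ctx); err != nil && err != context.Canceled {
			fmt.Printf("routine %q exited with error: %v\n", name, err)
		}
	}()
}

// StopAll cancels every managed routine; used on shutdown or loss of leadership.
func (m *routineManager) StopAll() {
	m.mu.Lock()
	defer m.mu.Unlock()
	for name, cancel := range m.routines {
		cancel()
		delete(m.routines, name)
	}
}

func main() {
	mgr := newRoutineManager()
	// A routine that does periodic work until its context is canceled.
	mgr.Start("example pruning", func(ctx context.Context) error {
		ticker := time.NewTicker(100 * time.Millisecond)
		defer ticker.Stop()
		for {
			select {
			case <-ctx.Done():
				return nil
			case <-ticker.C:
				fmt.Println("pruning...")
			}
		}
	})

	time.Sleep(350 * time.Millisecond)
	mgr.StopAll() // analogous to the StopAll call added to Server.Shutdown
	time.Sleep(50 * time.Millisecond)
}

Stopping routines by name (or all at once on shutdown) replaces the per-feature lock/enabled-flag/cancel-func bookkeeping that the diff below removes.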
@@ -9,17 +9,27 @@ import (
 	"golang.org/x/time/rate"
 )
 
-func (s *Server) startACLTokenReaping() {
-	s.aclTokenReapLock.Lock()
-	defer s.aclTokenReapLock.Unlock()
+func (s *Server) reapExpiredTokens(ctx context.Context) error {
+	limiter := rate.NewLimiter(aclTokenReapingRateLimit, aclTokenReapingBurst)
+	for {
+		if err := limiter.Wait(ctx); err != nil {
+			return err
+		}
 
-	if s.aclTokenReapEnabled {
-		return
+		if s.LocalTokensEnabled() {
+			if _, err := s.reapExpiredLocalACLTokens(); err != nil {
+				s.logger.Printf("[ERR] acl: error reaping expired local ACL tokens: %v", err)
+			}
+		}
+		if s.InACLDatacenter() {
+			if _, err := s.reapExpiredGlobalACLTokens(); err != nil {
+				s.logger.Printf("[ERR] acl: error reaping expired global ACL tokens: %v", err)
+			}
+		}
 	}
+}
 
-	ctx, cancel := context.WithCancel(context.Background())
-	s.aclTokenReapCancel = cancel
+func (s *Server) startACLTokenReaping() {
 
 	// Do a quick check for config settings that would imply the goroutine
 	// below will just spin forever.
 	//
@@ -30,41 +40,11 @@ func (s *Server) startACLTokenReaping() {
 		return
 	}
 
-	go func() {
-		limiter := rate.NewLimiter(aclTokenReapingRateLimit, aclTokenReapingBurst)
-
-		for {
-			if err := limiter.Wait(ctx); err != nil {
-				return
-			}
-
-			if s.LocalTokensEnabled() {
-				if _, err := s.reapExpiredLocalACLTokens(); err != nil {
-					s.logger.Printf("[ERR] acl: error reaping expired local ACL tokens: %v", err)
-				}
-			}
-			if s.InACLDatacenter() {
-				if _, err := s.reapExpiredGlobalACLTokens(); err != nil {
-					s.logger.Printf("[ERR] acl: error reaping expired global ACL tokens: %v", err)
-				}
-			}
-		}
-	}()
-
-	s.aclTokenReapEnabled = true
+	s.leaderRoutineManager.Start(aclTokenReapingRoutineName, s.reapExpiredTokens)
 }
 
 func (s *Server) stopACLTokenReaping() {
-	s.aclTokenReapLock.Lock()
-	defer s.aclTokenReapLock.Unlock()
-
-	if !s.aclTokenReapEnabled {
-		return
-	}
-
-	s.aclTokenReapCancel()
-	s.aclTokenReapCancel = nil
-	s.aclTokenReapEnabled = false
+	s.leaderRoutineManager.Stop(aclTokenReapingRoutineName)
 }
 
 func (s *Server) reapExpiredGlobalACLTokens() (int, error) {
@@ -649,239 +649,244 @@ func (s *Server) initializeACLs(upgrade bool) error {
 	return nil
 }
 
+// This function is only intended to be run as a managed go routine, it will block until
+// the context passed in indicates that it should exit.
+func (s *Server) legacyACLTokenUpgrade(ctx context.Context) error {
+	limiter := rate.NewLimiter(aclUpgradeRateLimit, int(aclUpgradeRateLimit))
+	for {
+		if err := limiter.Wait(ctx); err != nil {
+			return err
+		}
+
+		// actually run the upgrade here
+		state := s.fsm.State()
+		tokens, waitCh, err := state.ACLTokenListUpgradeable(aclUpgradeBatchSize)
+		if err != nil {
+			s.logger.Printf("[WARN] acl: encountered an error while searching for tokens without accessor ids: %v", err)
+		}
+		// No need to check expiration time here, as that only exists for v2 tokens.
+
+		if len(tokens) == 0 {
+			ws := memdb.NewWatchSet()
+			ws.Add(state.AbandonCh())
+			ws.Add(waitCh)
+			ws.Add(ctx.Done())
+
+			// wait for more tokens to need upgrading or the aclUpgradeCh to be closed
+			ws.Watch(nil)
+			continue
+		}
+
+		var newTokens structs.ACLTokens
+		for _, token := range tokens {
+			// This should be entirely unnecessary but is just a small safeguard against changing accessor IDs
+			if token.AccessorID != "" {
+				continue
+			}
+
+			newToken := *token
+			if token.SecretID == anonymousToken {
+				newToken.AccessorID = structs.ACLTokenAnonymousID
+			} else {
+				accessor, err := lib.GenerateUUID(s.checkTokenUUID)
+				if err != nil {
+					s.logger.Printf("[WARN] acl: failed to generate accessor during token auto-upgrade: %v", err)
+					continue
+				}
+				newToken.AccessorID = accessor
+			}
+
+			// Assign the global-management policy to legacy management tokens
+			if len(newToken.Policies) == 0 &&
+				len(newToken.ServiceIdentities) == 0 &&
+				len(newToken.Roles) == 0 &&
+				newToken.Type == structs.ACLTokenTypeManagement {
+				newToken.Policies = append(newToken.Policies, structs.ACLTokenPolicyLink{ID: structs.ACLPolicyGlobalManagementID})
+			}
+
+			// need to copy these as we are going to do a CAS operation.
+			newToken.CreateIndex = token.CreateIndex
+			newToken.ModifyIndex = token.ModifyIndex
+
+			newToken.SetHash(true)
+
+			newTokens = append(newTokens, &newToken)
+		}
+
+		req := &structs.ACLTokenBatchSetRequest{Tokens: newTokens, CAS: true}
+
+		resp, err := s.raftApply(structs.ACLTokenSetRequestType, req)
+		if err != nil {
+			s.logger.Printf("[ERR] acl: failed to apply acl token upgrade batch: %v", err)
+		}
+
+		if err, ok := resp.(error); ok {
+			s.logger.Printf("[ERR] acl: failed to apply acl token upgrade batch: %v", err)
+		}
+	}
+}
+
 func (s *Server) startACLUpgrade() {
-	s.aclUpgradeLock.Lock()
-	defer s.aclUpgradeLock.Unlock()
-
-	if s.aclUpgradeEnabled {
-		return
-	}
-
-	ctx, cancel := context.WithCancel(context.Background())
-	s.aclUpgradeCancel = cancel
-
-	go func() {
-		limiter := rate.NewLimiter(aclUpgradeRateLimit, int(aclUpgradeRateLimit))
-		for {
-			if err := limiter.Wait(ctx); err != nil {
-				return
-			}
-
-			// actually run the upgrade here
-			state := s.fsm.State()
-			tokens, waitCh, err := state.ACLTokenListUpgradeable(aclUpgradeBatchSize)
-			if err != nil {
-				s.logger.Printf("[WARN] acl: encountered an error while searching for tokens without accessor ids: %v", err)
-			}
-			// No need to check expiration time here, as that only exists for v2 tokens.
-
-			if len(tokens) == 0 {
-				ws := memdb.NewWatchSet()
-				ws.Add(state.AbandonCh())
-				ws.Add(waitCh)
-				ws.Add(ctx.Done())
-
-				// wait for more tokens to need upgrading or the aclUpgradeCh to be closed
-				ws.Watch(nil)
-				continue
-			}
-
-			var newTokens structs.ACLTokens
-			for _, token := range tokens {
-				// This should be entirely unnecessary but is just a small safeguard against changing accessor IDs
-				if token.AccessorID != "" {
-					continue
-				}
-
-				newToken := *token
-				if token.SecretID == anonymousToken {
-					newToken.AccessorID = structs.ACLTokenAnonymousID
-				} else {
-					accessor, err := lib.GenerateUUID(s.checkTokenUUID)
-					if err != nil {
-						s.logger.Printf("[WARN] acl: failed to generate accessor during token auto-upgrade: %v", err)
-						continue
-					}
-					newToken.AccessorID = accessor
-				}
-
-				// Assign the global-management policy to legacy management tokens
-				if len(newToken.Policies) == 0 &&
-					len(newToken.ServiceIdentities) == 0 &&
-					len(newToken.Roles) == 0 &&
-					newToken.Type == structs.ACLTokenTypeManagement {
-					newToken.Policies = append(newToken.Policies, structs.ACLTokenPolicyLink{ID: structs.ACLPolicyGlobalManagementID})
-				}
-
-				// need to copy these as we are going to do a CAS operation.
-				newToken.CreateIndex = token.CreateIndex
-				newToken.ModifyIndex = token.ModifyIndex
-
-				newToken.SetHash(true)
-
-				newTokens = append(newTokens, &newToken)
-			}
-
-			req := &structs.ACLTokenBatchSetRequest{Tokens: newTokens, CAS: true}
-
-			resp, err := s.raftApply(structs.ACLTokenSetRequestType, req)
-			if err != nil {
-				s.logger.Printf("[ERR] acl: failed to apply acl token upgrade batch: %v", err)
-			}
-
-			if err, ok := resp.(error); ok {
-				s.logger.Printf("[ERR] acl: failed to apply acl token upgrade batch: %v", err)
-			}
-		}
-	}()
-
-	s.aclUpgradeEnabled = true
+	if s.config.PrimaryDatacenter != s.config.Datacenter {
+		// token upgrades should only run in the primary
+		return
+	}
+
+	s.leaderRoutineManager.Start(aclUpgradeRoutineName, s.legacyACLTokenUpgrade)
 }
 
 func (s *Server) stopACLUpgrade() {
-	s.aclUpgradeLock.Lock()
-	defer s.aclUpgradeLock.Unlock()
-
-	if !s.aclUpgradeEnabled {
-		return
-	}
-
-	s.aclUpgradeCancel()
-	s.aclUpgradeCancel = nil
-	s.aclUpgradeEnabled = false
+	s.leaderRoutineManager.Stop(aclUpgradeRoutineName)
 }
 
+// This function is only intended to be run as a managed go routine, it will block until
+// the context passed in indicates that it should exit.
+func (s *Server) runLegacyACLReplication(ctx context.Context) error {
+	var lastRemoteIndex uint64
+	limiter := rate.NewLimiter(rate.Limit(s.config.ACLReplicationRate), s.config.ACLReplicationBurst)
+
+	for {
+		if err := limiter.Wait(ctx); err != nil {
+			return err
+		}
+
+		if s.tokens.ReplicationToken() == "" {
+			continue
+		}
+
+		index, exit, err := s.replicateLegacyACLs(lastRemoteIndex, ctx)
+		if exit {
+			return nil
+		}
+
+		if err != nil {
+			lastRemoteIndex = 0
+			s.updateACLReplicationStatusError()
+			s.logger.Printf("[WARN] consul: Legacy ACL replication error (will retry if still leader): %v", err)
+		} else {
+			lastRemoteIndex = index
+			s.updateACLReplicationStatusIndex(structs.ACLReplicateLegacy, index)
+			s.logger.Printf("[DEBUG] consul: Legacy ACL replication completed through remote index %d", index)
+		}
+	}
+}
+
 func (s *Server) startLegacyACLReplication() {
-	s.aclReplicationLock.Lock()
-	defer s.aclReplicationLock.Unlock()
-
-	if s.aclReplicationEnabled {
+	if s.InACLDatacenter() {
+		return
+	}
+
+	// unlike some other leader routines this initializes some extra state
+	// and therefore we want to prevent re-initialization if things are already
+	// running
+	if s.leaderRoutineManager.IsRunning(legacyACLReplicationRoutineName) {
 		return
 	}
 
 	s.initReplicationStatus()
-	ctx, cancel := context.WithCancel(context.Background())
-	s.aclReplicationCancel = cancel
-
-	go func() {
-		var lastRemoteIndex uint64
-		limiter := rate.NewLimiter(rate.Limit(s.config.ACLReplicationRate), s.config.ACLReplicationBurst)
-
-		for {
-			if err := limiter.Wait(ctx); err != nil {
-				return
-			}
-
-			if s.tokens.ReplicationToken() == "" {
-				continue
-			}
-
-			index, exit, err := s.replicateLegacyACLs(lastRemoteIndex, ctx)
-			if exit {
-				return
-			}
-
-			if err != nil {
-				lastRemoteIndex = 0
-				s.updateACLReplicationStatusError()
-				s.logger.Printf("[WARN] consul: Legacy ACL replication error (will retry if still leader): %v", err)
-			} else {
-				lastRemoteIndex = index
-				s.updateACLReplicationStatusIndex(structs.ACLReplicateLegacy, index)
-				s.logger.Printf("[DEBUG] consul: Legacy ACL replication completed through remote index %d", index)
-			}
-		}
-	}()
-
+
+	s.leaderRoutineManager.Start(legacyACLReplicationRoutineName, s.runLegacyACLReplication)
+	s.logger.Printf("[INFO] acl: started legacy ACL replication")
 	s.updateACLReplicationStatusRunning(structs.ACLReplicateLegacy)
-	s.aclReplicationEnabled = true
 }
 
 func (s *Server) startACLReplication() {
-	s.aclReplicationLock.Lock()
-	defer s.aclReplicationLock.Unlock()
-
-	if s.aclReplicationEnabled {
+	if s.InACLDatacenter() {
+		return
+	}
+
+	// unlike some other leader routines this initializes some extra state
+	// and therefore we want to prevent re-initialization if things are already
+	// running
+	if s.leaderRoutineManager.IsRunning(aclPolicyReplicationRoutineName) {
 		return
 	}
 
 	s.initReplicationStatus()
-	ctx, cancel := context.WithCancel(context.Background())
-	s.aclReplicationCancel = cancel
-
-	s.startACLReplicator(ctx, structs.ACLReplicatePolicies, s.replicateACLPolicies)
-	s.startACLReplicator(ctx, structs.ACLReplicateRoles, s.replicateACLRoles)
+	s.leaderRoutineManager.Start(aclPolicyReplicationRoutineName, s.runACLPolicyReplicator)
+	s.leaderRoutineManager.Start(aclRoleReplicationRoutineName, s.runACLRoleReplicator)
 
 	if s.config.ACLTokenReplication {
-		s.startACLReplicator(ctx, structs.ACLReplicateTokens, s.replicateACLTokens)
+		s.leaderRoutineManager.Start(aclTokenReplicationRoutineName, s.runACLTokenReplicator)
 		s.updateACLReplicationStatusRunning(structs.ACLReplicateTokens)
 	} else {
 		s.updateACLReplicationStatusRunning(structs.ACLReplicatePolicies)
 	}
-
-	s.aclReplicationEnabled = true
 }
 
 type replicateFunc func(ctx context.Context, lastRemoteIndex uint64) (uint64, bool, error)
 
-func (s *Server) startACLReplicator(ctx context.Context, replicationType structs.ACLReplicationType, replicateFunc replicateFunc) {
-	go func() {
-		var failedAttempts uint
-		limiter := rate.NewLimiter(rate.Limit(s.config.ACLReplicationRate), s.config.ACLReplicationBurst)
-
-		var lastRemoteIndex uint64
-		for {
-			if err := limiter.Wait(ctx); err != nil {
-				return
-			}
-
-			if s.tokens.ReplicationToken() == "" {
-				continue
-			}
-
-			index, exit, err := replicateFunc(ctx, lastRemoteIndex)
-			if exit {
-				return
-			}
-
-			if err != nil {
-				lastRemoteIndex = 0
-				s.updateACLReplicationStatusError()
-				s.logger.Printf("[WARN] consul: ACL %s replication error (will retry if still leader): %v", replicationType.SingularNoun(), err)
-				if (1 << failedAttempts) < aclReplicationMaxRetryBackoff {
-					failedAttempts++
-				}
-
-				select {
-				case <-ctx.Done():
-					return
-				case <-time.After((1 << failedAttempts) * time.Second):
-					// do nothing
-				}
-			} else {
-				lastRemoteIndex = index
-				s.updateACLReplicationStatusIndex(replicationType, index)
-				s.logger.Printf("[DEBUG] consul: ACL %s replication completed through remote index %d", replicationType.SingularNoun(), index)
-				failedAttempts = 0
-			}
-		}
-	}()
-
-	s.logger.Printf("[INFO] acl: started ACL %s replication", replicationType.SingularNoun())
+// This function is only intended to be run as a managed go routine, it will block until
+// the context passed in indicates that it should exit.
+func (s *Server) runACLPolicyReplicator(ctx context.Context) error {
+	s.logger.Printf("[INFO] acl: started ACL Policy replication")
+	return s.runACLReplicator(ctx, structs.ACLReplicatePolicies, s.replicateACLPolicies)
+}
+
+// This function is only intended to be run as a managed go routine, it will block until
+// the context passed in indicates that it should exit.
+func (s *Server) runACLRoleReplicator(ctx context.Context) error {
+	s.logger.Printf("[INFO] acl: started ACL Role replication")
+	return s.runACLReplicator(ctx, structs.ACLReplicateRoles, s.replicateACLRoles)
+}
+
+// This function is only intended to be run as a managed go routine, it will block until
+// the context passed in indicates that it should exit.
+func (s *Server) runACLTokenReplicator(ctx context.Context) error {
+	return s.runACLReplicator(ctx, structs.ACLReplicateTokens, s.replicateACLTokens)
+}
+
+// This function is only intended to be run as a managed go routine, it will block until
+// the context passed in indicates that it should exit.
+func (s *Server) runACLReplicator(ctx context.Context, replicationType structs.ACLReplicationType, replicateFunc replicateFunc) error {
+	var failedAttempts uint
+	limiter := rate.NewLimiter(rate.Limit(s.config.ACLReplicationRate), s.config.ACLReplicationBurst)
+
+	var lastRemoteIndex uint64
+	for {
+		if err := limiter.Wait(ctx); err != nil {
+			return err
+		}
+
+		if s.tokens.ReplicationToken() == "" {
+			continue
+		}
+
+		index, exit, err := replicateFunc(ctx, lastRemoteIndex)
+		if exit {
+			return nil
+		}
+
+		if err != nil {
+			lastRemoteIndex = 0
+			s.updateACLReplicationStatusError()
+			s.logger.Printf("[WARN] consul: ACL %s replication error (will retry if still leader): %v", replicationType.SingularNoun(), err)
+			if (1 << failedAttempts) < aclReplicationMaxRetryBackoff {
+				failedAttempts++
+			}
+
+			select {
+			case <-ctx.Done():
+				return nil
+			case <-time.After((1 << failedAttempts) * time.Second):
+				// do nothing
+			}
+		} else {
+			lastRemoteIndex = index
+			s.updateACLReplicationStatusIndex(replicationType, index)
+			s.logger.Printf("[DEBUG] consul: ACL %s replication completed through remote index %d", replicationType.SingularNoun(), index)
+			failedAttempts = 0
+		}
+	}
 }
 
 func (s *Server) stopACLReplication() {
-	s.aclReplicationLock.Lock()
-	defer s.aclReplicationLock.Unlock()
-
-	if !s.aclReplicationEnabled {
-		return
-	}
-
-	s.aclReplicationCancel()
-	s.aclReplicationCancel = nil
-	s.updateACLReplicationStatusStopped()
-	s.aclReplicationEnabled = false
+	// these will be no-ops when not started
+	s.leaderRoutineManager.Stop(legacyACLReplicationRoutineName)
+	s.leaderRoutineManager.Stop(aclPolicyReplicationRoutineName)
+	s.leaderRoutineManager.Stop(aclRoleReplicationRoutineName)
+	s.leaderRoutineManager.Stop(aclTokenReplicationRoutineName)
 }
 
 func (s *Server) startConfigReplication() {
@@ -890,12 +895,12 @@ func (s *Server) startConfigReplication() {
 		return
 	}
 
-	s.configReplicator.Start()
+	s.leaderRoutineManager.Start(configReplicationRoutineName, s.configReplicator.Run)
 }
 
 func (s *Server) stopConfigReplication() {
 	// will be a no-op when not started
-	s.configReplicator.Stop()
+	s.leaderRoutineManager.Stop(configReplicationRoutineName)
 }
 
 // getOrCreateAutopilotConfig is used to get the autopilot config, initializing it if necessary
@@ -439,52 +439,30 @@ func (s *Server) generateCASignRequest(csr string) *structs.CASignRequest {
 
 // startConnectLeader starts multi-dc connect leader routines.
 func (s *Server) startConnectLeader() {
-	s.connectLock.Lock()
-	defer s.connectLock.Unlock()
-
-	if s.connectEnabled {
-		return
-	}
-
-	s.connectCh = make(chan struct{})
-
 	// Start the Connect secondary DC actions if enabled.
 	if s.config.ConnectEnabled && s.config.Datacenter != s.config.PrimaryDatacenter {
-		go s.secondaryCARootWatch(s.connectCh)
-		go s.replicateIntentions(s.connectCh)
+		s.leaderRoutineManager.Start(secondaryCARootWatchRoutineName, s.secondaryCARootWatch)
+		s.leaderRoutineManager.Start(intentionReplicationRoutineName, s.replicateIntentions)
 	}
 
-	go s.runCARootPruning(s.connectCh)
-
-	s.connectEnabled = true
+	s.leaderRoutineManager.Start(caRootPruningRoutineName, s.runCARootPruning)
 }
 
 // stopConnectLeader stops connect specific leader functions.
 func (s *Server) stopConnectLeader() {
-	s.connectLock.Lock()
-	defer s.connectLock.Unlock()
-
-	if !s.connectEnabled {
-		return
-	}
-
-	s.actingSecondaryLock.Lock()
-	s.actingSecondaryCA = false
-	s.actingSecondaryLock.Unlock()
-
-	close(s.connectCh)
-	s.connectEnabled = false
+	s.leaderRoutineManager.Stop(secondaryCARootWatchRoutineName)
+	s.leaderRoutineManager.Stop(intentionReplicationRoutineName)
+	s.leaderRoutineManager.Stop(caRootPruningRoutineName)
 }
 
-func (s *Server) runCARootPruning(stopCh <-chan struct{}) {
+func (s *Server) runCARootPruning(ctx context.Context) error {
 	ticker := time.NewTicker(caRootPruneInterval)
 	defer ticker.Stop()
 
 	for {
 		select {
-		case <-stopCh:
-			return
+		case <-ctx.Done():
+			return nil
 		case <-ticker.C:
 			if err := s.pruneCARoots(); err != nil {
 				s.logger.Printf("[ERR] connect: error pruning CA roots: %v", err)
@@ -549,7 +527,7 @@ func (s *Server) pruneCARoots() error {
 // secondaryCARootWatch maintains a blocking query to the primary datacenter's
 // ConnectCA.Roots endpoint to monitor when it needs to request a new signed
 // intermediate certificate.
-func (s *Server) secondaryCARootWatch(stopCh <-chan struct{}) {
+func (s *Server) secondaryCARootWatch(ctx context.Context) error {
 	args := structs.DCSpecificRequest{
 		Datacenter: s.config.PrimaryDatacenter,
 		QueryOptions: structs.QueryOptions{
@@ -559,7 +537,7 @@ func (s *Server) secondaryCARootWatch(stopCh <-chan struct{}) {
 
 	s.logger.Printf("[DEBUG] connect: starting Connect CA root replication from primary datacenter %q", s.config.PrimaryDatacenter)
 
-	retryLoopBackoff(stopCh, func() error {
+	retryLoopBackoff(ctx.Done(), func() error {
 		var roots structs.IndexedCARoots
 		if err := s.forwardDC("ConnectCA.Roots", s.config.PrimaryDatacenter, &args, &roots); err != nil {
 			return fmt.Errorf("Error retrieving the primary datacenter's roots: %v", err)
@@ -598,18 +576,20 @@ func (s *Server) secondaryCARootWatch(stopCh <-chan struct{}) {
 	}, func(err error) {
 		s.logger.Printf("[ERR] connect: %v", err)
 	})
+
+	return nil
 }
 
 // replicateIntentions executes a blocking query to the primary datacenter to replicate
 // the intentions there to the local state.
-func (s *Server) replicateIntentions(stopCh <-chan struct{}) {
+func (s *Server) replicateIntentions(ctx context.Context) error {
 	args := structs.DCSpecificRequest{
 		Datacenter: s.config.PrimaryDatacenter,
 	}
 
 	s.logger.Printf("[DEBUG] connect: starting Connect intention replication from primary datacenter %q", s.config.PrimaryDatacenter)
 
-	retryLoopBackoff(stopCh, func() error {
+	retryLoopBackoff(ctx.Done(), func() error {
 		// Always use the latest replication token value in case it changed while looping.
 		args.QueryOptions.Token = s.tokens.ReplicationToken()
 
@@ -653,6 +633,7 @@ func (s *Server) replicateIntentions(stopCh <-chan struct{}) {
 	}, func(err error) {
 		s.logger.Printf("[ERR] connect: error replicating intentions: %v", err)
 	})
+	return nil
 }
 
 // retryLoopBackoff loops a given function indefinitely, backing off exponentially
@@ -0,0 +1,120 @@
+package consul
+
+import (
+	"context"
+	"log"
+	"os"
+	"sync"
+)
+
+type LeaderRoutine func(ctx context.Context) error
+
+type leaderRoutine struct {
+	running bool
+	cancel  context.CancelFunc
+}
+
+type LeaderRoutineManager struct {
+	lock   sync.RWMutex
+	logger *log.Logger
+
+	routines map[string]*leaderRoutine
+}
+
+func NewLeaderRoutineManager(logger *log.Logger) *LeaderRoutineManager {
+	if logger == nil {
+		logger = log.New(os.Stderr, "", log.LstdFlags)
+	}
+
+	return &LeaderRoutineManager{
+		logger:   logger,
+		routines: make(map[string]*leaderRoutine),
+	}
+}
+
+func (m *LeaderRoutineManager) IsRunning(name string) bool {
+	m.lock.Lock()
+	defer m.lock.Unlock()
+
+	if routine, ok := m.routines[name]; ok {
+		return routine.running
+	}
+
+	return false
+}
+
+func (m *LeaderRoutineManager) Start(name string, routine LeaderRoutine) error {
+	return m.StartWithContext(nil, name, routine)
+}
+
+func (m *LeaderRoutineManager) StartWithContext(parentCtx context.Context, name string, routine LeaderRoutine) error {
+	m.lock.Lock()
+	defer m.lock.Unlock()
+
+	if instance, ok := m.routines[name]; ok && instance.running {
+		return nil
+	}
+
+	if parentCtx == nil {
+		parentCtx = context.Background()
+	}
+
+	ctx, cancel := context.WithCancel(parentCtx)
+	instance := &leaderRoutine{
+		running: true,
+		cancel:  cancel,
+	}
+
+	go func() {
+		err := routine(ctx)
+		if err != nil && err != context.DeadlineExceeded && err != context.Canceled {
+			m.logger.Printf("[ERROR] leader: %s routine exited with error: %v", name, err)
+		} else {
+			m.logger.Printf("[DEBUG] leader: stopped %s routine", name)
+		}
+
+		m.lock.Lock()
+		instance.running = false
+		m.lock.Unlock()
+	}()
+
+	m.routines[name] = instance
+	m.logger.Printf("[INFO] leader: started %s routine", name)
+	return nil
+}
+
+func (m *LeaderRoutineManager) Stop(name string) error {
+	m.lock.Lock()
+	defer m.lock.Unlock()
+
+	instance, ok := m.routines[name]
+	if !ok {
+		// no running instance
+		return nil
+	}
+
+	if !instance.running {
+		return nil
+	}
+
+	m.logger.Printf("[DEBUG] leader: stopping %s routine", name)
+	instance.cancel()
+	delete(m.routines, name)
+	return nil
+}
+
+func (m *LeaderRoutineManager) StopAll() {
+	m.lock.Lock()
+	defer m.lock.Unlock()
+
+	for name, routine := range m.routines {
+		if !routine.running {
+			continue
+		}
+		m.logger.Printf("[DEBUG] leader: stopping %s routine", name)
+		routine.cancel()
+	}
+
+	// just whipe out the entire map
+	m.routines = make(map[string]*leaderRoutine)
+}
@@ -0,0 +1,73 @@
+package consul
+
+import (
+	"context"
+	"sync/atomic"
+	"testing"
+
+	"github.com/hashicorp/consul/sdk/testutil"
+	"github.com/hashicorp/consul/sdk/testutil/retry"
+	"github.com/stretchr/testify/require"
+)
+
+func TestLeaderRoutineManager(t *testing.T) {
+	t.Parallel()
+	var runs uint32
+	var running uint32
+	// tlog := testutil.NewCancellableTestLogger(t)
+	// defer tlog.Cancel()
+	mgr := NewLeaderRoutineManager(testutil.TestLogger(t))
+
+	run := func(ctx context.Context) error {
+		atomic.StoreUint32(&running, 1)
+		defer atomic.StoreUint32(&running, 0)
+		atomic.AddUint32(&runs, 1)
+		<-ctx.Done()
+		return nil
+	}
+
+	// IsRunning on unregistered service should be false
+	require.False(t, mgr.IsRunning("not-found"))
+
+	// start
+	require.NoError(t, mgr.Start("run", run))
+	require.True(t, mgr.IsRunning("run"))
+	retry.Run(t, func(r *retry.R) {
+		require.Equal(r, uint32(1), atomic.LoadUint32(&runs))
+		require.Equal(r, uint32(1), atomic.LoadUint32(&running))
+	})
+	require.NoError(t, mgr.Stop("run"))
+
+	// ensure the background go routine was actually cancelled
+	retry.Run(t, func(r *retry.R) {
+		require.Equal(r, uint32(1), atomic.LoadUint32(&runs))
+		require.Equal(r, uint32(0), atomic.LoadUint32(&running))
+	})
+
+	// restart and stop
+	require.NoError(t, mgr.Start("run", run))
+	retry.Run(t, func(r *retry.R) {
+		require.Equal(r, uint32(2), atomic.LoadUint32(&runs))
+		require.Equal(r, uint32(1), atomic.LoadUint32(&running))
+	})
+
+	require.NoError(t, mgr.Stop("run"))
+	retry.Run(t, func(r *retry.R) {
+		require.Equal(r, uint32(0), atomic.LoadUint32(&running))
+	})
+
+	// start with a context
+	ctx, cancel := context.WithCancel(context.Background())
+	require.NoError(t, mgr.StartWithContext(ctx, "run", run))
+	cancel()
+
+	// The function should exit of its own accord due to the parent
+	// context being canceled
+	retry.Run(t, func(r *retry.R) {
+		require.Equal(r, uint32(3), atomic.LoadUint32(&runs))
+		require.Equal(r, uint32(0), atomic.LoadUint32(&running))
+		// the task should automatically set itself to not running if
+		// it exits early
+		require.False(r, mgr.IsRunning("run"))
+	})
+}
@@ -5,7 +5,7 @@ import (
 	"fmt"
 	"log"
 	"os"
-	"sync"
+	"sync/atomic"
 	"time"
 
 	"github.com/hashicorp/consul/lib"
@@ -40,15 +40,12 @@ type ReplicatorConfig struct {
 type ReplicatorFunc func(ctx context.Context, lastRemoteIndex uint64) (index uint64, exit bool, err error)
 
 type Replicator struct {
-	name      string
-	lock      sync.RWMutex
-	running   bool
-	cancel    context.CancelFunc
-	ctx       context.Context
-	limiter   *rate.Limiter
-	waiter    *lib.RetryWaiter
-	replicate ReplicatorFunc
-	logger    *log.Logger
+	name            string
+	limiter         *rate.Limiter
+	waiter          *lib.RetryWaiter
+	replicateFn     ReplicatorFunc
+	logger          *log.Logger
+	lastRemoteIndex uint64
 }
 
 func NewReplicator(config *ReplicatorConfig) (*Replicator, error) {
@@ -74,64 +71,45 @@ func NewReplicator(config *ReplicatorConfig) (*Replicator, error) {
 	}
 	waiter := lib.NewRetryWaiter(minFailures, 0*time.Second, maxWait, lib.NewJitterRandomStagger(10))
 	return &Replicator{
-		name:      config.Name,
-		running:   false,
-		limiter:   limiter,
-		waiter:    waiter,
-		replicate: config.ReplicateFn,
-		logger:    config.Logger,
+		name:        config.Name,
+		limiter:     limiter,
+		waiter:      waiter,
+		replicateFn: config.ReplicateFn,
+		logger:      config.Logger,
 	}, nil
 }
 
-func (r *Replicator) Start() {
-	r.lock.Lock()
-	defer r.lock.Unlock()
-
-	if r.running {
-		return
-	}
-
-	r.ctx, r.cancel = context.WithCancel(context.Background())
-
-	go r.run()
-
-	r.running = true
-	r.logger.Printf("[INFO] replication: started %s replication", r.name)
-}
-
-func (r *Replicator) run() {
-	var lastRemoteIndex uint64
-
+func (r *Replicator) Run(ctx context.Context) error {
 	defer r.logger.Printf("[INFO] replication: stopped %s replication", r.name)
 
 	for {
 		// This ensures we aren't doing too many successful replication rounds - mostly useful when
 		// the data within the primary datacenter is changing rapidly but we try to limit the amount
 		// of resources replication into the secondary datacenter should take
-		if err := r.limiter.Wait(r.ctx); err != nil {
-			return
+		if err := r.limiter.Wait(ctx); err != nil {
+			return nil
 		}
 
 		// Perform a single round of replication
-		index, exit, err := r.replicate(r.ctx, lastRemoteIndex)
+		index, exit, err := r.replicateFn(ctx, atomic.LoadUint64(&r.lastRemoteIndex))
 		if exit {
 			// the replication function told us to exit
-			return
+			return nil
 		}
 
 		if err != nil {
 			// reset the lastRemoteIndex when there is an RPC failure. This should cause a full sync to be done during
 			// the next round of replication
-			lastRemoteIndex = 0
+			atomic.StoreUint64(&r.lastRemoteIndex, 0)
 			r.logger.Printf("[WARN] replication: %s replication error (will retry if still leader): %v", r.name, err)
 		} else {
-			lastRemoteIndex = index
+			atomic.StoreUint64(&r.lastRemoteIndex, index)
 			r.logger.Printf("[DEBUG] replication: %s replication completed through remote index %d", r.name, index)
 		}
 
 		select {
-		case <-r.ctx.Done():
-			return
+		case <-ctx.Done():
+			return nil
 		// wait some amount of time to prevent churning through many replication rounds while replication is failing
 		case <-r.waiter.WaitIfErr(err):
 			// do nothing
@@ -139,16 +117,6 @@ func (r *Replicator) run() {
 	}
 }
 
-func (r *Replicator) Stop() {
-	r.lock.Lock()
-	defer r.lock.Unlock()
-
-	if !r.running {
-		return
-	}
-
-	r.logger.Printf("[DEBUG] replication: stopping %s replication", r.name)
-	r.cancel()
-	r.cancel = nil
-	r.running = false
+func (r *Replicator) Index() uint64 {
+	return atomic.LoadUint64(&r.lastRemoteIndex)
 }
@@ -4,15 +4,19 @@ import (
 	"context"
 	"testing"
 
+	"github.com/hashicorp/consul/sdk/testutil"
 	"github.com/stretchr/testify/require"
 )
 
 func TestReplicationRestart(t *testing.T) {
+	mgr := NewLeaderRoutineManager(testutil.TestLogger(t))
+
 	config := ReplicatorConfig{
 		Name: "mock",
 		ReplicateFn: func(ctx context.Context, lastRemoteIndex uint64) (uint64, bool, error) {
 			return 1, false, nil
 		},
 
 		Rate:  1,
 		Burst: 1,
 	}
@@ -20,9 +24,9 @@ func TestReplicationRestart(t *testing.T) {
 	repl, err := NewReplicator(&config)
 	require.NoError(t, err)
 
-	repl.Start()
-	repl.Stop()
-	repl.Start()
+	mgr.Start("mock", repl.Run)
+	mgr.Stop("mock")
+	mgr.Start("mock", repl.Run)
 	// Previously this would have segfaulted
-	repl.Stop()
+	mgr.Stop("mock")
 }
@@ -1,7 +1,6 @@
 package consul
 
 import (
-	"context"
 	"errors"
 	"fmt"
 	"io"
@@ -88,6 +87,19 @@ const (
 	reconcileChSize = 256
 )
 
+const (
+	legacyACLReplicationRoutineName = "legacy ACL replication"
+	aclPolicyReplicationRoutineName = "ACL policy replication"
+	aclRoleReplicationRoutineName   = "ACL role replication"
+	aclTokenReplicationRoutineName  = "ACL token replication"
+	aclTokenReapingRoutineName      = "acl token reaping"
+	aclUpgradeRoutineName           = "legacy ACL token upgrade"
+	caRootPruningRoutineName        = "CA root pruning"
+	configReplicationRoutineName    = "config entry replication"
+	intentionReplicationRoutineName = "intention replication"
+	secondaryCARootWatchRoutineName = "secondary CA roots watch"
+)
+
 var (
 	ErrWANFederationDisabled = fmt.Errorf("WAN Federation is disabled")
 )
@@ -101,24 +113,6 @@ type Server struct {
 	// acls is used to resolve tokens to effective policies
 	acls *ACLResolver
 
-	// aclUpgradeCancel is used to cancel the ACL upgrade goroutine when we
-	// lose leadership
-	aclUpgradeCancel  context.CancelFunc
-	aclUpgradeLock    sync.RWMutex
-	aclUpgradeEnabled bool
-
-	// aclReplicationCancel is used to shut down the ACL replication goroutine
-	// when we lose leadership
-	aclReplicationCancel  context.CancelFunc
-	aclReplicationLock    sync.RWMutex
-	aclReplicationEnabled bool
-
-	// aclTokenReapCancel is used to shut down the ACL Token expiration reap
-	// goroutine when we lose leadership.
-	aclTokenReapCancel  context.CancelFunc
-	aclTokenReapLock    sync.RWMutex
-	aclTokenReapEnabled bool
-
 	aclAuthMethodValidators    map[string]*authMethodValidatorEntry
 	aclAuthMethodValidatorLock sync.RWMutex
 
|
||||||
shutdownCh chan struct{}
|
shutdownCh chan struct{}
|
||||||
shutdownLock sync.Mutex
|
shutdownLock sync.Mutex
|
||||||
|
|
||||||
// State for multi-dc connect leader logic
|
|
||||||
connectLock sync.RWMutex
|
|
||||||
connectEnabled bool
|
|
||||||
connectCh chan struct{}
|
|
||||||
|
|
||||||
// State for whether this datacenter is acting as a secondary CA.
|
// State for whether this datacenter is acting as a secondary CA.
|
||||||
actingSecondaryCA bool
|
actingSecondaryCA bool
|
||||||
actingSecondaryLock sync.RWMutex
|
actingSecondaryLock sync.RWMutex
|
||||||
|
|
||||||
|
// Manager to handle starting/stopping go routines when establishing/revoking raft leadership
|
||||||
|
leaderRoutineManager *LeaderRoutineManager
|
||||||
|
|
||||||
// embedded struct to hold all the enterprise specific data
|
// embedded struct to hold all the enterprise specific data
|
||||||
EnterpriseServer
|
EnterpriseServer
|
||||||
}
|
}
|
||||||
|
@@ -354,24 +346,25 @@ func NewServerLogger(config *Config, logger *log.Logger, tokens *token.Store, tl
 
 	// Create server.
 	s := &Server{
 		config:               config,
 		tokens:               tokens,
 		connPool:             connPool,
 		eventChLAN:           make(chan serf.Event, serfEventChSize),
 		eventChWAN:           make(chan serf.Event, serfEventChSize),
 		logger:               logger,
 		leaveCh:              make(chan struct{}),
 		reconcileCh:          make(chan serf.Member, reconcileChSize),
 		router:               router.NewRouter(logger, config.Datacenter),
 		rpcServer:            rpc.NewServer(),
 		insecureRPCServer:    rpc.NewServer(),
 		tlsConfigurator:      tlsConfigurator,
 		reassertLeaderCh:     make(chan chan error),
 		segmentLAN:           make(map[string]*serf.Serf, len(config.Segments)),
 		sessionTimers:        NewSessionTimers(),
 		tombstoneGC:          gc,
 		serverLookup:         NewServerLookup(),
 		shutdownCh:           shutdownCh,
+		leaderRoutineManager: NewLeaderRoutineManager(logger),
 	}
 
 	// Initialize enterprise specific server functionality
@@ -812,6 +805,11 @@ func (s *Server) Shutdown() error {
 	s.shutdown = true
 	close(s.shutdownCh)
 
+	// ensure that any leader routines still running get canceled
+	if s.leaderRoutineManager != nil {
+		s.leaderRoutineManager.StopAll()
+	}
+
 	if s.serfLAN != nil {
 		s.serfLAN.Shutdown()
 	}