Retry cluster join on "too many learners" error

Signed-off-by: Brad Davidson <brad.davidson@rancher.com>
pull/7371/head
Brad Davidson 2023-04-25 22:35:22 +00:00 committed by Brad Davidson
parent f1b6a3549c
commit 91afb38799
1 changed files with 13 additions and 3 deletions

View File

@ -419,10 +419,20 @@ func (e *ETCD) Start(ctx context.Context, clientAccessInfo *clientaccess.Info) e
for {
select {
case <-time.After(30 * time.Second):
logrus.Infof("Waiting for agent to become ready before joining ETCD cluster")
logrus.Infof("Waiting for agent to become ready before joining etcd cluster")
case <-e.config.Runtime.AgentReady:
if err := e.join(ctx, clientAccessInfo); err != nil {
logrus.Fatalf("ETCD join failed: %v", err)
if err := wait.PollImmediateUntilWithContext(ctx, time.Second, func(ctx context.Context) (bool, error) {
if err := e.join(ctx, clientAccessInfo); err != nil {
// Retry the join if waiting for another member to be promoted, or waiting for peers to connect after promotion
if errors.Is(err, rpctypes.ErrTooManyLearners) || errors.Is(err, rpctypes.ErrGRPCUnhealthy) {
logrus.Infof("Waiting for other members to finish joining etcd cluster")
return false, nil
}
return false, err
}
return true, nil
}); err != nil {
logrus.Fatalf("etcd cluster join failed: %v", err)
}
return
case <-ctx.Done():