Retry cluster join on "too many learners" error

Signed-off-by: Brad Davidson <brad.davidson@rancher.com>
2023-04-25 22:35:22 +00:00 · 2023-04-25 22:35:22 +00:00 · 91afb38799
parent f1b6a3549c
commit 91afb38799
1 changed file with 13 additions and 3 deletions
--- a/pkg/etcd/etcd.go
+++ b/pkg/etcd/etcd.go
@@ -419,10 +419,20 @@ func (e *ETCD) Start(ctx context.Context, clientAccessInfo *clientaccess.Info) e
 		for {
 			select {
 			case <-time.After(30 * time.Second):
-				logrus.Infof("Waiting for agent to become ready before joining ETCD cluster")
+				logrus.Infof("Waiting for agent to become ready before joining etcd cluster")
 			case <-e.config.Runtime.AgentReady:
-				if err := e.join(ctx, clientAccessInfo); err != nil {
-					logrus.Fatalf("ETCD join failed: %v", err)
+				if err := wait.PollImmediateUntilWithContext(ctx, time.Second, func(ctx context.Context) (bool, error) {
+					if err := e.join(ctx, clientAccessInfo); err != nil {
+						// Retry the join if waiting for another member to be promoted, or waiting for peers to connect after promotion
+						if errors.Is(err, rpctypes.ErrTooManyLearners) || errors.Is(err, rpctypes.ErrGRPCUnhealthy) {
+							logrus.Infof("Waiting for other members to finish joining etcd cluster")
+							return false, nil
+						}
+						return false, err
+					}
+					return true, nil
+				}); err != nil {
+					logrus.Fatalf("etcd cluster join failed: %v", err)
 				}
 				return
 			case <-ctx.Done():