From 91afb38799369c9217ed6c78ed4ddc69e8d7a959 Mon Sep 17 00:00:00 2001 From: Brad Davidson Date: Tue, 25 Apr 2023 22:35:22 +0000 Subject: [PATCH] Retry cluster join on "too many learners" error Signed-off-by: Brad Davidson --- pkg/etcd/etcd.go | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/pkg/etcd/etcd.go b/pkg/etcd/etcd.go index 0942330ea9..5e88bb6f10 100644 --- a/pkg/etcd/etcd.go +++ b/pkg/etcd/etcd.go @@ -419,10 +419,20 @@ func (e *ETCD) Start(ctx context.Context, clientAccessInfo *clientaccess.Info) e for { select { case <-time.After(30 * time.Second): - logrus.Infof("Waiting for agent to become ready before joining ETCD cluster") + logrus.Infof("Waiting for agent to become ready before joining etcd cluster") case <-e.config.Runtime.AgentReady: - if err := e.join(ctx, clientAccessInfo); err != nil { - logrus.Fatalf("ETCD join failed: %v", err) + if err := wait.PollImmediateUntilWithContext(ctx, time.Second, func(ctx context.Context) (bool, error) { + if err := e.join(ctx, clientAccessInfo); err != nil { + // Retry the join if waiting for another member to be promoted, or waiting for peers to connect after promotion + if errors.Is(err, rpctypes.ErrTooManyLearners) || errors.Is(err, rpctypes.ErrGRPCUnhealthy) { + logrus.Infof("Waiting for other members to finish joining etcd cluster") + return false, nil + } + return false, err + } + return true, nil + }); err != nil { + logrus.Fatalf("etcd cluster join failed: %v", err) } return case <-ctx.Done():