From ca39614d4e7cb9963ee002dc798884167f2d2acf Mon Sep 17 00:00:00 2001
From: Brad Davidson
Date: Wed, 29 May 2024 18:17:29 +0000
Subject: [PATCH] Fix issue caused by sole server marked as failed under load

If health checks are failing for all servers, make a second pass through the server list with health-checks ignored before returning failure

Signed-off-by: Brad Davidson
---
 pkg/agent/loadbalancer/loadbalancer.go |  9 +++++++--
 pkg/agent/loadbalancer/servers.go      | 10 ++++++++--
 pkg/etcd/etcdproxy.go                  |  2 +-
 3 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/pkg/agent/loadbalancer/loadbalancer.go b/pkg/agent/loadbalancer/loadbalancer.go
index 2348fcb087..feddb4d872 100644
--- a/pkg/agent/loadbalancer/loadbalancer.go
+++ b/pkg/agent/loadbalancer/loadbalancer.go
@@ -158,6 +158,7 @@ func (lb *LoadBalancer) dialContext(ctx context.Context, network, _ string) (net
 	lb.mutex.RLock()
 	defer lb.mutex.RUnlock()
 
+	var allChecksFailed bool
 	startIndex := lb.nextServerIndex
 	for {
 		targetServer := lb.currentServerAddress
@@ -165,7 +166,7 @@ func (lb *LoadBalancer) dialContext(ctx context.Context, network, _ string) (net
 		server := lb.servers[targetServer]
 		if server == nil || targetServer == "" {
 			logrus.Debugf("Nil server for load balancer %s: %s", lb.serviceName, targetServer)
-		} else if server.healthCheck() {
+		} else if allChecksFailed || server.healthCheck() {
 			conn, err := server.dialContext(ctx, network, targetServer)
 			if err == nil {
 				return conn, nil
@@ -189,7 +190,11 @@ func (lb *LoadBalancer) dialContext(ctx context.Context, network, _ string) (net
 			startIndex = maxIndex
 		}
 		if lb.nextServerIndex == startIndex {
-			return nil, errors.New("all servers failed")
+			if allChecksFailed {
+				return nil, errors.New("all servers failed")
+			}
+			logrus.Debugf("Health checks for all servers in load balancer %s have failed: retrying with health checks ignored", lb.serviceName)
+			allChecksFailed = true
 		}
 	}
 }
diff --git a/pkg/agent/loadbalancer/servers.go b/pkg/agent/loadbalancer/servers.go
index 78ee88d74f..7dc80e4932 100644
--- a/pkg/agent/loadbalancer/servers.go
+++ b/pkg/agent/loadbalancer/servers.go
@@ -227,13 +227,19 @@ func (lb *LoadBalancer) SetHealthCheck(address string, healthCheck func() bool)
 // runHealthChecks periodically health-checks all servers. Any servers that fail the health-check will have their
 // connections closed, to force clients to switch over to a healthy server.
 func (lb *LoadBalancer) runHealthChecks(ctx context.Context) {
+	previousStatus := map[string]bool{}
 	wait.Until(func() {
 		lb.mutex.RLock()
 		defer lb.mutex.RUnlock()
-		for _, server := range lb.servers {
-			if !server.healthCheck() {
+		for address, server := range lb.servers {
+			status := server.healthCheck()
+			if status == false && previousStatus[address] == true {
+				// Only close connections when the server transitions from healthy to unhealthy;
+				// we don't want to re-close all the connections every time as we might be ignoring
+				// health checks due to all servers being marked unhealthy.
 				defer server.closeAll()
 			}
+			previousStatus[address] = status
 		}
 	}, time.Second, ctx.Done())
 	logrus.Debugf("Stopped health checking for load balancer %s", lb.serviceName)
diff --git a/pkg/etcd/etcdproxy.go b/pkg/etcd/etcdproxy.go
index 40bee876b1..55918850b3 100644
--- a/pkg/etcd/etcdproxy.go
+++ b/pkg/etcd/etcdproxy.go
@@ -130,7 +130,7 @@ func (e etcdproxy) createHealthCheck(ctx context.Context, address string) func()
 			statusCode = resp.StatusCode
 		}
 		if err != nil || statusCode != http.StatusOK {
-			logrus.Debugf("Health check %s failed: %v (StatusCode: %d)", url, err, statusCode)
+			logrus.Debugf("Health check %s failed: %v (StatusCode: %d)", address, err, statusCode)
 			connected = false
 		} else {
 			connected = true
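
Note (illustrative, not part of the patch): below is a minimal Go sketch of the two-pass dial pattern the loadbalancer.go change introduces: try only servers whose health check passes, and if every check failed, walk the list again with checks ignored before giving up. The server struct and dialFirstAvailable helper are hypothetical stand-ins for the demo, not the k3s types.

package main

import (
	"errors"
	"fmt"
)

// server is a hypothetical stand-in for the load balancer's per-server state.
type server struct {
	address     string
	healthCheck func() bool
	dial        func() (string, error) // returns a fake "connection" string for the demo
}

// dialFirstAvailable walks the server list at most twice: first honoring
// health checks, then ignoring them, mirroring the allChecksFailed flag
// added to dialContext by this patch.
func dialFirstAvailable(servers []server) (string, error) {
	for _, ignoreChecks := range []bool{false, true} {
		for _, s := range servers {
			if !ignoreChecks && !s.healthCheck() {
				continue // skip unhealthy servers on the first pass only
			}
			if conn, err := s.dial(); err == nil {
				return conn, nil
			}
		}
	}
	return "", errors.New("all servers failed")
}

func main() {
	servers := []server{{
		// The health check reports unhealthy, but the server is still
		// reachable; a sole server in this state could never be dialed
		// before the patch.
		address:     "10.0.0.1:6443",
		healthCheck: func() bool { return false },
		dial:        func() (string, error) { return "conn to 10.0.0.1:6443", nil },
	}}
	conn, err := dialFirstAvailable(servers)
	fmt.Println(conn, err)
}

The same reasoning is behind the runHealthChecks change: connections are now closed only when a server transitions from healthy to unhealthy, because once health checks are being ignored, re-closing connections on every tick would keep tearing down connections to servers that are still being dialed.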