Fix issue caused by the sole server being marked as failed under load

If health checks are failing for all servers, make a second pass through the server list with health checks ignored before returning failure.

Signed-off-by: Brad Davidson <brad.davidson@rancher.com>
(cherry picked from commit ca39614d4e)
Signed-off-by: Brad Davidson <brad.davidson@rancher.com>
pull/10290/head
Brad Davidson 2024-05-29 18:17:29 +00:00 committed by Brad Davidson
parent da89ab5052
commit 19692f5d17
3 changed files with 16 additions and 5 deletions

View File

@ -158,6 +158,7 @@ func (lb *LoadBalancer) dialContext(ctx context.Context, network, _ string) (net
lb.mutex.RLock() lb.mutex.RLock()
defer lb.mutex.RUnlock() defer lb.mutex.RUnlock()
var allChecksFailed bool
startIndex := lb.nextServerIndex startIndex := lb.nextServerIndex
for { for {
targetServer := lb.currentServerAddress targetServer := lb.currentServerAddress
@ -165,7 +166,7 @@ func (lb *LoadBalancer) dialContext(ctx context.Context, network, _ string) (net
server := lb.servers[targetServer] server := lb.servers[targetServer]
if server == nil || targetServer == "" { if server == nil || targetServer == "" {
logrus.Debugf("Nil server for load balancer %s: %s", lb.serviceName, targetServer) logrus.Debugf("Nil server for load balancer %s: %s", lb.serviceName, targetServer)
} else if server.healthCheck() { } else if allChecksFailed || server.healthCheck() {
conn, err := server.dialContext(ctx, network, targetServer) conn, err := server.dialContext(ctx, network, targetServer)
if err == nil { if err == nil {
return conn, nil return conn, nil
@ -189,7 +190,11 @@ func (lb *LoadBalancer) dialContext(ctx context.Context, network, _ string) (net
startIndex = maxIndex startIndex = maxIndex
} }
if lb.nextServerIndex == startIndex { if lb.nextServerIndex == startIndex {
return nil, errors.New("all servers failed") if allChecksFailed {
return nil, errors.New("all servers failed")
}
logrus.Debugf("Health checks for all servers in load balancer %s have failed: retrying with health checks ignored", lb.serviceName)
allChecksFailed = true
} }
} }
} }

View File

@ -227,13 +227,19 @@ func (lb *LoadBalancer) SetHealthCheck(address string, healthCheck func() bool)
// runHealthChecks periodically health-checks all servers. Any servers that fail the health-check will have their // runHealthChecks periodically health-checks all servers. Any servers that fail the health-check will have their
// connections closed, to force clients to switch over to a healthy server. // connections closed, to force clients to switch over to a healthy server.
func (lb *LoadBalancer) runHealthChecks(ctx context.Context) { func (lb *LoadBalancer) runHealthChecks(ctx context.Context) {
previousStatus := map[string]bool{}
wait.Until(func() { wait.Until(func() {
lb.mutex.RLock() lb.mutex.RLock()
defer lb.mutex.RUnlock() defer lb.mutex.RUnlock()
for _, server := range lb.servers { for address, server := range lb.servers {
if !server.healthCheck() { status := server.healthCheck()
if status == false && previousStatus[address] == true {
// Only close connections when the server transitions from healthy to unhealthy;
// we don't want to re-close all the connections every time as we might be ignoring
// health checks due to all servers being marked unhealthy.
defer server.closeAll() defer server.closeAll()
} }
previousStatus[address] = status
} }
}, time.Second, ctx.Done()) }, time.Second, ctx.Done())
logrus.Debugf("Stopped health checking for load balancer %s", lb.serviceName) logrus.Debugf("Stopped health checking for load balancer %s", lb.serviceName)

View File

@ -130,7 +130,7 @@ func (e etcdproxy) createHealthCheck(ctx context.Context, address string) func()
statusCode = resp.StatusCode statusCode = resp.StatusCode
} }
if err != nil || statusCode != http.StatusOK { if err != nil || statusCode != http.StatusOK {
logrus.Debugf("Health check %s failed: %v (StatusCode: %d)", url, err, statusCode) logrus.Debugf("Health check %s failed: %v (StatusCode: %d)", address, err, statusCode)
connected = false connected = false
} else { } else {
connected = true connected = true