mirror of https://github.com/k3s-io/k3s
Fix issue caused by sole server marked as failed under load
If health checks are failing for all servers, make a second pass through the server list with health-checks ignored before returning failure
Signed-off-by: Brad Davidson <brad.davidson@rancher.com>
(cherry picked from commit ca39614d4e)
Signed-off-by: Brad Davidson <brad.davidson@rancher.com>
pull/10288/head
parent 2e7b394713
commit 8262c02cdd
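For context, a minimal standalone sketch of the retry behavior the commit message describes: walk the server list once honoring health checks, and if that pass yields nothing usable, walk it again with health checks ignored before reporting failure. The server type, its field names, and the dial function below are illustrative only, not the k3s implementation.

package main

import (
	"errors"
	"fmt"
)

// server is a hypothetical stand-in for a load balancer backend.
type server struct {
	address string
	healthy bool // result of the most recent health check
	up      bool // whether a dial would actually succeed
}

// dial walks the server list up to twice: first honoring health checks,
// then (if every server was skipped or failed) ignoring them, so a sole
// server wrongly marked unhealthy can still be used.
func dial(servers []server) (string, error) {
	allChecksFailed := false
	for {
		for _, s := range servers {
			if !allChecksFailed && !s.healthy {
				continue // skip unhealthy servers on the first pass
			}
			if s.up {
				return s.address, nil
			}
		}
		if allChecksFailed {
			// Second pass also failed; give up.
			return "", errors.New("all servers failed")
		}
		// First pass found nothing usable; retry with health checks ignored.
		allChecksFailed = true
	}
}

func main() {
	// The only server is reachable, but its health check is failing,
	// e.g. because it is slow to answer under load.
	servers := []server{{address: "10.0.0.1:6443", healthy: false, up: true}}
	addr, err := dial(servers)
	fmt.Println(addr, err) // 10.0.0.1:6443 <nil>
}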
@@ -158,6 +158,7 @@ func (lb *LoadBalancer) dialContext(ctx context.Context, network, _ string) (net
 	lb.mutex.RLock()
 	defer lb.mutex.RUnlock()
 
+	var allChecksFailed bool
 	startIndex := lb.nextServerIndex
 	for {
 		targetServer := lb.currentServerAddress
@@ -165,7 +166,7 @@ func (lb *LoadBalancer) dialContext(ctx context.Context, network, _ string) (net
 		server := lb.servers[targetServer]
 		if server == nil || targetServer == "" {
 			logrus.Debugf("Nil server for load balancer %s: %s", lb.serviceName, targetServer)
-		} else if server.healthCheck() {
+		} else if allChecksFailed || server.healthCheck() {
 			conn, err := server.dialContext(ctx, network, targetServer)
 			if err == nil {
 				return conn, nil
@@ -189,7 +190,11 @@ func (lb *LoadBalancer) dialContext(ctx context.Context, network, _ string) (net
 			startIndex = maxIndex
 		}
 		if lb.nextServerIndex == startIndex {
-			return nil, errors.New("all servers failed")
+			if allChecksFailed {
+				return nil, errors.New("all servers failed")
+			}
+			logrus.Debugf("Health checks for all servers in load balancer %s have failed: retrying with health checks ignored", lb.serviceName)
+			allChecksFailed = true
 		}
 	}
 }
@@ -227,13 +227,19 @@ func (lb *LoadBalancer) SetHealthCheck(address string, healthCheck func() bool)
 // runHealthChecks periodically health-checks all servers. Any servers that fail the health-check will have their
 // connections closed, to force clients to switch over to a healthy server.
 func (lb *LoadBalancer) runHealthChecks(ctx context.Context) {
+	previousStatus := map[string]bool{}
 	wait.Until(func() {
 		lb.mutex.RLock()
 		defer lb.mutex.RUnlock()
-		for _, server := range lb.servers {
-			if !server.healthCheck() {
+		for address, server := range lb.servers {
+			status := server.healthCheck()
+			if status == false && previousStatus[address] == true {
+				// Only close connections when the server transitions from healthy to unhealthy;
+				// we don't want to re-close all the connections every time as we might be ignoring
+				// health checks due to all servers being marked unhealthy.
 				defer server.closeAll()
 			}
+			previousStatus[address] = status
 		}
 	}, time.Second, ctx.Done())
 	logrus.Debugf("Stopped health checking for load balancer %s", lb.serviceName)
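As a side note on the runHealthChecks change above, a small self-contained sketch of the transition-based bookkeeping: connections are closed only when a server goes from healthy to unhealthy, so repeated failing checks (for example while health checks are being ignored) do not tear connections down again on every pass. The backend type and checkOnce helper here are illustrative, not the k3s API.

package main

import "fmt"

// backend is an illustrative stand-in for a load-balanced server.
type backend struct {
	address string
	healthy bool
}

// closeAll is a placeholder for closing all connections to the backend.
func (b *backend) closeAll() {
	fmt.Printf("closing connections to %s\n", b.address)
}

// checkOnce runs one round of health checks, closing connections only for
// backends that transitioned from healthy to unhealthy since the last round.
func checkOnce(backends []*backend, previousStatus map[string]bool) {
	for _, b := range backends {
		status := b.healthy
		if !status && previousStatus[b.address] {
			b.closeAll()
		}
		previousStatus[b.address] = status
	}
}

func main() {
	b := &backend{address: "10.0.0.1:6443", healthy: true}
	previous := map[string]bool{"10.0.0.1:6443": true}

	b.healthy = false
	checkOnce([]*backend{b}, previous) // transition: connections closed once
	checkOnce([]*backend{b}, previous) // still unhealthy: nothing closed again
}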
@@ -130,7 +130,7 @@ func (e etcdproxy) createHealthCheck(ctx context.Context, address string) func()
 			statusCode = resp.StatusCode
 		}
 		if err != nil || statusCode != http.StatusOK {
-			logrus.Debugf("Health check %s failed: %v (StatusCode: %d)", url, err, statusCode)
+			logrus.Debugf("Health check %s failed: %v (StatusCode: %d)", address, err, statusCode)
 			connected = false
 		} else {
 			connected = true