From fea32fb7d7ab3ba8cb33e8226b7caf2d15778aa1 Mon Sep 17 00:00:00 2001 From: James Bardin Date: Wed, 11 Oct 2017 10:42:10 -0400 Subject: [PATCH] retry locks on network errors (#3553) * retry locks on network errors When communicating with a local agent and watching a lock, a dropped connection between the agent and server will show up as a server error and immediately be retried. However if the client is connected to a remote server, a dropped connection immediately aborts the lock. * Updates comment about it being unsafe for writes. --- api/api.go | 13 ++++++++++--- api/api_test.go | 19 ++++++++++++------- api/lock.go | 2 +- api/semaphore.go | 2 +- 4 files changed, 24 insertions(+), 12 deletions(-) diff --git a/api/api.go b/api/api.go index 3c28afe289..87cd3c5a37 100644 --- a/api/api.go +++ b/api/api.go @@ -555,13 +555,20 @@ func durToMsec(dur time.Duration) string { // serverError is a string we look for to detect 500 errors. const serverError = "Unexpected response code: 500" -// IsServerError returns true for 500 errors from the Consul servers, these are -// usually retryable at a later time. -func IsServerError(err error) bool { +// IsRetryableError returns true for 500 errors from the Consul servers, and +// network connection errors. These are usually retryable at a later time. +// This applies to reads but NOT to writes. This may return true for errors +// on writes that may have still gone through, so do not use this to retry +// any write operations. +func IsRetryableError(err error) bool { if err == nil { return false } + if _, ok := err.(net.Error); ok { + return true + } + // TODO (slackpad) - Make a real error type here instead of using // a string check. return strings.Contains(err.Error(), serverError) diff --git a/api/api_test.go b/api/api_test.go index ba3a448ab1..f06bc1b304 100644 --- a/api/api_test.go +++ b/api/api_test.go @@ -4,6 +4,7 @@ import ( crand "crypto/rand" "crypto/tls" "fmt" + "net" "net/http" "os" "path/filepath" @@ -529,17 +530,21 @@ func TestAPI_durToMsec(t *testing.T) { } } -func TestAPI_IsServerError(t *testing.T) { +func TestAPI_IsRetryableError(t *testing.T) { t.Parallel() - if IsServerError(nil) { - t.Fatalf("should not be a server error") + if IsRetryableError(nil) { + t.Fatal("should not be a retryable error") } - if IsServerError(fmt.Errorf("not the error you are looking for")) { - t.Fatalf("should not be a server error") + if IsRetryableError(fmt.Errorf("not the error you are looking for")) { + t.Fatal("should not be a retryable error") } - if !IsServerError(fmt.Errorf(serverError)) { - t.Fatalf("should be a server error") + if !IsRetryableError(fmt.Errorf(serverError)) { + t.Fatal("should be a retryable error") + } + + if !IsRetryableError(&net.OpError{Err: fmt.Errorf("network conn error")}) { + t.Fatal("should be a retryable error") } } diff --git a/api/lock.go b/api/lock.go index 466ef5fdf1..d3187113f7 100644 --- a/api/lock.go +++ b/api/lock.go @@ -370,7 +370,7 @@ RETRY: // by doing retries. Note that we have to attempt the retry in a non- // blocking fashion so that we have a clean place to reset the retry // counter if service is restored. - if retries > 0 && IsServerError(err) { + if retries > 0 && IsRetryableError(err) { time.Sleep(l.opts.MonitorRetryTime) retries-- opts.WaitIndex = 0 diff --git a/api/semaphore.go b/api/semaphore.go index 9ddbdc49e7..d848dfd0b1 100644 --- a/api/semaphore.go +++ b/api/semaphore.go @@ -492,7 +492,7 @@ RETRY: // by doing retries. Note that we have to attempt the retry in a non- // blocking fashion so that we have a clean place to reset the retry // counter if service is restored. - if retries > 0 && IsServerError(err) { + if retries > 0 && IsRetryableError(err) { time.Sleep(s.opts.MonitorRetryTime) retries-- opts.WaitIndex = 0