RPC Timeout/Retries account for blocking requests (#8978)

pull/10315/head
Dhia Ayachi 4 years ago committed by GitHub
parent b130f355ee
commit f785c5b332
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -0,0 +1,4 @@
```release-note:bug
use the MaxQueryTime instead of RPCHoldTimeout for blocking RPC queries
[[GH-8978](https://github.com/hashicorp/consul/pull/8978)].
```

@ -1665,7 +1665,7 @@ func TestCatalog_ListServices_Stale(t *testing.T) {
defer codec.Close() defer codec.Close()
// Run the query, do not wait for leader, never any contact with leader, should fail // Run the query, do not wait for leader, never any contact with leader, should fail
if err := msgpackrpc.CallWithCodec(codec, "Catalog.ListServices", &args, &out); err == nil || err.Error() != structs.ErrNoLeader.Error() { if err := msgpackrpc.CallWithCodec(codec, "Catalog.ListServices", &args, &out); err.Error() != structs.ErrNoLeader.Error() {
t.Fatalf("expected %v but got err: %v and %v", structs.ErrNoLeader, err, out) t.Fatalf("expected %v but got err: %v and %v", structs.ErrNoLeader, err, out)
} }
@ -1677,6 +1677,7 @@ func TestCatalog_ListServices_Stale(t *testing.T) {
testrpc.WaitForLeader(t, s2.RPC, "dc1") testrpc.WaitForLeader(t, s2.RPC, "dc1")
retry.Run(t, func(r *retry.R) { retry.Run(t, func(r *retry.R) {
out = structs.IndexedServices{}
if err := msgpackrpc.CallWithCodec(codec, "Catalog.ListServices", &args, &out); err != nil { if err := msgpackrpc.CallWithCodec(codec, "Catalog.ListServices", &args, &out); err != nil {
r.Fatalf("err: %v", err) r.Fatalf("err: %v", err)
} }
@ -1696,24 +1697,25 @@ func TestCatalog_ListServices_Stale(t *testing.T) {
args.AllowStale = false args.AllowStale = false
// Since the leader is now down, non-stale query should fail now // Since the leader is now down, non-stale query should fail now
if err := msgpackrpc.CallWithCodec(codec, "Catalog.ListServices", &args, &out); err == nil || err.Error() != structs.ErrNoLeader.Error() { out = structs.IndexedServices{}
if err := msgpackrpc.CallWithCodec(codec, "Catalog.ListServices", &args, &out); err.Error() != structs.ErrLeaderNotTracked.Error() {
t.Fatalf("expected %v but got err: %v and %v", structs.ErrNoLeader, err, out) t.Fatalf("expected %v but got err: %v and %v", structs.ErrNoLeader, err, out)
} }
if out.KnownLeader {
t.Fatalf("should not have a leader anymore: %#v", out)
}
// With stale, request should still work // With stale, request should still work
args.AllowStale = true args.AllowStale = true
retry.Run(t, func(r *retry.R) {
out = structs.IndexedServices{}
if err := msgpackrpc.CallWithCodec(codec, "Catalog.ListServices", &args, &out); err != nil { if err := msgpackrpc.CallWithCodec(codec, "Catalog.ListServices", &args, &out); err != nil {
t.Fatalf("err: %v", err) r.Fatalf("err: %v", err)
}
// Should find old service
if len(out.Services) != 1 {
t.Fatalf("bad: %#v", out)
} }
if out.KnownLeader || len(out.Services) != 1 {
if out.KnownLeader { r.Fatalf("got %t nodes want %d", out.KnownLeader, len(out.Services))
t.Fatalf("should not have a leader anymore: %#v", out)
} }
})
} }
func TestCatalog_ListServiceNodes(t *testing.T) { func TestCatalog_ListServiceNodes(t *testing.T) {

@ -285,19 +285,17 @@ TRY:
// Use the zero value for RPCInfo if the request doesn't implement RPCInfo // Use the zero value for RPCInfo if the request doesn't implement RPCInfo
info, _ := args.(structs.RPCInfo) info, _ := args.(structs.RPCInfo)
if retry := canRetry(info, rpcErr); !retry { if retry := canRetry(info, rpcErr, firstCheck, c.config); !retry {
return rpcErr return rpcErr
} }
// We can wait a bit and retry! // We can wait a bit and retry!
if time.Since(firstCheck) < c.config.RPCHoldTimeout { jitter := lib.RandomStagger(c.config.RPCHoldTimeout / structs.JitterFraction)
jitter := lib.RandomStagger(c.config.RPCHoldTimeout / jitterFraction)
select { select {
case <-time.After(jitter): case <-time.After(jitter):
goto TRY goto TRY
case <-c.shutdownCh: case <-c.shutdownCh:
} }
}
return rpcErr return rpcErr
} }

@ -75,12 +75,6 @@ var RPCSummaries = []prometheus.SummaryDefinition{
} }
const ( const (
// jitterFraction is a the limit to the amount of jitter we apply
// to a user specified MaxQueryTime. We divide the specified time by
// the fraction. So 16 == 6.25% limit of jitter. This same fraction
// is applied to the RPCHoldTimeout
jitterFraction = 16
// Warn if the Raft command is larger than this. // Warn if the Raft command is larger than this.
// If it's over 1MB something is probably being abusive. // If it's over 1MB something is probably being abusive.
raftWarnSize = 1024 * 1024 raftWarnSize = 1024 * 1024
@ -526,7 +520,14 @@ func (c *limitedConn) Read(b []byte) (n int, err error) {
} }
// canRetry returns true if the request and error indicate that a retry is safe. // canRetry returns true if the request and error indicate that a retry is safe.
func canRetry(info structs.RPCInfo, err error) bool { func canRetry(info structs.RPCInfo, err error, start time.Time, config *Config) bool {
if info != nil && info.HasTimedOut(start, config.RPCHoldTimeout, config.MaxQueryTime, config.DefaultQueryTime) {
// RPCInfo timeout may include extra time for MaxQueryTime
return false
} else if info == nil && time.Since(start) > config.RPCHoldTimeout {
// When not RPCInfo, timeout is only RPCHoldTimeout
return false
}
// No leader errors are always safe to retry since no state could have // No leader errors are always safe to retry since no state could have
// been changed. // been changed.
if structs.IsErrNoLeader(err) { if structs.IsErrNoLeader(err) {
@ -545,16 +546,16 @@ func canRetry(info structs.RPCInfo, err error) bool {
// ForwardRPC is used to forward an RPC request to a remote DC or to the local leader // ForwardRPC is used to forward an RPC request to a remote DC or to the local leader
// Returns a bool of if forwarding was performed, as well as any error // Returns a bool of if forwarding was performed, as well as any error
func (s *Server) ForwardRPC(method string, req structs.RPCInfo, reply interface{}) (bool, error) { func (s *Server) ForwardRPC(method string, info structs.RPCInfo, reply interface{}) (bool, error) {
var firstCheck time.Time firstCheck := time.Now()
// Handle DC forwarding // Handle DC forwarding
dc := req.RequestDatacenter() dc := info.RequestDatacenter()
if dc != s.config.Datacenter { if dc != s.config.Datacenter {
// Local tokens only work within the current datacenter. Check to see // Local tokens only work within the current datacenter. Check to see
// if we are attempting to forward one to a remote datacenter and strip // if we are attempting to forward one to a remote datacenter and strip
// it, falling back on the anonymous token on the other end. // it, falling back on the anonymous token on the other end.
if token := req.TokenSecret(); token != "" { if token := info.TokenSecret(); token != "" {
done, ident, err := s.ResolveIdentityFromToken(token) done, ident, err := s.ResolveIdentityFromToken(token)
if done { if done {
if err != nil && !acl.IsErrNotFound(err) { if err != nil && !acl.IsErrNotFound(err) {
@ -562,18 +563,18 @@ func (s *Server) ForwardRPC(method string, req structs.RPCInfo, reply interface{
} }
if ident != nil && ident.IsLocal() { if ident != nil && ident.IsLocal() {
// Strip it from the request. // Strip it from the request.
req.SetTokenSecret("") info.SetTokenSecret("")
defer req.SetTokenSecret(token) defer info.SetTokenSecret(token)
} }
} }
} }
err := s.forwardDC(method, dc, req, reply) err := s.forwardDC(method, dc, info, reply)
return true, err return true, err
} }
// Check if we can allow a stale read, ensure our local DB is initialized // Check if we can allow a stale read, ensure our local DB is initialized
if req.IsRead() && req.AllowStaleRead() && !s.raft.LastContact().IsZero() { if info.IsRead() && info.AllowStaleRead() && !s.raft.LastContact().IsZero() {
return false, nil return false, nil
} }
@ -596,20 +597,15 @@ CHECK_LEADER:
// Handle the case of a known leader // Handle the case of a known leader
if leader != nil { if leader != nil {
rpcErr = s.connPool.RPC(s.config.Datacenter, leader.ShortName, leader.Addr, rpcErr = s.connPool.RPC(s.config.Datacenter, leader.ShortName, leader.Addr,
method, req, reply) method, info, reply)
if rpcErr != nil && canRetry(req, rpcErr) { if rpcErr == nil {
goto RETRY return true, nil
} }
return true, rpcErr
} }
RETRY: if retry := canRetry(info, rpcErr, firstCheck, s.config); retry {
// Gate the request until there is a leader // Gate the request until there is a leader
if firstCheck.IsZero() { jitter := lib.RandomStagger(s.config.RPCHoldTimeout / structs.JitterFraction)
firstCheck = time.Now()
}
if time.Since(firstCheck) < s.config.RPCHoldTimeout {
jitter := lib.RandomStagger(s.config.RPCHoldTimeout / jitterFraction)
select { select {
case <-time.After(jitter): case <-time.After(jitter):
goto CHECK_LEADER goto CHECK_LEADER
@ -832,7 +828,7 @@ func (s *Server) blockingQuery(queryOpts structs.QueryOptionsCompat, queryMeta s
} }
// Apply a small amount of jitter to the request. // Apply a small amount of jitter to the request.
queryTimeout += lib.RandomStagger(queryTimeout / jitterFraction) queryTimeout += lib.RandomStagger(queryTimeout / structs.JitterFraction)
// wrap the base context with a deadline // wrap the base context with a deadline
ctx, cancel = context.WithDeadline(ctx, time.Now().Add(queryTimeout)) ctx, cancel = context.WithDeadline(ctx, time.Now().Add(queryTimeout))
@ -933,7 +929,7 @@ func (s *Server) consistentRead() error {
if s.isReadyForConsistentReads() { if s.isReadyForConsistentReads() {
return nil return nil
} }
jitter := lib.RandomStagger(s.config.RPCHoldTimeout / jitterFraction) jitter := lib.RandomStagger(s.config.RPCHoldTimeout / structs.JitterFraction)
deadline := time.Now().Add(s.config.RPCHoldTimeout) deadline := time.Now().Add(s.config.RPCHoldTimeout)
for time.Now().Before(deadline) { for time.Now().Before(deadline) {

@ -962,10 +962,17 @@ func TestCanRetry(t *testing.T) {
req structs.RPCInfo req structs.RPCInfo
err error err error
expected bool expected bool
timeout time.Time
} }
config := DefaultConfig()
now := time.Now()
config.RPCHoldTimeout = 7 * time.Second
run := func(t *testing.T, tc testCase) { run := func(t *testing.T, tc testCase) {
require.Equal(t, tc.expected, canRetry(tc.req, tc.err)) timeOutValue := tc.timeout
if timeOutValue.IsZero() {
timeOutValue = now
}
require.Equal(t, tc.expected, canRetry(tc.req, tc.err, timeOutValue, config))
} }
var testCases = []testCase{ var testCases = []testCase{
@ -990,6 +997,46 @@ func TestCanRetry(t *testing.T) {
err: io.EOF, err: io.EOF,
expected: true, expected: true,
}, },
{
name: "EOF error",
req: &structs.DCSpecificRequest{},
err: io.EOF,
expected: true,
},
{
name: "HasTimedOut implementation with no error",
req: &structs.DCSpecificRequest{},
err: nil,
expected: false,
},
{
name: "HasTimedOut implementation timedOut with no error",
req: &structs.DCSpecificRequest{},
err: nil,
expected: false,
timeout: now.Add(-(config.RPCHoldTimeout + time.Second)),
},
{
name: "HasTimedOut implementation timedOut (with EOF error)",
req: &structs.DCSpecificRequest{},
err: io.EOF,
expected: false,
timeout: now.Add(-(config.RPCHoldTimeout + time.Second)),
},
{
name: "HasTimedOut implementation timedOut blocking call",
req: &structs.DCSpecificRequest{QueryOptions: structs.QueryOptions{MaxQueryTime: 300, MinQueryIndex: 1}},
err: nil,
expected: false,
timeout: now.Add(-(config.RPCHoldTimeout + config.MaxQueryTime + time.Second)),
},
{
name: "HasTimedOut implementation timedOut blocking call (MaxQueryTime not set)",
req: &structs.DCSpecificRequest{QueryOptions: structs.QueryOptions{MinQueryIndex: 1}},
err: nil,
expected: false,
timeout: now.Add(-(config.RPCHoldTimeout + config.MaxQueryTime + time.Second)),
},
{ {
name: "EOF on write request", name: "EOF on write request",
err: io.EOF, err: io.EOF,
@ -1011,3 +1058,7 @@ type isReadRequest struct {
func (r isReadRequest) IsRead() bool { func (r isReadRequest) IsRead() bool {
return true return true
} }
func (r isReadRequest) HasTimedOut(since time.Time, rpcHoldTimeout, maxQueryTime, defaultQueryTime time.Duration) bool {
return false
}

@ -159,6 +159,12 @@ const (
// we multiply by time.Second // we multiply by time.Second
lockDelayMinThreshold = 1000 lockDelayMinThreshold = 1000
// JitterFraction is a the limit to the amount of jitter we apply
// to a user specified MaxQueryTime. We divide the specified time by
// the fraction. So 16 == 6.25% limit of jitter. This same fraction
// is applied to the RPCHoldTimeout
JitterFraction = 16
// WildcardSpecifier is the string which should be used for specifying a wildcard // WildcardSpecifier is the string which should be used for specifying a wildcard
// The exact semantics of the wildcard is left up to the code where its used. // The exact semantics of the wildcard is left up to the code where its used.
WildcardSpecifier = "*" WildcardSpecifier = "*"
@ -193,6 +199,7 @@ type RPCInfo interface {
AllowStaleRead() bool AllowStaleRead() bool
TokenSecret() string TokenSecret() string
SetTokenSecret(string) SetTokenSecret(string)
HasTimedOut(since time.Time, rpcHoldTimeout, maxQueryTime, defaultQueryTime time.Duration) bool
} }
// QueryOptions is used to specify various flags for read queries // QueryOptions is used to specify various flags for read queries
@ -291,6 +298,20 @@ func (q *QueryOptions) SetTokenSecret(s string) {
q.Token = s q.Token = s
} }
func (q QueryOptions) HasTimedOut(start time.Time, rpcHoldTimeout, maxQueryTime, defaultQueryTime time.Duration) bool {
if q.MinQueryIndex > 0 {
if q.MaxQueryTime > maxQueryTime {
q.MaxQueryTime = maxQueryTime
} else if q.MaxQueryTime <= 0 {
q.MaxQueryTime = defaultQueryTime
}
q.MaxQueryTime += lib.RandomStagger(q.MaxQueryTime / JitterFraction)
return time.Since(start) > (q.MaxQueryTime + rpcHoldTimeout)
}
return time.Since(start) > rpcHoldTimeout
}
type WriteRequest struct { type WriteRequest struct {
// Token is the ACL token ID. If not provided, the 'anonymous' // Token is the ACL token ID. If not provided, the 'anonymous'
// token is assumed for backwards compatibility. // token is assumed for backwards compatibility.
@ -314,6 +335,10 @@ func (w *WriteRequest) SetTokenSecret(s string) {
w.Token = s w.Token = s
} }
func (w WriteRequest) HasTimedOut(start time.Time, rpcHoldTimeout, maxQueryTime, defaultQueryTime time.Duration) bool {
return time.Since(start) > rpcHoldTimeout
}
// QueryMeta allows a query response to include potentially // QueryMeta allows a query response to include potentially
// useful metadata about a query // useful metadata about a query
type QueryMeta struct { type QueryMeta struct {

@ -16,6 +16,9 @@ var yamuxSessionShutdown = yamux.ErrSessionShutdown.Error()
// IsErrEOF returns true if we get an EOF error from the socket itself, or // IsErrEOF returns true if we get an EOF error from the socket itself, or
// an EOF equivalent error from yamux. // an EOF equivalent error from yamux.
func IsErrEOF(err error) bool { func IsErrEOF(err error) bool {
if err == nil {
return false
}
if errors.Is(err, io.EOF) { if errors.Is(err, io.EOF) {
return true return true
} }

@ -1,5 +1,7 @@
package pbautoconf package pbautoconf
import "time"
func (req *AutoConfigRequest) RequestDatacenter() string { func (req *AutoConfigRequest) RequestDatacenter() string {
return req.Datacenter return req.Datacenter
} }
@ -19,3 +21,7 @@ func (req *AutoConfigRequest) TokenSecret() string {
func (req *AutoConfigRequest) SetTokenSecret(token string) { func (req *AutoConfigRequest) SetTokenSecret(token string) {
req.ConsulToken = token req.ConsulToken = token
} }
func (req *AutoConfigRequest) HasTimedOut(start time.Time, rpcHoldTimeout, maxQueryTime, defaultQueryTime time.Duration) bool {
return time.Since(start) > rpcHoldTimeout
}

Loading…
Cancel
Save