Browse Source

Fix nil-pointer panics from proxycfg package. (#16277)

Prior to this PR, servers / agents would panic and crash if an ingress
or api gateway were configured to use a discovery chain that both:

1. Referenced a peered service
2. Had a mesh gateway mode of local

This could occur, because code for handling upstream watches was shared
between both connect-proxy and the gateways. As a short-term fix, this
PR ensures that the maps are always initialized for these gateway services.

This PR also wraps the proxycfg execution and service
registration calls with recover statements to ensure that future issues
like this do not put the server into an unrecoverable state.
spatel/NET-1847-repro
Derek Menteer 2 years ago committed by GitHub
parent
commit
6599a9be1d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 1
      agent/proxycfg/api_gateway.go
  2. 1
      agent/proxycfg/ingress_gateway.go
  3. 15
      agent/proxycfg/manager.go
  4. 16
      agent/proxycfg/state.go

1
agent/proxycfg/api_gateway.go

@ -73,6 +73,7 @@ func (h *handlerAPIGateway) initialize(ctx context.Context) (ConfigSnapshot, err
snap.APIGateway.WatchedDiscoveryChains = make(map[UpstreamID]context.CancelFunc)
snap.APIGateway.WatchedGateways = make(map[UpstreamID]map[string]context.CancelFunc)
snap.APIGateway.WatchedGatewayEndpoints = make(map[UpstreamID]map[string]structs.CheckServiceNodes)
snap.APIGateway.WatchedLocalGWEndpoints = watch.NewMap[string, structs.CheckServiceNodes]()
snap.APIGateway.WatchedUpstreams = make(map[UpstreamID]map[string]context.CancelFunc)
snap.APIGateway.WatchedUpstreamEndpoints = make(map[UpstreamID]map[string]structs.CheckServiceNodes)

1
agent/proxycfg/ingress_gateway.go

@ -67,6 +67,7 @@ func (s *handlerIngressGateway) initialize(ctx context.Context) (ConfigSnapshot,
snap.IngressGateway.WatchedUpstreamEndpoints = make(map[UpstreamID]map[string]structs.CheckServiceNodes)
snap.IngressGateway.WatchedGateways = make(map[UpstreamID]map[string]context.CancelFunc)
snap.IngressGateway.WatchedGatewayEndpoints = make(map[UpstreamID]map[string]structs.CheckServiceNodes)
snap.IngressGateway.WatchedLocalGWEndpoints = watch.NewMap[string, structs.CheckServiceNodes]()
snap.IngressGateway.Listeners = make(map[IngressListenerKey]structs.IngressListener)
snap.IngressGateway.UpstreamPeerTrustBundles = watch.NewMap[string, *pbpeering.PeeringTrustBundle]()
snap.IngressGateway.PeerUpstreamEndpoints = watch.NewMap[UpstreamID, structs.CheckServiceNodes]()

15
agent/proxycfg/manager.go

@ -2,6 +2,7 @@ package proxycfg
import (
"errors"
"runtime/debug"
"sync"
"github.com/hashicorp/go-hclog"
@ -142,6 +143,20 @@ func (m *Manager) Register(id ProxyID, ns *structs.NodeService, source ProxySour
m.mu.Lock()
defer m.mu.Unlock()
defer func() {
if r := recover(); r != nil {
m.Logger.Error("unexpected panic during service manager registration",
"node", id.NodeName,
"service", id.ServiceID,
"message", r,
"stacktrace", string(debug.Stack()),
)
}
}()
return m.register(id, ns, source, token, overwrite)
}
func (m *Manager) register(id ProxyID, ns *structs.NodeService, source ProxySource, token string, overwrite bool) error {
state, ok := m.proxies[id]
if ok {
if state.source != source && !overwrite {

16
agent/proxycfg/state.go

@ -6,6 +6,7 @@ import (
"fmt"
"net"
"reflect"
"runtime/debug"
"sync/atomic"
"time"
@ -298,6 +299,21 @@ func newConfigSnapshotFromServiceInstance(s serviceInstance, config stateConfig)
}
func (s *state) run(ctx context.Context, snap *ConfigSnapshot) {
// Add a recover here so than any panics do not make their way up
// into the server / agent.
defer func() {
if r := recover(); r != nil {
s.logger.Error("unexpected panic while running proxycfg",
"node", s.serviceInstance.proxyID.NodeName,
"service", s.serviceInstance.proxyID.ServiceID,
"message", r,
"stacktrace", string(debug.Stack()))
}
}()
s.unsafeRun(ctx, snap)
}
func (s *state) unsafeRun(ctx context.Context, snap *ConfigSnapshot) {
// Close the channel we return from Watch when we stop so consumers can stop
// watching and clean up their goroutines. It's important we do this here and
// not in Close since this routine sends on this chan and so might panic if it

Loading…
Cancel
Save