From 6599a9be1df8749431ed0f5af7a7a8772ce4281e Mon Sep 17 00:00:00 2001 From: Derek Menteer <105233703+hashi-derek@users.noreply.github.com> Date: Wed, 15 Feb 2023 11:54:44 -0600 Subject: [PATCH] Fix nil-pointer panics from proxycfg package. (#16277) Prior to this PR, servers / agents would panic and crash if an ingress or api gateway were configured to use a discovery chain that both: 1. Referenced a peered service 2. Had a mesh gateway mode of local This could occur, because code for handling upstream watches was shared between both connect-proxy and the gateways. As a short-term fix, this PR ensures that the maps are always initialized for these gateway services. This PR also wraps the proxycfg execution and service registration calls with recover statements to ensure that future issues like this do not put the server into an unrecoverable state. --- agent/proxycfg/api_gateway.go | 1 + agent/proxycfg/ingress_gateway.go | 1 + agent/proxycfg/manager.go | 15 +++++++++++++++ agent/proxycfg/state.go | 16 ++++++++++++++++ 4 files changed, 33 insertions(+) diff --git a/agent/proxycfg/api_gateway.go b/agent/proxycfg/api_gateway.go index 28a6c42300..c9b5ad1007 100644 --- a/agent/proxycfg/api_gateway.go +++ b/agent/proxycfg/api_gateway.go @@ -73,6 +73,7 @@ func (h *handlerAPIGateway) initialize(ctx context.Context) (ConfigSnapshot, err snap.APIGateway.WatchedDiscoveryChains = make(map[UpstreamID]context.CancelFunc) snap.APIGateway.WatchedGateways = make(map[UpstreamID]map[string]context.CancelFunc) snap.APIGateway.WatchedGatewayEndpoints = make(map[UpstreamID]map[string]structs.CheckServiceNodes) + snap.APIGateway.WatchedLocalGWEndpoints = watch.NewMap[string, structs.CheckServiceNodes]() snap.APIGateway.WatchedUpstreams = make(map[UpstreamID]map[string]context.CancelFunc) snap.APIGateway.WatchedUpstreamEndpoints = make(map[UpstreamID]map[string]structs.CheckServiceNodes) diff --git a/agent/proxycfg/ingress_gateway.go b/agent/proxycfg/ingress_gateway.go index b6c1cd6e02..f0f75d8107 100644 --- a/agent/proxycfg/ingress_gateway.go +++ b/agent/proxycfg/ingress_gateway.go @@ -67,6 +67,7 @@ func (s *handlerIngressGateway) initialize(ctx context.Context) (ConfigSnapshot, snap.IngressGateway.WatchedUpstreamEndpoints = make(map[UpstreamID]map[string]structs.CheckServiceNodes) snap.IngressGateway.WatchedGateways = make(map[UpstreamID]map[string]context.CancelFunc) snap.IngressGateway.WatchedGatewayEndpoints = make(map[UpstreamID]map[string]structs.CheckServiceNodes) + snap.IngressGateway.WatchedLocalGWEndpoints = watch.NewMap[string, structs.CheckServiceNodes]() snap.IngressGateway.Listeners = make(map[IngressListenerKey]structs.IngressListener) snap.IngressGateway.UpstreamPeerTrustBundles = watch.NewMap[string, *pbpeering.PeeringTrustBundle]() snap.IngressGateway.PeerUpstreamEndpoints = watch.NewMap[UpstreamID, structs.CheckServiceNodes]() diff --git a/agent/proxycfg/manager.go b/agent/proxycfg/manager.go index eb5df5a855..c58268e7e0 100644 --- a/agent/proxycfg/manager.go +++ b/agent/proxycfg/manager.go @@ -2,6 +2,7 @@ package proxycfg import ( "errors" + "runtime/debug" "sync" "github.com/hashicorp/go-hclog" @@ -142,6 +143,20 @@ func (m *Manager) Register(id ProxyID, ns *structs.NodeService, source ProxySour m.mu.Lock() defer m.mu.Unlock() + defer func() { + if r := recover(); r != nil { + m.Logger.Error("unexpected panic during service manager registration", + "node", id.NodeName, + "service", id.ServiceID, + "message", r, + "stacktrace", string(debug.Stack()), + ) + } + }() + return m.register(id, ns, source, token, overwrite) +} + +func (m *Manager) register(id ProxyID, ns *structs.NodeService, source ProxySource, token string, overwrite bool) error { state, ok := m.proxies[id] if ok { if state.source != source && !overwrite { diff --git a/agent/proxycfg/state.go b/agent/proxycfg/state.go index 57964d54fe..d312c3b4c1 100644 --- a/agent/proxycfg/state.go +++ b/agent/proxycfg/state.go @@ -6,6 +6,7 @@ import ( "fmt" "net" "reflect" + "runtime/debug" "sync/atomic" "time" @@ -298,6 +299,21 @@ func newConfigSnapshotFromServiceInstance(s serviceInstance, config stateConfig) } func (s *state) run(ctx context.Context, snap *ConfigSnapshot) { + // Add a recover here so than any panics do not make their way up + // into the server / agent. + defer func() { + if r := recover(); r != nil { + s.logger.Error("unexpected panic while running proxycfg", + "node", s.serviceInstance.proxyID.NodeName, + "service", s.serviceInstance.proxyID.ServiceID, + "message", r, + "stacktrace", string(debug.Stack())) + } + }() + s.unsafeRun(ctx, snap) +} + +func (s *state) unsafeRun(ctx context.Context, snap *ConfigSnapshot) { // Close the channel we return from Watch when we stop so consumers can stop // watching and clean up their goroutines. It's important we do this here and // not in Close since this routine sends on this chan and so might panic if it