From 0f0b080514a4604135f889fb900706030e3f8cae Mon Sep 17 00:00:00 2001 From: Dan Stough Date: Tue, 13 Feb 2024 12:08:01 -0500 Subject: [PATCH] [CE] feat(v2dns): add v2 style query metrics (#20608) feat(v2dns): add v2 style query metrics --- agent/discovery/query_fetcher_v1.go | 10 +++++ agent/dns.go | 41 +++++++++--------- agent/dns/router.go | 65 +++++++++++++++++++---------- agent/dns/router_test.go | 13 +++++- agent/setup.go | 2 + 5 files changed, 87 insertions(+), 44 deletions(-) diff --git a/agent/discovery/query_fetcher_v1.go b/agent/discovery/query_fetcher_v1.go index 3cf1654dec..2b80bc9152 100644 --- a/agent/discovery/query_fetcher_v1.go +++ b/agent/discovery/query_fetcher_v1.go @@ -13,6 +13,7 @@ import ( "time" "github.com/armon/go-metrics" + "github.com/armon/go-metrics/prometheus" "github.com/hashicorp/go-hclog" @@ -29,6 +30,15 @@ const ( staleCounterThreshold = 5 * time.Second ) +// DNSCounters pre-registers the staleness metric. +// This value is used by both the V1 and V2 DNS (V1 Catalog-only) servers. +var DNSCounters = []prometheus.CounterDefinition{ + { + Name: []string{"dns", "stale_queries"}, + Help: "Increments when an agent serves a query within the allowed stale threshold.", + }, +} + // v1DataFetcherDynamicConfig is used to store the dynamic configuration of the V1 data fetcher. type v1DataFetcherDynamicConfig struct { // Default request tenancy diff --git a/agent/dns.go b/agent/dns.go index c07fd12175..bfbee97fe7 100644 --- a/agent/dns.go +++ b/agent/dns.go @@ -16,7 +16,6 @@ import ( "time" "github.com/armon/go-metrics" - "github.com/armon/go-metrics/prometheus" "github.com/armon/go-radix" "github.com/hashicorp/go-hclog" "github.com/miekg/dns" @@ -33,24 +32,6 @@ import ( "github.com/hashicorp/consul/logging" ) -var DNSCounters = []prometheus.CounterDefinition{ - { - Name: []string{"dns", "stale_queries"}, - Help: "Increments when an agent serves a query within the allowed stale threshold.", - }, -} - -var DNSSummaries = []prometheus.SummaryDefinition{ - { - Name: []string{"dns", "ptr_query"}, - Help: "Measures the time spent handling a reverse DNS query for the given node.", - }, - { - Name: []string{"dns", "domain_query"}, - Help: "Measures the time spent handling a domain query for the given node.", - }, -} - const ( // UDP can fit ~25 A records in a 512B response, and ~14 AAAA // records. Limit further to prevent unintentional configuration @@ -406,8 +387,17 @@ func (d *DNSServer) getResponseDomain(questionName string) string { func (d *DNSServer) handlePtr(resp dns.ResponseWriter, req *dns.Msg) { q := req.Question[0] defer func(s time.Time) { + // V1 DNS-style metrics metrics.MeasureSinceWithLabels([]string{"dns", "ptr_query"}, s, []metrics.Label{{Name: "node", Value: d.agent.config.NodeName}}) + + // V2 DNS-style metrics for forward compatibility + metrics.MeasureSinceWithLabels([]string{"dns", "query"}, s, + []metrics.Label{ + {Name: "node", Value: d.agent.config.NodeName}, + {Name: "type", Value: dns.Type(dns.TypePTR).String()}, + }) + d.logger.Debug("request served from client", "question", q, "latency", time.Since(s).String(), @@ -519,12 +509,21 @@ func (d *DNSServer) handlePtr(resp dns.ResponseWriter, req *dns.Msg) { func (d *DNSServer) handleQuery(resp dns.ResponseWriter, req *dns.Msg) { q := req.Question[0] defer func(s time.Time) { + // V1 DNS-style metrics metrics.MeasureSinceWithLabels([]string{"dns", "domain_query"}, s, []metrics.Label{{Name: "node", Value: d.agent.config.NodeName}}) + + // V2 DNS-style metrics for forward compatibility + metrics.MeasureSinceWithLabels([]string{"dns", "query"}, s, + []metrics.Label{ + {Name: "node", Value: d.agent.config.NodeName}, + {Name: "type", Value: dns.Type(q.Qtype).String()}, + }) + d.logger.Debug("request served from client", "name", q.Name, - "type", dns.Type(q.Qtype), - "class", dns.Class(q.Qclass), + "type", dns.Type(q.Qtype).String(), + "class", dns.Class(q.Qclass).String(), "latency", time.Since(s).String(), "client", resp.RemoteAddr().String(), "client_network", resp.RemoteAddr().Network(), diff --git a/agent/dns/router.go b/agent/dns/router.go index 538e7c9111..cca002793a 100644 --- a/agent/dns/router.go +++ b/agent/dns/router.go @@ -13,6 +13,7 @@ import ( "sync/atomic" "time" + "github.com/armon/go-metrics" "github.com/armon/go-radix" "github.com/miekg/dns" @@ -47,8 +48,6 @@ var ( trailingSpacesRE = regexp.MustCompile(" +$") ) -// TODO (v2-dns): metrics - // Context is used augment a DNS message with Consul-specific metadata. type Context struct { Token string @@ -105,6 +104,7 @@ type Router struct { domain string altDomain string datacenter string + nodeName string logger hclog.Logger tokenFunc func() string @@ -124,8 +124,6 @@ func NewRouter(cfg Config) (*Router, error) { domain := dns.CanonicalName(cfg.AgentConfig.DNSDomain) altDomain := dns.CanonicalName(cfg.AgentConfig.DNSAltDomain) - // TODO (v2-dns): need to figure out tenancy information here in a way that work for V2 and V1 - logger := cfg.Logger.Named(logging.DNS) router := &Router{ @@ -135,6 +133,7 @@ func NewRouter(cfg Config) (*Router, error) { altDomain: altDomain, datacenter: cfg.AgentConfig.Datacenter, logger: logger, + nodeName: cfg.AgentConfig.NodeName, tokenFunc: cfg.TokenFunc, translateAddressFunc: cfg.TranslateAddressFunc, translateServiceAddressFunc: cfg.TranslateServiceAddressFunc, @@ -148,7 +147,37 @@ func NewRouter(cfg Config) (*Router, error) { // HandleRequest is used to process an individual DNS request. It returns a message in success or fail cases. func (r *Router) HandleRequest(req *dns.Msg, reqCtx Context, remoteAddress net.Addr) *dns.Msg { - return r.handleRequestRecursively(req, reqCtx, remoteAddress, maxRecursionLevelDefault) + configCtx := r.dynamicConfig.Load().(*RouterDynamicConfig) + + r.logger.Trace("received request", "question", req.Question[0].Name, "type", dns.Type(req.Question[0].Qtype).String()) + + err := validateAndNormalizeRequest(req) + if err != nil { + r.logger.Error("error parsing DNS query", "error", err) + if errors.Is(err, errInvalidQuestion) { + return createRefusedResponse(req) + } + return createServerFailureResponse(req, configCtx, false) + } + + defer func(s time.Time, q dns.Question) { + metrics.MeasureSinceWithLabels([]string{"dns", "query"}, s, + []metrics.Label{ + {Name: "node", Value: r.nodeName}, + {Name: "type", Value: dns.Type(q.Qtype).String()}, + }) + + r.logger.Debug("request served from client", + "name", q.Name, + "type", dns.Type(q.Qtype).String(), + "class", dns.Class(q.Qclass).String(), + "latency", time.Since(s).String(), + "client", remoteAddress.String(), + "client_network", remoteAddress.Network(), + ) + }(time.Now(), req.Question[0]) + + return r.handleRequestRecursively(req, reqCtx, configCtx, remoteAddress, maxRecursionLevelDefault) } // getErrorFromECSNotGlobalError returns the underlying error from an ECSNotGlobalError, if it exists. @@ -161,20 +190,14 @@ func getErrorFromECSNotGlobalError(err error) error { // handleRequestRecursively is used to process an individual DNS request. It will recurse as needed // a maximum number of times and returns a message in success or fail cases. -func (r *Router) handleRequestRecursively(req *dns.Msg, reqCtx Context, +func (r *Router) handleRequestRecursively(req *dns.Msg, reqCtx Context, configCtx *RouterDynamicConfig, remoteAddress net.Addr, maxRecursionLevel int) *dns.Msg { - configCtx := r.dynamicConfig.Load().(*RouterDynamicConfig) - r.logger.Trace("received request", "question", req.Question[0].Name, "type", dns.Type(req.Question[0].Qtype).String()) - - err := validateAndNormalizeRequest(req) - if err != nil { - r.logger.Error("error parsing DNS query", "error", err) - if errors.Is(err, errInvalidQuestion) { - return createRefusedResponse(req) - } - return createServerFailureResponse(req, configCtx, false) - } + r.logger.Trace( + "received request", + "question", req.Question[0].Name, + "type", dns.Type(req.Question[0].Qtype).String(), + "recursion_remaining", maxRecursionLevel) responseDomain, needRecurse := r.parseDomain(req.Question[0].Name) if needRecurse && !canRecurse(configCtx) { @@ -655,7 +678,7 @@ func (r *Router) defaultAgentDNSRequestContext() Context { } // resolveCNAME is used to recursively resolve CNAME records -func (r *Router) resolveCNAME(cfg *RouterDynamicConfig, name string, reqCtx Context, +func (r *Router) resolveCNAME(cfgContext *RouterDynamicConfig, name string, reqCtx Context, remoteAddress net.Addr, maxRecursionLevel int) []dns.RR { // If the CNAME record points to a Consul address, resolve it internally // Convert query to lowercase because DNS is case-insensitive; d.domain and @@ -670,13 +693,13 @@ func (r *Router) resolveCNAME(cfg *RouterDynamicConfig, name string, reqCtx Cont req.SetQuestion(name, dns.TypeANY) // TODO: handle error response - resp := r.handleRequestRecursively(req, reqCtx, nil, maxRecursionLevel-1) + resp := r.handleRequestRecursively(req, reqCtx, cfgContext, nil, maxRecursionLevel-1) return resp.Answer } // Do nothing if we don't have a recursor - if !canRecurse(cfg) { + if !canRecurse(cfgContext) { return nil } @@ -685,7 +708,7 @@ func (r *Router) resolveCNAME(cfg *RouterDynamicConfig, name string, reqCtx Cont m.SetQuestion(name, dns.TypeA) // Make a DNS lookup request - recursorResponse, err := r.recursor.handle(m, cfg, remoteAddress) + recursorResponse, err := r.recursor.handle(m, cfgContext, remoteAddress) if err == nil { return recursorResponse.Answer } diff --git a/agent/dns/router_test.go b/agent/dns/router_test.go index f5014d6c88..ea19db9e4b 100644 --- a/agent/dns/router_test.go +++ b/agent/dns/router_test.go @@ -6,12 +6,13 @@ package dns import ( "errors" "fmt" - "github.com/hashicorp/consul/internal/dnsutil" "net" "reflect" "testing" "time" + "github.com/hashicorp/consul/internal/dnsutil" + "github.com/miekg/dns" "github.com/stretchr/testify/mock" "github.com/stretchr/testify/require" @@ -2743,7 +2744,15 @@ func runHandleTestCases(t *testing.T, tc HandleTestCase) { if ctx == nil { ctx = &Context{} } - actual := router.HandleRequest(tc.request, *ctx, tc.remoteAddress) + + var remoteAddress net.Addr + if tc.remoteAddress != nil { + remoteAddress = tc.remoteAddress + } else { + remoteAddress = &net.UDPAddr{} + } + + actual := router.HandleRequest(tc.request, *ctx, remoteAddress) require.Equal(t, tc.response, actual) } diff --git a/agent/setup.go b/agent/setup.go index 7dc8cdb9a7..772cdfbcd5 100644 --- a/agent/setup.go +++ b/agent/setup.go @@ -28,6 +28,7 @@ import ( "github.com/hashicorp/consul/agent/consul/stream" "github.com/hashicorp/consul/agent/consul/usagemetrics" "github.com/hashicorp/consul/agent/consul/xdscapacity" + "github.com/hashicorp/consul/agent/discovery" "github.com/hashicorp/consul/agent/grpc-external/limiter" grpcInt "github.com/hashicorp/consul/agent/grpc-internal" "github.com/hashicorp/consul/agent/grpc-internal/balancer" @@ -434,6 +435,7 @@ func getPrometheusDefs(cfg *config.RuntimeConfig, isServer bool) ([]prometheus.G consul.CatalogCounters, consul.ClientCounters, consul.RPCCounters, + discovery.DNSCounters, grpcWare.StatsCounters, local.StateCounters, xds.StatsCounters,