feat(v2dns): add v2 style query metrics

pull/20608/head
DanStough 2024-02-12 18:05:32 -05:00
parent 35f1173689
commit 95a869a380
No known key found for this signature in database
GPG Key ID: 0D994ED7D73D7809
5 changed files with 89 additions and 46 deletions

View File

@ -13,6 +13,7 @@ import (
"time" "time"
"github.com/armon/go-metrics" "github.com/armon/go-metrics"
"github.com/armon/go-metrics/prometheus"
"github.com/hashicorp/go-hclog" "github.com/hashicorp/go-hclog"
@ -29,6 +30,15 @@ const (
staleCounterThreshold = 5 * time.Second staleCounterThreshold = 5 * time.Second
) )
// DNSCounters holds the Prometheus counter definitions used to pre-register
// the DNS staleness metric. It currently defines a single counter,
// dns.stale_queries.
// This value is used by both the V1 and V2 DNS (V1 Catalog-only) servers.
var DNSCounters = []prometheus.CounterDefinition{
{
Name: []string{"dns", "stale_queries"},
Help: "Increments when an agent serves a query within the allowed stale threshold.",
},
}
// v1DataFetcherDynamicConfig is used to store the dynamic configuration of the V1 data fetcher. // v1DataFetcherDynamicConfig is used to store the dynamic configuration of the V1 data fetcher.
type v1DataFetcherDynamicConfig struct { type v1DataFetcherDynamicConfig struct {
// Default request tenancy // Default request tenancy

View File

@ -16,7 +16,6 @@ import (
"time" "time"
"github.com/armon/go-metrics" "github.com/armon/go-metrics"
"github.com/armon/go-metrics/prometheus"
"github.com/armon/go-radix" "github.com/armon/go-radix"
"github.com/hashicorp/go-hclog" "github.com/hashicorp/go-hclog"
"github.com/miekg/dns" "github.com/miekg/dns"
@ -33,24 +32,6 @@ import (
"github.com/hashicorp/consul/logging" "github.com/hashicorp/consul/logging"
) )
// DNSCounters lists the Prometheus counter definitions for DNS. It defines a
// single counter, dns.stale_queries.
var DNSCounters = []prometheus.CounterDefinition{
{
Name: []string{"dns", "stale_queries"},
Help: "Increments when an agent serves a query within the allowed stale threshold.",
},
}
// DNSSummaries lists the Prometheus summary definitions for DNS query timing:
// dns.ptr_query for reverse DNS queries and dns.domain_query for domain
// queries.
var DNSSummaries = []prometheus.SummaryDefinition{
{
Name: []string{"dns", "ptr_query"},
Help: "Measures the time spent handling a reverse DNS query for the given node.",
},
{
Name: []string{"dns", "domain_query"},
Help: "Measures the time spent handling a domain query for the given node.",
},
}
const ( const (
// UDP can fit ~25 A records in a 512B response, and ~14 AAAA // UDP can fit ~25 A records in a 512B response, and ~14 AAAA
// records. Limit further to prevent unintentional configuration // records. Limit further to prevent unintentional configuration
@ -406,8 +387,17 @@ func (d *DNSServer) getResponseDomain(questionName string) string {
func (d *DNSServer) handlePtr(resp dns.ResponseWriter, req *dns.Msg) { func (d *DNSServer) handlePtr(resp dns.ResponseWriter, req *dns.Msg) {
q := req.Question[0] q := req.Question[0]
defer func(s time.Time) { defer func(s time.Time) {
// V1 DNS-style metrics
metrics.MeasureSinceWithLabels([]string{"dns", "ptr_query"}, s, metrics.MeasureSinceWithLabels([]string{"dns", "ptr_query"}, s,
[]metrics.Label{{Name: "node", Value: d.agent.config.NodeName}}) []metrics.Label{{Name: "node", Value: d.agent.config.NodeName}})
// V2 DNS-style metrics for forward compatibility
metrics.MeasureSinceWithLabels([]string{"dns", "query"}, s,
[]metrics.Label{
{Name: "node", Value: d.agent.config.NodeName},
{Name: "type", Value: dns.Type(dns.TypePTR).String()},
})
d.logger.Debug("request served from client", d.logger.Debug("request served from client",
"question", q, "question", q,
"latency", time.Since(s).String(), "latency", time.Since(s).String(),
@ -519,12 +509,21 @@ func (d *DNSServer) handlePtr(resp dns.ResponseWriter, req *dns.Msg) {
func (d *DNSServer) handleQuery(resp dns.ResponseWriter, req *dns.Msg) { func (d *DNSServer) handleQuery(resp dns.ResponseWriter, req *dns.Msg) {
q := req.Question[0] q := req.Question[0]
defer func(s time.Time) { defer func(s time.Time) {
// V1 DNS-style metrics
metrics.MeasureSinceWithLabels([]string{"dns", "domain_query"}, s, metrics.MeasureSinceWithLabels([]string{"dns", "domain_query"}, s,
[]metrics.Label{{Name: "node", Value: d.agent.config.NodeName}}) []metrics.Label{{Name: "node", Value: d.agent.config.NodeName}})
// V2 DNS-style metrics for forward compatibility
metrics.MeasureSinceWithLabels([]string{"dns", "query"}, s,
[]metrics.Label{
{Name: "node", Value: d.agent.config.NodeName},
{Name: "type", Value: dns.Type(q.Qtype).String()},
})
d.logger.Debug("request served from client", d.logger.Debug("request served from client",
"name", q.Name, "name", q.Name,
"type", dns.Type(q.Qtype), "type", dns.Type(q.Qtype).String(),
"class", dns.Class(q.Qclass), "class", dns.Class(q.Qclass).String(),
"latency", time.Since(s).String(), "latency", time.Since(s).String(),
"client", resp.RemoteAddr().String(), "client", resp.RemoteAddr().String(),
"client_network", resp.RemoteAddr().Network(), "client_network", resp.RemoteAddr().Network(),

View File

@ -13,6 +13,7 @@ import (
"sync/atomic" "sync/atomic"
"time" "time"
"github.com/armon/go-metrics"
"github.com/armon/go-radix" "github.com/armon/go-radix"
"github.com/miekg/dns" "github.com/miekg/dns"
@ -47,8 +48,6 @@ var (
trailingSpacesRE = regexp.MustCompile(" +$") trailingSpacesRE = regexp.MustCompile(" +$")
) )
// TODO (v2-dns): metrics
// Context is used to augment a DNS message with Consul-specific metadata. // Context is used to augment a DNS message with Consul-specific metadata.
type Context struct { type Context struct {
Token string Token string
@ -105,6 +104,7 @@ type Router struct {
domain string domain string
altDomain string altDomain string
datacenter string datacenter string
nodeName string
logger hclog.Logger logger hclog.Logger
tokenFunc func() string tokenFunc func() string
@ -124,8 +124,6 @@ func NewRouter(cfg Config) (*Router, error) {
domain := dns.CanonicalName(cfg.AgentConfig.DNSDomain) domain := dns.CanonicalName(cfg.AgentConfig.DNSDomain)
altDomain := dns.CanonicalName(cfg.AgentConfig.DNSAltDomain) altDomain := dns.CanonicalName(cfg.AgentConfig.DNSAltDomain)
// TODO (v2-dns): need to figure out tenancy information here in a way that works for V2 and V1
logger := cfg.Logger.Named(logging.DNS) logger := cfg.Logger.Named(logging.DNS)
router := &Router{ router := &Router{
@ -135,6 +133,7 @@ func NewRouter(cfg Config) (*Router, error) {
altDomain: altDomain, altDomain: altDomain,
datacenter: cfg.AgentConfig.Datacenter, datacenter: cfg.AgentConfig.Datacenter,
logger: logger, logger: logger,
nodeName: cfg.AgentConfig.NodeName,
tokenFunc: cfg.TokenFunc, tokenFunc: cfg.TokenFunc,
translateAddressFunc: cfg.TranslateAddressFunc, translateAddressFunc: cfg.TranslateAddressFunc,
translateServiceAddressFunc: cfg.TranslateServiceAddressFunc, translateServiceAddressFunc: cfg.TranslateServiceAddressFunc,
@ -148,21 +147,6 @@ func NewRouter(cfg Config) (*Router, error) {
// HandleRequest is used to process an individual DNS request. It returns a message in success or fail cases. // HandleRequest is used to process an individual DNS request. It returns a message in success or fail cases.
func (r *Router) HandleRequest(req *dns.Msg, reqCtx Context, remoteAddress net.Addr) *dns.Msg { func (r *Router) HandleRequest(req *dns.Msg, reqCtx Context, remoteAddress net.Addr) *dns.Msg {
return r.handleRequestRecursively(req, reqCtx, remoteAddress, maxRecursionLevelDefault)
}
// getErrorFromECSNotGlobalError returns the underlying error from an ECSNotGlobalError, if it exists.
//
// Any error that does not match discovery.ErrECSNotGlobal is returned
// unchanged. The ECSNotGlobalError is located with errors.As rather than a
// bare type assertion, so an error that matches the sentinel without being a
// discovery.ECSNotGlobalError at the top level (for example because it was
// wrapped) is returned as-is instead of causing a panic.
func getErrorFromECSNotGlobalError(err error) error {
	if !errors.Is(err, discovery.ErrECSNotGlobal) {
		return err
	}

	var ecsErr discovery.ECSNotGlobalError
	if errors.As(err, &ecsErr) {
		return ecsErr.Unwrap()
	}
	// Matches the sentinel but carries no ECSNotGlobalError to unwrap.
	return err
}
// handleRequestRecursively is used to process an individual DNS request. It will recurse as needed
// a maximum number of times and returns a message in success or fail cases.
func (r *Router) handleRequestRecursively(req *dns.Msg, reqCtx Context,
remoteAddress net.Addr, maxRecursionLevel int) *dns.Msg {
configCtx := r.dynamicConfig.Load().(*RouterDynamicConfig) configCtx := r.dynamicConfig.Load().(*RouterDynamicConfig)
r.logger.Trace("received request", "question", req.Question[0].Name, "type", dns.Type(req.Question[0].Qtype).String()) r.logger.Trace("received request", "question", req.Question[0].Name, "type", dns.Type(req.Question[0].Qtype).String())
@ -176,6 +160,45 @@ func (r *Router) handleRequestRecursively(req *dns.Msg, reqCtx Context,
return createServerFailureResponse(req, configCtx, false) return createServerFailureResponse(req, configCtx, false)
} }
defer func(s time.Time, q dns.Question) {
metrics.MeasureSinceWithLabels([]string{"dns", "query"}, s,
[]metrics.Label{
{Name: "node", Value: r.nodeName},
{Name: "type", Value: dns.Type(q.Qtype).String()},
})
r.logger.Debug("request served from client",
"name", q.Name,
"type", dns.Type(q.Qtype).String(),
"class", dns.Class(q.Qclass).String(),
"latency", time.Since(s).String(),
"client", remoteAddress.String(),
"client_network", remoteAddress.Network(),
)
}(time.Now(), req.Question[0])
return r.handleRequestRecursively(req, reqCtx, configCtx, remoteAddress, maxRecursionLevelDefault)
}
// getErrorFromECSNotGlobalError returns the underlying error from an ECSNotGlobalError, if it exists.
//
// Any error that does not match discovery.ErrECSNotGlobal is returned
// unchanged. The ECSNotGlobalError is located with errors.As rather than a
// bare type assertion, so an error that matches the sentinel without being a
// discovery.ECSNotGlobalError at the top level (for example because it was
// wrapped) is returned as-is instead of causing a panic.
func getErrorFromECSNotGlobalError(err error) error {
	if !errors.Is(err, discovery.ErrECSNotGlobal) {
		return err
	}

	var ecsErr discovery.ECSNotGlobalError
	if errors.As(err, &ecsErr) {
		return ecsErr.Unwrap()
	}
	// Matches the sentinel but carries no ECSNotGlobalError to unwrap.
	return err
}
// handleRequestRecursively is used to process an individual DNS request. It will recurse as needed
// a maximum number of times and returns a message in success or fail cases.
func (r *Router) handleRequestRecursively(req *dns.Msg, reqCtx Context, configCtx *RouterDynamicConfig,
remoteAddress net.Addr, maxRecursionLevel int) *dns.Msg {
r.logger.Trace(
"received request",
"question", req.Question[0].Name,
"type", dns.Type(req.Question[0].Qtype).String(),
"recursion_remaining", maxRecursionLevel)
responseDomain, needRecurse := r.parseDomain(req.Question[0].Name) responseDomain, needRecurse := r.parseDomain(req.Question[0].Name)
if needRecurse && !canRecurse(configCtx) { if needRecurse && !canRecurse(configCtx) {
// This is the same error as an unmatched domain // This is the same error as an unmatched domain
@ -655,7 +678,7 @@ func (r *Router) defaultAgentDNSRequestContext() Context {
} }
// resolveCNAME is used to recursively resolve CNAME records // resolveCNAME is used to recursively resolve CNAME records
func (r *Router) resolveCNAME(cfg *RouterDynamicConfig, name string, reqCtx Context, func (r *Router) resolveCNAME(cfgContext *RouterDynamicConfig, name string, reqCtx Context,
remoteAddress net.Addr, maxRecursionLevel int) []dns.RR { remoteAddress net.Addr, maxRecursionLevel int) []dns.RR {
// If the CNAME record points to a Consul address, resolve it internally // If the CNAME record points to a Consul address, resolve it internally
// Convert query to lowercase because DNS is case-insensitive; d.domain and // Convert query to lowercase because DNS is case-insensitive; d.domain and
@ -670,13 +693,13 @@ func (r *Router) resolveCNAME(cfg *RouterDynamicConfig, name string, reqCtx Cont
req.SetQuestion(name, dns.TypeANY) req.SetQuestion(name, dns.TypeANY)
// TODO: handle error response // TODO: handle error response
resp := r.handleRequestRecursively(req, reqCtx, nil, maxRecursionLevel-1) resp := r.handleRequestRecursively(req, reqCtx, cfgContext, nil, maxRecursionLevel-1)
return resp.Answer return resp.Answer
} }
// Do nothing if we don't have a recursor // Do nothing if we don't have a recursor
if !canRecurse(cfg) { if !canRecurse(cfgContext) {
return nil return nil
} }
@ -685,7 +708,7 @@ func (r *Router) resolveCNAME(cfg *RouterDynamicConfig, name string, reqCtx Cont
m.SetQuestion(name, dns.TypeA) m.SetQuestion(name, dns.TypeA)
// Make a DNS lookup request // Make a DNS lookup request
recursorResponse, err := r.recursor.handle(m, cfg, remoteAddress) recursorResponse, err := r.recursor.handle(m, cfgContext, remoteAddress)
if err == nil { if err == nil {
return recursorResponse.Answer return recursorResponse.Answer
} }

View File

@ -6,12 +6,13 @@ package dns
import ( import (
"errors" "errors"
"fmt" "fmt"
"github.com/hashicorp/consul/internal/dnsutil"
"net" "net"
"reflect" "reflect"
"testing" "testing"
"time" "time"
"github.com/hashicorp/consul/internal/dnsutil"
"github.com/miekg/dns" "github.com/miekg/dns"
"github.com/stretchr/testify/mock" "github.com/stretchr/testify/mock"
"github.com/stretchr/testify/require" "github.com/stretchr/testify/require"
@ -2743,7 +2744,15 @@ func runHandleTestCases(t *testing.T, tc HandleTestCase) {
if ctx == nil { if ctx == nil {
ctx = &Context{} ctx = &Context{}
} }
actual := router.HandleRequest(tc.request, *ctx, tc.remoteAddress)
var remoteAddress net.Addr
if tc.remoteAddress != nil {
remoteAddress = tc.remoteAddress
} else {
remoteAddress = &net.UDPAddr{}
}
actual := router.HandleRequest(tc.request, *ctx, remoteAddress)
require.Equal(t, tc.response, actual) require.Equal(t, tc.response, actual)
} }

View File

@ -28,6 +28,7 @@ import (
"github.com/hashicorp/consul/agent/consul/stream" "github.com/hashicorp/consul/agent/consul/stream"
"github.com/hashicorp/consul/agent/consul/usagemetrics" "github.com/hashicorp/consul/agent/consul/usagemetrics"
"github.com/hashicorp/consul/agent/consul/xdscapacity" "github.com/hashicorp/consul/agent/consul/xdscapacity"
"github.com/hashicorp/consul/agent/discovery"
"github.com/hashicorp/consul/agent/grpc-external/limiter" "github.com/hashicorp/consul/agent/grpc-external/limiter"
grpcInt "github.com/hashicorp/consul/agent/grpc-internal" grpcInt "github.com/hashicorp/consul/agent/grpc-internal"
"github.com/hashicorp/consul/agent/grpc-internal/balancer" "github.com/hashicorp/consul/agent/grpc-internal/balancer"
@ -434,6 +435,7 @@ func getPrometheusDefs(cfg *config.RuntimeConfig, isServer bool) ([]prometheus.G
consul.CatalogCounters, consul.CatalogCounters,
consul.ClientCounters, consul.ClientCounters,
consul.RPCCounters, consul.RPCCounters,
discovery.DNSCounters,
grpcWare.StatsCounters, grpcWare.StatsCounters,
local.StateCounters, local.StateCounters,
xds.StatsCounters, xds.StatsCounters,