prometheus/discovery/dns/dns.go

388 lines
12 KiB
Go

// Copyright 2016 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package dns
import (
"context"
"errors"
"fmt"
"log/slog"
"net"
"strconv"
"strings"
"sync"
"time"
"github.com/miekg/dns"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/common/model"
"github.com/prometheus/common/promslog"
"github.com/prometheus/prometheus/discovery"
"github.com/prometheus/prometheus/discovery/refresh"
"github.com/prometheus/prometheus/discovery/targetgroup"
)
const (
resolvConf = "/etc/resolv.conf"
dnsNameLabel = model.MetaLabelPrefix + "dns_name"
dnsSrvRecordPrefix = model.MetaLabelPrefix + "dns_srv_record_"
dnsSrvRecordTargetLabel = dnsSrvRecordPrefix + "target"
dnsSrvRecordPortLabel = dnsSrvRecordPrefix + "port"
dnsMxRecordPrefix = model.MetaLabelPrefix + "dns_mx_record_"
dnsMxRecordTargetLabel = dnsMxRecordPrefix + "target"
dnsNsRecordPrefix = model.MetaLabelPrefix + "dns_ns_record_"
dnsNsRecordTargetLabel = dnsNsRecordPrefix + "target"
// Constants for instrumentation.
namespace = "prometheus"
)
// DefaultSDConfig is the default DNS SD configuration.
var DefaultSDConfig = SDConfig{
RefreshInterval: model.Duration(30 * time.Second),
Type: "SRV",
}
func init() {
discovery.RegisterConfig(&SDConfig{})
}
// SDConfig is the configuration for DNS based service discovery.
type SDConfig struct {
Names []string `yaml:"names"`
RefreshInterval model.Duration `yaml:"refresh_interval,omitempty"`
Type string `yaml:"type"`
Port int `yaml:"port"` // Ignored for SRV records
}
// NewDiscovererMetrics implements discovery.Config.
func (*SDConfig) NewDiscovererMetrics(reg prometheus.Registerer, rmi discovery.RefreshMetricsInstantiator) discovery.DiscovererMetrics {
return newDiscovererMetrics(reg, rmi)
}
// Name returns the name of the Config.
func (*SDConfig) Name() string { return "dns" }
// NewDiscoverer returns a Discoverer for the Config.
func (c *SDConfig) NewDiscoverer(opts discovery.DiscovererOptions) (discovery.Discoverer, error) {
return NewDiscovery(*c, opts.Logger, opts.Metrics)
}
// UnmarshalYAML implements the yaml.Unmarshaler interface.
func (c *SDConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
*c = DefaultSDConfig
type plain SDConfig
err := unmarshal((*plain)(c))
if err != nil {
return err
}
if len(c.Names) == 0 {
return errors.New("DNS-SD config must contain at least one SRV record name")
}
switch strings.ToUpper(c.Type) {
case "SRV":
case "A", "AAAA", "MX", "NS":
if c.Port == 0 {
return errors.New("a port is required in DNS-SD configs for all record types except SRV")
}
default:
return fmt.Errorf("invalid DNS-SD records type %s", c.Type)
}
return nil
}
// Discovery periodically performs DNS-SD requests. It implements
// the Discoverer interface.
type Discovery struct {
*refresh.Discovery
names []string
port int
qtype uint16
logger *slog.Logger
metrics *dnsMetrics
lookupFn func(name string, qtype uint16, logger *slog.Logger) (*dns.Msg, error)
}
// NewDiscovery returns a new Discovery which periodically refreshes its targets.
func NewDiscovery(conf SDConfig, logger *slog.Logger, metrics discovery.DiscovererMetrics) (*Discovery, error) {
m, ok := metrics.(*dnsMetrics)
if !ok {
return nil, errors.New("invalid discovery metrics type")
}
if logger == nil {
logger = promslog.NewNopLogger()
}
qtype := dns.TypeSRV
switch strings.ToUpper(conf.Type) {
case "A":
qtype = dns.TypeA
case "AAAA":
qtype = dns.TypeAAAA
case "SRV":
qtype = dns.TypeSRV
case "MX":
qtype = dns.TypeMX
case "NS":
qtype = dns.TypeNS
}
d := &Discovery{
names: conf.Names,
qtype: qtype,
port: conf.Port,
logger: logger,
lookupFn: lookupWithSearchPath,
metrics: m,
}
d.Discovery = refresh.NewDiscovery(
refresh.Options{
Logger: logger,
Mech: "dns",
Interval: time.Duration(conf.RefreshInterval),
RefreshF: d.refresh,
MetricsInstantiator: m.refreshMetrics,
},
)
return d, nil
}
func (d *Discovery) refresh(ctx context.Context) ([]*targetgroup.Group, error) {
var (
wg sync.WaitGroup
ch = make(chan *targetgroup.Group)
tgs = make([]*targetgroup.Group, 0, len(d.names))
)
wg.Add(len(d.names))
for _, name := range d.names {
go func(n string) {
if err := d.refreshOne(ctx, n, ch); err != nil && !errors.Is(err, context.Canceled) {
d.logger.Error("Error refreshing DNS targets", "err", err)
}
wg.Done()
}(name)
}
go func() {
wg.Wait()
close(ch)
}()
for tg := range ch {
tgs = append(tgs, tg)
}
return tgs, nil
}
func (d *Discovery) refreshOne(ctx context.Context, name string, ch chan<- *targetgroup.Group) error {
response, err := d.lookupFn(name, d.qtype, d.logger)
d.metrics.dnsSDLookupsCount.Inc()
if err != nil {
d.metrics.dnsSDLookupFailuresCount.Inc()
return err
}
tg := &targetgroup.Group{}
hostPort := func(a string, p int) model.LabelValue {
return model.LabelValue(net.JoinHostPort(a, strconv.Itoa(p)))
}
for _, record := range response.Answer {
var target, dnsSrvRecordTarget, dnsSrvRecordPort, dnsMxRecordTarget, dnsNsRecordTarget model.LabelValue
switch addr := record.(type) {
case *dns.SRV:
dnsSrvRecordTarget = model.LabelValue(addr.Target)
dnsSrvRecordPort = model.LabelValue(strconv.Itoa(int(addr.Port)))
// Remove the final dot from rooted DNS names to make them look more usual.
addr.Target = strings.TrimRight(addr.Target, ".")
target = hostPort(addr.Target, int(addr.Port))
case *dns.MX:
dnsMxRecordTarget = model.LabelValue(addr.Mx)
// Remove the final dot from rooted DNS names to make them look more usual.
addr.Mx = strings.TrimRight(addr.Mx, ".")
target = hostPort(addr.Mx, d.port)
case *dns.NS:
dnsNsRecordTarget = model.LabelValue(addr.Ns)
// Remove the final dot from rooted DNS names to make them look more usual.
addr.Ns = strings.TrimRight(addr.Ns, ".")
target = hostPort(addr.Ns, d.port)
case *dns.A:
target = hostPort(addr.A.String(), d.port)
case *dns.AAAA:
target = hostPort(addr.AAAA.String(), d.port)
case *dns.CNAME:
// CNAME responses can occur with "Type: A" dns_sd_config requests.
continue
default:
d.logger.Warn("Invalid record", "record", record)
continue
}
tg.Targets = append(tg.Targets, model.LabelSet{
model.AddressLabel: target,
dnsNameLabel: model.LabelValue(name),
dnsSrvRecordTargetLabel: dnsSrvRecordTarget,
dnsSrvRecordPortLabel: dnsSrvRecordPort,
dnsMxRecordTargetLabel: dnsMxRecordTarget,
dnsNsRecordTargetLabel: dnsNsRecordTarget,
})
}
tg.Source = name
select {
case <-ctx.Done():
return ctx.Err()
case ch <- tg:
}
return nil
}
// lookupWithSearchPath tries to get an answer for various permutations of
// the given name, appending the system-configured search path as necessary.
//
// There are three possible outcomes:
//
// 1. One of the permutations of the given name is recognized as
// "valid" by the DNS, in which case we consider ourselves "done"
// and that answer is returned. Note that, due to the way the DNS
// handles "name has resource records, but none of the specified type",
// the answer received may have an empty set of results.
//
// 2. All of the permutations of the given name are responded to by one of
// the servers in the "nameservers" list with the answer "that name does
// not exist" (NXDOMAIN). In that case, it can be considered
// pseudo-authoritative that there are no records for that name.
//
// 3. One or more of the names was responded to by all servers with some
// sort of error indication. In that case, we can't know if, in fact,
// there are records for the name or not, so whatever state the
// configuration is in, we should keep it that way until we know for
// sure (by, presumably, all the names getting answers in the future).
//
// Outcomes 1 and 2 are indicated by a valid response message (possibly an
// empty one) and no error. Outcome 3 is indicated by an error return. The
// error will be generic-looking, because trying to return all the errors
// returned by the combination of all name permutations and servers is a
// nightmare.
func lookupWithSearchPath(name string, qtype uint16, logger *slog.Logger) (*dns.Msg, error) {
conf, err := dns.ClientConfigFromFile(resolvConf)
if err != nil {
return nil, fmt.Errorf("could not load resolv.conf: %w", err)
}
allResponsesValid := true
for _, lname := range conf.NameList(name) {
response, err := lookupFromAnyServer(lname, qtype, conf, logger)
switch {
case err != nil:
// We can't go home yet, because a later name
// may give us a valid, successful answer. However
// we can no longer say "this name definitely doesn't
// exist", because we did not get that answer for
// at least one name.
allResponsesValid = false
case response.Rcode == dns.RcodeSuccess:
// Outcome 1: GOLD!
return response, nil
}
}
if allResponsesValid {
// Outcome 2: everyone says NXDOMAIN, that's good enough for me.
return &dns.Msg{}, nil
}
// Outcome 3: boned.
return nil, fmt.Errorf("could not resolve %q: all servers responded with errors to at least one search domain", name)
}
// lookupFromAnyServer uses all configured servers to try and resolve a specific
// name. If a viable answer is received from a server, then it is
// immediately returned, otherwise the other servers in the config are
// tried, and if none of them return a viable answer, an error is returned.
//
// A "viable answer" is one which indicates either:
//
// 1. "yes, I know that name, and here are its records of the requested type"
// (RCODE==SUCCESS, ANCOUNT > 0);
// 2. "yes, I know that name, but it has no records of the requested type"
// (RCODE==SUCCESS, ANCOUNT==0); or
// 3. "I know that name doesn't exist" (RCODE==NXDOMAIN).
//
// A non-viable answer is "anything else", which encompasses both various
// system-level problems (like network timeouts) and also
// valid-but-unexpected DNS responses (SERVFAIL, REFUSED, etc).
func lookupFromAnyServer(name string, qtype uint16, conf *dns.ClientConfig, logger *slog.Logger) (*dns.Msg, error) {
client := &dns.Client{}
for _, server := range conf.Servers {
servAddr := net.JoinHostPort(server, conf.Port)
msg, err := askServerForName(name, qtype, client, servAddr, true)
if err != nil {
logger.Warn("DNS resolution failed", "server", server, "name", name, "err", err)
continue
}
if msg.Rcode == dns.RcodeSuccess || msg.Rcode == dns.RcodeNameError {
// We have our answer. Time to go home.
return msg, nil
}
}
return nil, fmt.Errorf("could not resolve %s: no servers returned a viable answer", name)
}
// askServerForName makes a request to a specific DNS server for a specific
// name (and qtype). Retries with TCP in the event of response truncation,
// but otherwise just sends back whatever the server gave, whether that be a
// valid-looking response, or an error.
func askServerForName(name string, queryType uint16, client *dns.Client, servAddr string, edns bool) (*dns.Msg, error) {
msg := &dns.Msg{}
msg.SetQuestion(dns.Fqdn(name), queryType)
if edns {
msg.SetEdns0(dns.DefaultMsgSize, false)
}
response, _, err := client.Exchange(msg, servAddr)
if err != nil {
return nil, err
}
if response.Truncated {
if client.Net == "tcp" {
return nil, errors.New("got truncated message on TCP (64kiB limit exceeded?)")
}
client.Net = "tcp"
return askServerForName(name, queryType, client, servAddr, false)
}
return response, nil
}