diff --git a/command/agent/command.go b/command/agent/command.go index 9ca39af856..606649846a 100644 --- a/command/agent/command.go +++ b/command/agent/command.go @@ -227,8 +227,8 @@ func (c *Command) setupAgent(config *Config, logOutput io.Writer, logWriter *log return err } - server, err := NewDNSServer(agent, logOutput, config.Domain, - dnsAddr.String(), config.DNSRecursor) + server, err := NewDNSServer(agent, &config.DNSConfig, logOutput, + config.Domain, dnsAddr.String(), config.DNSRecursor) if err != nil { agent.Shutdown() c.Ui.Error(fmt.Sprintf("Error starting dns server: %s", err)) diff --git a/command/agent/config.go b/command/agent/config.go index 739c2f4fd9..4d038df892 100644 --- a/command/agent/config.go +++ b/command/agent/config.go @@ -27,6 +27,35 @@ type PortConfig struct { Server int // Server internal RPC } +// DNSConfig is used to fine tune the DNS sub-system. +// It can be used to control cache values, and stale +// reads +type DNSConfig struct { + // NodeTTL provides the TTL value for a node query + NodeTTL time.Duration `mapstructure:"-"` + NodeTTLRaw string `mapstructure:"node_ttl" json:"-"` + + // ServiceTTL provides the TTL value for a service + // query for given service. The "*" wildcard can be used + // to set a default for all services. + ServiceTTL map[string]time.Duration `mapstructure:"-"` + ServiceTTLRaw map[string]string `mapstructure:"service_ttl" json:"-"` + + // AllowStale is used to enable lookups with stale + // data. This gives horizontal read scalability since + // any Consul server can service the query instead of + // only the leader. + AllowStale bool `mapstructure:"allow_stale"` + + // MaxStale is used to bound how stale of a result is + // accepted for a DNS lookup. This can be used with + // AllowStale to limit how old of a value is served up. + // If the stale result exceeds this, another non-stale + // stale read is performed. 
+ MaxStale time.Duration `mapstructure:"-"` + MaxStaleRaw string `mapstructure:"max_stale" json:"-"` +} + // Config is the configuration that can be set for an Agent. // Some of this is configurable as CLI flags, but most must // be set using a configuration file. @@ -50,6 +79,9 @@ type Config struct { // resolve non-consul domains DNSRecursor string `mapstructure:"recursor"` + // DNS configuration + DNSConfig DNSConfig `mapstructure:"dns_config"` + // Domain is the DNS domain for the records. Defaults to "consul." Domain string `mapstructure:"domain"` @@ -185,6 +217,9 @@ func DefaultConfig() *Config { SerfWan: consul.DefaultWANSerfPort, Server: 8300, }, + DNSConfig: DNSConfig{ + MaxStale: 5 * time.Second, + }, Protocol: consul.ProtocolVersionMax, AEInterval: time.Minute, } @@ -244,6 +279,36 @@ func DecodeConfig(r io.Reader) (*Config, error) { return nil, err } + // Handle time conversions + if raw := result.DNSConfig.NodeTTLRaw; raw != "" { + dur, err := time.ParseDuration(raw) + if err != nil { + return nil, fmt.Errorf("NodeTTL invalid: %v", err) + } + result.DNSConfig.NodeTTL = dur + } + + if raw := result.DNSConfig.MaxStaleRaw; raw != "" { + dur, err := time.ParseDuration(raw) + if err != nil { + return nil, fmt.Errorf("MaxStale invalid: %v", err) + } + result.DNSConfig.MaxStale = dur + } + + if len(result.DNSConfig.ServiceTTLRaw) != 0 { + if result.DNSConfig.ServiceTTL == nil { + result.DNSConfig.ServiceTTL = make(map[string]time.Duration) + } + for service, raw := range result.DNSConfig.ServiceTTLRaw { + dur, err := time.ParseDuration(raw) + if err != nil { + return nil, fmt.Errorf("ServiceTTL %s invalid: %v", service, err) + } + result.DNSConfig.ServiceTTL[service] = dur + } + } + return &result, nil } @@ -454,6 +519,23 @@ func MergeConfig(a, b *Config) *Config { if b.RejoinAfterLeave { result.RejoinAfterLeave = true } + if b.DNSConfig.NodeTTL != 0 { + result.DNSConfig.NodeTTL = b.DNSConfig.NodeTTL + } + if len(b.DNSConfig.ServiceTTL) != 0 { + if 
result.DNSConfig.ServiceTTL == nil { + result.DNSConfig.ServiceTTL = make(map[string]time.Duration) + } + for service, dur := range b.DNSConfig.ServiceTTL { + result.DNSConfig.ServiceTTL[service] = dur + } + } + if b.DNSConfig.AllowStale { + result.DNSConfig.AllowStale = true + } + if b.DNSConfig.MaxStale != 0 { + result.DNSConfig.MaxStale = b.DNSConfig.MaxStale + } // Copy the start join addresses result.StartJoin = make([]string, 0, len(a.StartJoin)+len(b.StartJoin)) diff --git a/command/agent/config_test.go b/command/agent/config_test.go index 288e6fd196..767b17cf53 100644 --- a/command/agent/config_test.go +++ b/command/agent/config_test.go @@ -290,6 +290,40 @@ func TestDecodeConfig(t *testing.T) { if !config.RejoinAfterLeave { t.Fatalf("bad: %#v", config) } + + // DNS node ttl, max stale + input = `{"dns_config": {"node_ttl": "5s", "max_stale": "15s", "allow_stale": true}}` + config, err = DecodeConfig(bytes.NewReader([]byte(input))) + if err != nil { + t.Fatalf("err: %s", err) + } + + if config.DNSConfig.NodeTTL != 5*time.Second { + t.Fatalf("bad: %#v", config) + } + if config.DNSConfig.MaxStale != 15*time.Second { + t.Fatalf("bad: %#v", config) + } + if !config.DNSConfig.AllowStale { + t.Fatalf("bad: %#v", config) + } + + // DNS service ttl + input = `{"dns_config": {"service_ttl": {"*": "1s", "api": "10s", "web": "30s"}}}` + config, err = DecodeConfig(bytes.NewReader([]byte(input))) + if err != nil { + t.Fatalf("err: %s", err) + } + + if config.DNSConfig.ServiceTTL["*"] != time.Second { + t.Fatalf("bad: %#v", config) + } + if config.DNSConfig.ServiceTTL["api"] != 10*time.Second { + t.Fatalf("bad: %#v", config) + } + if config.DNSConfig.ServiceTTL["web"] != 30*time.Second { + t.Fatalf("bad: %#v", config) + } } func TestDecodeConfig_Service(t *testing.T) { @@ -391,10 +425,18 @@ func TestMergeConfig(t *testing.T) { } b := &Config{ - Bootstrap: true, - Datacenter: "dc2", - DataDir: "/tmp/bar", - DNSRecursor: "127.0.0.2:1001", + Bootstrap: true, + Datacenter: 
"dc2", + DataDir: "/tmp/bar", + DNSRecursor: "127.0.0.2:1001", + DNSConfig: DNSConfig{ + NodeTTL: 10 * time.Second, + ServiceTTL: map[string]time.Duration{ + "api": 10 * time.Second, + }, + AllowStale: true, + MaxStale: 30 * time.Second, + }, Domain: "other", LogLevel: "info", NodeName: "baz", diff --git a/command/agent/dns.go b/command/agent/dns.go index f0b1ed9914..697f314fcb 100644 --- a/command/agent/dns.go +++ b/command/agent/dns.go @@ -23,6 +23,7 @@ const ( // service discovery endpoints using a DNS interface. type DNSServer struct { agent *Agent + config *DNSConfig dnsHandler *dns.ServeMux dnsServer *dns.Server dnsServerTCP *dns.Server @@ -32,7 +33,7 @@ type DNSServer struct { } // NewDNSServer starts a new DNS server to provide an agent interface -func NewDNSServer(agent *Agent, logOutput io.Writer, domain, bind, recursor string) (*DNSServer, error) { +func NewDNSServer(agent *Agent, config *DNSConfig, logOutput io.Writer, domain, bind, recursor string) (*DNSServer, error) { // Make sure domain is FQDN domain = dns.Fqdn(domain) @@ -55,6 +56,7 @@ func NewDNSServer(agent *Agent, logOutput io.Writer, domain, bind, recursor stri // Create the server srv := &DNSServer{ agent: agent, + config: config, dnsHandler: mux, dnsServer: server, dnsServerTCP: serverTCP, @@ -306,16 +308,25 @@ func (d *DNSServer) nodeLookup(network, datacenter, node string, req, resp *dns. 
// Make an RPC request args := structs.NodeSpecificRequest{ - Datacenter: datacenter, - Node: node, + Datacenter: datacenter, + Node: node, + QueryOptions: structs.QueryOptions{AllowStale: d.config.AllowStale}, } var out structs.IndexedNodeServices +RPC: if err := d.agent.RPC("Catalog.NodeServices", &args, &out); err != nil { d.logger.Printf("[ERR] dns: rpc error: %v", err) resp.SetRcode(req, dns.RcodeServerFailure) return } + // Verify that request is not too stale, redo the request + if args.AllowStale && out.LastContact > d.config.MaxStale { + args.AllowStale = false + d.logger.Printf("[WARN] dns: Query results too stale, re-requesting") + goto RPC + } + // If we have no address, return not found! if out.NodeServices == nil { resp.SetRcode(req, dns.RcodeNameError) @@ -323,14 +334,15 @@ func (d *DNSServer) nodeLookup(network, datacenter, node string, req, resp *dns. } // Add the node record - records := d.formatNodeRecord(&out.NodeServices.Node, req.Question[0].Name, qType) + records := d.formatNodeRecord(&out.NodeServices.Node, req.Question[0].Name, + qType, d.config.NodeTTL) if records != nil { resp.Answer = append(resp.Answer, records...) 
} } // formatNodeRecord takes a Node and returns an A, AAAA, or CNAME record -func (d *DNSServer) formatNodeRecord(node *structs.Node, qName string, qType uint16) (records []dns.RR) { +func (d *DNSServer) formatNodeRecord(node *structs.Node, qName string, qType uint16, ttl time.Duration) (records []dns.RR) { // Parse the IP ip := net.ParseIP(node.Address) var ipv4 net.IP @@ -344,7 +356,7 @@ func (d *DNSServer) formatNodeRecord(node *structs.Node, qName string, qType uin Name: qName, Rrtype: dns.TypeA, Class: dns.ClassINET, - Ttl: 0, + Ttl: uint32(ttl / time.Second), }, A: ip, }} @@ -355,7 +367,7 @@ func (d *DNSServer) formatNodeRecord(node *structs.Node, qName string, qType uin Name: qName, Rrtype: dns.TypeAAAA, Class: dns.ClassINET, - Ttl: 0, + Ttl: uint32(ttl / time.Second), }, AAAA: ip, }} @@ -368,7 +380,7 @@ func (d *DNSServer) formatNodeRecord(node *structs.Node, qName string, qType uin Name: qName, Rrtype: dns.TypeCNAME, Class: dns.ClassINET, - Ttl: 0, + Ttl: uint32(ttl / time.Second), }, Target: dns.Fqdn(node.Address), } @@ -398,24 +410,43 @@ func (d *DNSServer) formatNodeRecord(node *structs.Node, qName string, qType uin func (d *DNSServer) serviceLookup(network, datacenter, service, tag string, req, resp *dns.Msg) { // Make an RPC request args := structs.ServiceSpecificRequest{ - Datacenter: datacenter, - ServiceName: service, - ServiceTag: tag, - TagFilter: tag != "", + Datacenter: datacenter, + ServiceName: service, + ServiceTag: tag, + TagFilter: tag != "", + QueryOptions: structs.QueryOptions{AllowStale: d.config.AllowStale}, } var out structs.IndexedCheckServiceNodes +RPC: if err := d.agent.RPC("Health.ServiceNodes", &args, &out); err != nil { d.logger.Printf("[ERR] dns: rpc error: %v", err) resp.SetRcode(req, dns.RcodeServerFailure) return } + // Verify that request is not too stale, redo the request + if args.AllowStale && out.LastContact > d.config.MaxStale { + args.AllowStale = false + d.logger.Printf("[WARN] dns: Query results too stale, 
re-requesting") + goto RPC + } + // If we have no nodes, return not found! if len(out.Nodes) == 0 { resp.SetRcode(req, dns.RcodeNameError) return } + // Determine the TTL + var ttl time.Duration + if d.config.ServiceTTL != nil { + var ok bool + ttl, ok = d.config.ServiceTTL[service] + if !ok { + ttl = d.config.ServiceTTL["*"] + } + } + // Filter out any service nodes due to health checks out.Nodes = d.filterServiceNodes(out.Nodes) @@ -429,10 +460,10 @@ func (d *DNSServer) serviceLookup(network, datacenter, service, tag string, req, // Add various responses depending on the request qType := req.Question[0].Qtype - d.serviceNodeRecords(out.Nodes, req, resp) + d.serviceNodeRecords(out.Nodes, req, resp, ttl) if qType == dns.TypeSRV { - d.serviceSRVRecords(datacenter, out.Nodes, req, resp) + d.serviceSRVRecords(datacenter, out.Nodes, req, resp, ttl) } } @@ -464,7 +495,7 @@ func shuffleServiceNodes(nodes structs.CheckServiceNodes) { } // serviceNodeRecords is used to add the node records for a service lookup -func (d *DNSServer) serviceNodeRecords(nodes structs.CheckServiceNodes, req, resp *dns.Msg) { +func (d *DNSServer) serviceNodeRecords(nodes structs.CheckServiceNodes, req, resp *dns.Msg, ttl time.Duration) { qName := req.Question[0].Name qType := req.Question[0].Qtype handled := make(map[string]struct{}) @@ -478,7 +509,7 @@ func (d *DNSServer) serviceNodeRecords(nodes structs.CheckServiceNodes, req, res handled[addr] = struct{}{} // Add the node record - records := d.formatNodeRecord(&node.Node, qName, qType) + records := d.formatNodeRecord(&node.Node, qName, qType, ttl) if records != nil { resp.Answer = append(resp.Answer, records...) 
} @@ -486,7 +517,7 @@ func (d *DNSServer) serviceNodeRecords(nodes structs.CheckServiceNodes, req, res } // serviceARecords is used to add the SRV records for a service lookup -func (d *DNSServer) serviceSRVRecords(dc string, nodes structs.CheckServiceNodes, req, resp *dns.Msg) { +func (d *DNSServer) serviceSRVRecords(dc string, nodes structs.CheckServiceNodes, req, resp *dns.Msg, ttl time.Duration) { handled := make(map[string]struct{}) for _, node := range nodes { // Avoid duplicate entries, possible if a node has @@ -503,7 +534,7 @@ func (d *DNSServer) serviceSRVRecords(dc string, nodes structs.CheckServiceNodes Name: req.Question[0].Name, Rrtype: dns.TypeSRV, Class: dns.ClassINET, - Ttl: 0, + Ttl: uint32(ttl / time.Second), }, Priority: 1, Weight: 1, @@ -513,7 +544,7 @@ func (d *DNSServer) serviceSRVRecords(dc string, nodes structs.CheckServiceNodes resp.Answer = append(resp.Answer, srvRec) // Add the extra record - records := d.formatNodeRecord(&node.Node, srvRec.Target, dns.TypeANY) + records := d.formatNodeRecord(&node.Node, srvRec.Target, dns.TypeANY, ttl) if records != nil { resp.Extra = append(resp.Extra, records...) 
} diff --git a/command/agent/dns_test.go b/command/agent/dns_test.go index d4add63548..3a2804a4c5 100644 --- a/command/agent/dns_test.go +++ b/command/agent/dns_test.go @@ -8,14 +8,20 @@ import ( "os" "strings" "testing" + "time" ) func makeDNSServer(t *testing.T) (string, *DNSServer) { + config := &DNSConfig{} + return makeDNSServerConfig(t, config) +} + +func makeDNSServerConfig(t *testing.T, config *DNSConfig) (string, *DNSServer) { conf := nextConfig() addr, _ := conf.ClientListener(conf.Ports.DNS) dir, agent := makeAgent(t, conf) - server, err := NewDNSServer(agent, agent.logOutput, conf.Domain, - addr.String(), "8.8.8.8:53") + server, err := NewDNSServer(agent, config, agent.logOutput, + conf.Domain, addr.String(), "8.8.8.8:53") if err != nil { t.Fatalf("err: %v", err) } @@ -100,6 +106,9 @@ func TestDNS_NodeLookup(t *testing.T) { if aRec.A.String() != "127.0.0.1" { t.Fatalf("Bad: %#v", in.Answer[0]) } + if aRec.Hdr.Ttl != 0 { + t.Fatalf("Bad: %#v", in.Answer[0]) + } // Re-do the query, but specify the DC m = new(dns.Msg) @@ -122,6 +131,9 @@ func TestDNS_NodeLookup(t *testing.T) { if aRec.A.String() != "127.0.0.1" { t.Fatalf("Bad: %#v", in.Answer[0]) } + if aRec.Hdr.Ttl != 0 { + t.Fatalf("Bad: %#v", in.Answer[0]) + } } func TestDNS_NodeLookup_PeriodName(t *testing.T) { @@ -206,6 +218,9 @@ func TestDNS_NodeLookup_AAAA(t *testing.T) { if aRec.AAAA.String() != "::4242:4242" { t.Fatalf("Bad: %#v", in.Answer[0]) } + if aRec.Hdr.Ttl != 0 { + t.Fatalf("Bad: %#v", in.Answer[0]) + } } func TestDNS_NodeLookup_CNAME(t *testing.T) { @@ -249,6 +264,9 @@ func TestDNS_NodeLookup_CNAME(t *testing.T) { if cnRec.Target != "www.google.com." { t.Fatalf("Bad: %#v", in.Answer[0]) } + if cnRec.Hdr.Ttl != 0 { + t.Fatalf("Bad: %#v", in.Answer[0]) + } } func TestDNS_ServiceLookup(t *testing.T) { @@ -299,6 +317,9 @@ func TestDNS_ServiceLookup(t *testing.T) { if srvRec.Target != "foo.node.dc1.consul." 
{ t.Fatalf("Bad: %#v", srvRec) } + if srvRec.Hdr.Ttl != 0 { + t.Fatalf("Bad: %#v", in.Answer[0]) + } aRec, ok := in.Extra[0].(*dns.A) if !ok { @@ -310,6 +331,9 @@ func TestDNS_ServiceLookup(t *testing.T) { if aRec.A.String() != "127.0.0.1" { t.Fatalf("Bad: %#v", in.Extra[0]) } + if aRec.Hdr.Ttl != 0 { + t.Fatalf("Bad: %#v", in.Extra[0]) + } } func TestDNS_ServiceLookup_TagPeriod(t *testing.T) { @@ -760,3 +784,226 @@ func TestDNS_ServiceLookup_CNAME(t *testing.T) { } } } + +func TestDNS_NodeLookup_TTL(t *testing.T) { + config := &DNSConfig{ + NodeTTL: 10 * time.Second, + AllowStale: true, + MaxStale: time.Second, + } + + dir, srv := makeDNSServerConfig(t, config) + defer os.RemoveAll(dir) + defer srv.agent.Shutdown() + + testutil.WaitForLeader(t, srv.agent.RPC, "dc1") + + // Register node + args := &structs.RegisterRequest{ + Datacenter: "dc1", + Node: "foo", + Address: "127.0.0.1", + } + + var out struct{} + if err := srv.agent.RPC("Catalog.Register", args, &out); err != nil { + t.Fatalf("err: %v", err) + } + + m := new(dns.Msg) + m.SetQuestion("foo.node.consul.", dns.TypeANY) + + c := new(dns.Client) + addr, _ := srv.agent.config.ClientListener(srv.agent.config.Ports.DNS) + in, _, err := c.Exchange(m, addr.String()) + if err != nil { + t.Fatalf("err: %v", err) + } + + if len(in.Answer) != 1 { + t.Fatalf("Bad: %#v", in) + } + + aRec, ok := in.Answer[0].(*dns.A) + if !ok { + t.Fatalf("Bad: %#v", in.Answer[0]) + } + if aRec.A.String() != "127.0.0.1" { + t.Fatalf("Bad: %#v", in.Answer[0]) + } + if aRec.Hdr.Ttl != 10 { + t.Fatalf("Bad: %#v", in.Answer[0]) + } + + // Register node with IPv6 + args = &structs.RegisterRequest{ + Datacenter: "dc1", + Node: "bar", + Address: "::4242:4242", + } + if err := srv.agent.RPC("Catalog.Register", args, &out); err != nil { + t.Fatalf("err: %v", err) + } + + // Check an IPv6 record + m = new(dns.Msg) + m.SetQuestion("bar.node.consul.", dns.TypeANY) + + in, _, err = c.Exchange(m, addr.String()) + if err != nil { + t.Fatalf("err: %v", 
err) + } + + if len(in.Answer) != 1 { + t.Fatalf("Bad: %#v", in) + } + + aaaaRec, ok := in.Answer[0].(*dns.AAAA) + if !ok { + t.Fatalf("Bad: %#v", in.Answer[0]) + } + if aaaaRec.AAAA.String() != "::4242:4242" { + t.Fatalf("Bad: %#v", in.Answer[0]) + } + if aaaaRec.Hdr.Ttl != 10 { + t.Fatalf("Bad: %#v", in.Answer[0]) + } + + // Register node with CNAME + args = &structs.RegisterRequest{ + Datacenter: "dc1", + Node: "google", + Address: "www.google.com", + } + if err := srv.agent.RPC("Catalog.Register", args, &out); err != nil { + t.Fatalf("err: %v", err) + } + + m = new(dns.Msg) + m.SetQuestion("google.node.consul.", dns.TypeANY) + + in, _, err = c.Exchange(m, addr.String()) + if err != nil { + t.Fatalf("err: %v", err) + } + + // Should have the CNAME record + a few A records + if len(in.Answer) < 2 { + t.Fatalf("Bad: %#v", in) + } + + cnRec, ok := in.Answer[0].(*dns.CNAME) + if !ok { + t.Fatalf("Bad: %#v", in.Answer[0]) + } + if cnRec.Target != "www.google.com." { + t.Fatalf("Bad: %#v", in.Answer[0]) + } + if cnRec.Hdr.Ttl != 10 { + t.Fatalf("Bad: %#v", in.Answer[0]) + } +} + +func TestDNS_ServiceLookup_TTL(t *testing.T) { + config := &DNSConfig{ + ServiceTTL: map[string]time.Duration{ + "db": 10 * time.Second, + "*": 5 * time.Second, + }, + AllowStale: true, + MaxStale: time.Second, + } + + dir, srv := makeDNSServerConfig(t, config) + defer os.RemoveAll(dir) + defer srv.agent.Shutdown() + + testutil.WaitForLeader(t, srv.agent.RPC, "dc1") + + // Register node with 2 services + args := &structs.RegisterRequest{ + Datacenter: "dc1", + Node: "foo", + Address: "127.0.0.1", + Service: &structs.NodeService{ + Service: "db", + Tags: []string{"master"}, + Port: 12345, + }, + } + + var out struct{} + if err := srv.agent.RPC("Catalog.Register", args, &out); err != nil { + t.Fatalf("err: %v", err) + } + + args = &structs.RegisterRequest{ + Datacenter: "dc1", + Node: "foo", + Address: "127.0.0.1", + Service: &structs.NodeService{ + Service: "api", + Port: 2222, + }, + } + if 
err := srv.agent.RPC("Catalog.Register", args, &out); err != nil { + t.Fatalf("err: %v", err) + } + + m := new(dns.Msg) + m.SetQuestion("db.service.consul.", dns.TypeSRV) + + c := new(dns.Client) + addr, _ := srv.agent.config.ClientListener(srv.agent.config.Ports.DNS) + in, _, err := c.Exchange(m, addr.String()) + if err != nil { + t.Fatalf("err: %v", err) + } + + if len(in.Answer) != 1 { + t.Fatalf("Bad: %#v", in) + } + + srvRec, ok := in.Answer[0].(*dns.SRV) + if !ok { + t.Fatalf("Bad: %#v", in.Answer[0]) + } + if srvRec.Hdr.Ttl != 10 { + t.Fatalf("Bad: %#v", in.Answer[0]) + } + + aRec, ok := in.Extra[0].(*dns.A) + if !ok { + t.Fatalf("Bad: %#v", in.Extra[0]) + } + if aRec.Hdr.Ttl != 10 { + t.Fatalf("Bad: %#v", in.Extra[0]) + } + + m = new(dns.Msg) + m.SetQuestion("api.service.consul.", dns.TypeSRV) + in, _, err = c.Exchange(m, addr.String()) + if err != nil { + t.Fatalf("err: %v", err) + } + + if len(in.Answer) != 1 { + t.Fatalf("Bad: %#v", in) + } + + srvRec, ok = in.Answer[0].(*dns.SRV) + if !ok { + t.Fatalf("Bad: %#v", in.Answer[0]) + } + if srvRec.Hdr.Ttl != 5 { + t.Fatalf("Bad: %#v", in.Answer[0]) + } + + aRec, ok = in.Extra[0].(*dns.A) + if !ok { + t.Fatalf("Bad: %#v", in.Extra[0]) + } + if aRec.Hdr.Ttl != 5 { + t.Fatalf("Bad: %#v", in.Extra[0]) + } +} diff --git a/website/source/docs/agent/dns.html.markdown b/website/source/docs/agent/dns.html.markdown index fe0b1fa6ea..46303a60d6 100644 --- a/website/source/docs/agent/dns.html.markdown +++ b/website/source/docs/agent/dns.html.markdown @@ -19,7 +19,7 @@ with no failing health checks. It's that simple! There are a number of [configuration options](/docs/agent/options.html) that are important for the DNS interface. They are `client_addr`, `ports.dns`, `recursor`, -and `domain`. By default Consul will listen on 127.0.0.1:8600 for DNS queries +`domain`, and `dns_config`. By default Consul will listen on 127.0.0.1:8600 for DNS queries in the "consul." domain, without support for DNS recursion. 
There are a few ways to use the DNS interface. One option is to use a custom @@ -118,3 +118,10 @@ without setting the truncate bit. This is to prevent a redundant lookup over TCP which generate additional load. If the lookup is done over TCP, the results are not truncated. +## Caching + +By default, all DNS results served by Consul set a 0 TTL value. This disables +caching of DNS results. However, there are many situations in which caching is +desirable for performance and scalability. This is discussed more in the guide +for [DNS Caching](/docs/guides/dns-cache.html). + diff --git a/website/source/docs/agent/options.html.markdown b/website/source/docs/agent/options.html.markdown index b185a1f458..bc95aea789 100644 --- a/website/source/docs/agent/options.html.markdown +++ b/website/source/docs/agent/options.html.markdown @@ -183,6 +183,30 @@ definitions support being updated during a reload. This flag can be used to change that domain. All queries in this domain are assumed to be handled by Consul, and will not be recursively resolved. +* `dns_config` - This object allows a number of sub-keys to be set which can tune + how DNS queries are performed. See this guide on [DNS caching](/docs/guides/dns-cache.html). + The following sub-keys are available: + + * `node_ttl` - By default, this is "0s", which means all node lookups are served with + a 0 TTL value. This can be set to allow node lookups to set a TTL value, which enables + DNS caching. This should be specified with the "s" suffix for second, or "m" for minute. + + * `service_ttl` - This is a sub-object, which allows for setting a TTL on service lookups + with a per-service policy. The "*" wildcard service can be specified and is used when + there is no specific policy available for a service. By default, all services are served + with a 0 TTL value. Setting this enables DNS caching. + + * `allow_stale` - Enables a stale query for DNS information. 
This allows any Consul + server to service the request, instead of only the leader. The advantage of this is + you get linear read scalability with Consul servers. By default, this is false, meaning + all requests are serviced by the leader. This provides stronger consistency but + with less throughput and higher latency. + + * `max_stale` - When `allow_stale` is specified, this is used to limit how + stale of a result will be used. By default, this is set to "5s", which means + if a Consul server is more than 5 seconds behind the leader, the query will be + re-evaluated on the leader to get more up-to-date results. + * `enable_debug` - When set, enables some additional debugging features. Currently, only used to set the runtime profiling HTTP endpoints. @@ -201,12 +225,12 @@ definitions support being updated during a reload. * `ports` - This is a nested object that allows setting the bind ports for the following keys: - * dns - The DNS server, -1 to disable. Default 8600. - * http - The HTTP api, -1 to disable. Default 8500. - * rpc - The RPC endpoint. Default 8400. - * serf_lan - The Serf LAN port. Default 8301. - * serf_wan - The Serf WAN port. Default 8302. - * server - Server RPC address. Default 8300. + * `dns` - The DNS server, -1 to disable. Default 8600. + * `http` - The HTTP api, -1 to disable. Default 8500. + * `rpc` - The RPC endpoint. Default 8400. + * `serf_lan` - The Serf LAN port. Default 8301. + * `serf_wan` - The Serf WAN port. Default 8302. + * `server` - Server RPC address. Default 8300. * `recursor` - This flag provides an address of an upstream DNS server that is used to recursively resolve queries if they are not inside the service domain for consul. 
For example, diff --git a/website/source/docs/guides/dns-cache.html.markdown b/website/source/docs/guides/dns-cache.html.markdown new file mode 100644 index 0000000000..37d79fc476 --- /dev/null +++ b/website/source/docs/guides/dns-cache.html.markdown @@ -0,0 +1,74 @@ +--- +layout: "docs" +page_title: "DNS Caching" +sidebar_current: "docs-guides-dns-cache" +--- + +# DNS Caching + +One of the main interfaces to Consul is DNS. Using DNS is a simple way to +integrate Consul into an existing infrastructure without any high-touch +integration. + +By default, Consul serves all DNS results with a 0 TTL value. This prevents +any caching. The advantage of this is that each DNS lookup is always re-evaluated +and the most timely information is served. However this adds a latency hit +for each lookup and can potentially exhaust the query throughput of a cluster. + +For this reason, Consul provides a number of tuning parameters that can +be used to customize how DNS queries are handled. + +## Stale Reads + +Stale reads can be used to reduce latency and increase the throughput +of DNS queries. By default, all reads are serviced by a [single leader node](/docs/internals/consensus.html). +These reads are strongly consistent but are limited by the throughput +of a single node. Doing a stale read allows any Consul server to +service a query, but non-leader nodes may return data that is potentially +out-of-date. By allowing data to be slightly stale, we get horizontal +read scalability. Now any Consul server can service the request, so we +increase throughput by the number of servers in a cluster. + +The [settings](/docs/agent/options.html) used to control stale reads +are `dns_config.allow_stale` which must be set to enable stale reads, +and `dns_config.max_stale` which limits how stale results are allowed to +be. + +By default, `allow_stale` is disabled meaning no stale results may be served. +The default for `max_stale` is 5 seconds. 
This means that if `allow_stale` is +enabled, we will use data from any Consul server that is within 5 seconds +of the leader. + +## TTL Values + +TTL values can be set to allow DNS results to be cached upstream +of Consul which can reduce the number of lookups and amortize +the latency of doing a DNS lookup. By default, all TTLs are zero, +preventing any caching. + +To enable caching of node lookups (e.g. "foo.node.consul"), we can set +the `dns_config.node_ttl` value. This can be set to "10s" for example, +and all node lookups will serve results with a 10 second TTL. + +Service TTLs can be specified at a more fine grain level. You can set +a TTL on a per-service level, and additionally a wildcard can be specified +that matches if there is no specific service TTL provided. + +This is specified using the `dns_config.service_ttl` map. The "*" service +is the wildcard service. For example, if we specify: + +``` + { + "dns_config": { + "service_ttl": { + "*": "5s", + "web": "30s" + } + } + } +``` + +This sets all lookups to "web.service.consul" to use a 30 second TTL, +while lookups to "db.service.consul" or "api.service.consul" will use the +5 second TTL from the wildcard. + diff --git a/website/source/layouts/docs.erb b/website/source/layouts/docs.erb index 3d6c848083..7af4ba6a01 100644 --- a/website/source/layouts/docs.erb +++ b/website/source/layouts/docs.erb @@ -136,6 +136,10 @@ Bootstrapping +