From 613dd7d053ecbe339a9b5d54052ff26ce250f39c Mon Sep 17 00:00:00 2001 From: "R.B. Boyer" <4903+rboyer@users.noreply.github.com> Date: Tue, 17 Aug 2021 16:49:26 -0500 Subject: [PATCH 01/28] state: adjust streaming event generation to account for partitioned nodes (#10860) Also re-enabled some tests that had to be disabled in the prior PR. --- agent/consul/state/catalog.go | 2 + agent/consul/state/catalog_events.go | 92 +++++++++++++++----- agent/consul/state/catalog_events_oss.go | 23 +++++ agent/consul/state/catalog_events_test.go | 17 ++-- agent/consul/state/catalog_test.go | 88 +++++++++++-------- agent/consul/state/state_store_test.go | 5 +- agent/consul/state/store_integration_test.go | 14 ++- agent/consul/stream/event.go | 24 ++--- agent/consul/stream/event_publisher.go | 2 +- agent/consul/stream/event_publisher_test.go | 2 +- agent/consul/stream/event_test.go | 10 ++- agent/consul/stream/subscription.go | 5 +- 12 files changed, 202 insertions(+), 82 deletions(-) create mode 100644 agent/consul/state/catalog_events_oss.go diff --git a/agent/consul/state/catalog.go b/agent/consul/state/catalog.go index c26d10f603..eefdb82e31 100644 --- a/agent/consul/state/catalog.go +++ b/agent/consul/state/catalog.go @@ -202,6 +202,8 @@ func (s *Store) EnsureNode(idx uint64, node *structs.Node) error { func ensureNoNodeWithSimilarNameTxn(tx ReadTxn, node *structs.Node, allowClashWithoutID bool) error { // Retrieve all of the nodes + // TODO(partitions): since the node UUID field is not partitioned, do we have to do something additional here? + enodes, err := tx.Get(tableNodes, indexID+"_prefix", node.GetEnterpriseMeta()) if err != nil { return fmt.Errorf("Cannot lookup all nodes: %s", err) diff --git a/agent/consul/state/catalog_events.go b/agent/consul/state/catalog_events.go index 97be4638bc..c0c2fecde4 100644 --- a/agent/consul/state/catalog_events.go +++ b/agent/consul/state/catalog_events.go @@ -25,14 +25,15 @@ type EventPayloadCheckServiceNode struct { // when the change event is for a sidecar or gateway. 
overrideKey string overrideNamespace string + overridePartition string } func (e EventPayloadCheckServiceNode) HasReadPermission(authz acl.Authorizer) bool { return e.Value.CanRead(authz) == acl.Allow } -func (e EventPayloadCheckServiceNode) MatchesKey(key, namespace string) bool { - if key == "" && namespace == "" { +func (e EventPayloadCheckServiceNode) MatchesKey(key, namespace, partition string) bool { + if key == "" && namespace == "" && partition == "" { return true } @@ -48,8 +49,14 @@ func (e EventPayloadCheckServiceNode) MatchesKey(key, namespace string) bool { if e.overrideNamespace != "" { ns = e.overrideNamespace } + ap := e.Value.Service.EnterpriseMeta.PartitionOrDefault() + if e.overridePartition != "" { + ap = e.overridePartition + } + return (key == "" || strings.EqualFold(key, name)) && - (namespace == "" || strings.EqualFold(namespace, ns)) + (namespace == "" || strings.EqualFold(namespace, ns)) && + (partition == "" || strings.EqualFold(partition, ap)) } // serviceHealthSnapshot returns a stream.SnapshotFunc that provides a snapshot @@ -60,7 +67,7 @@ func serviceHealthSnapshot(db ReadDB, topic stream.Topic) stream.SnapshotFunc { defer tx.Abort() connect := topic == topicServiceHealthConnect - entMeta := structs.NewEnterpriseMetaInDefaultPartition(req.Namespace) + entMeta := structs.NewEnterpriseMetaWithPartition(req.Partition, req.Namespace) idx, nodes, err := checkServiceNodesTxn(tx, nil, req.Key, connect, &entMeta) if err != nil { return 0, err @@ -123,6 +130,11 @@ type serviceChange struct { change memdb.Change } +type nodeTuple struct { + Node string + Partition string +} + var serviceChangeIndirect = serviceChange{changeType: changeIndirect} // ServiceHealthEventsFromChanges returns all the service and Connect health @@ -130,13 +142,13 @@ var serviceChangeIndirect = serviceChange{changeType: changeIndirect} func ServiceHealthEventsFromChanges(tx ReadTxn, changes Changes) ([]stream.Event, error) { var events []stream.Event - var nodeChanges map[string]changeType + var nodeChanges map[nodeTuple]changeType var serviceChanges map[nodeServiceTuple]serviceChange var termGatewayChanges map[structs.ServiceName]map[structs.ServiceName]serviceChange - markNode := func(node string, typ changeType) { + markNode := func(node nodeTuple, typ changeType) { if nodeChanges == nil { - nodeChanges = make(map[string]changeType) + nodeChanges = make(map[nodeTuple]changeType) } // If the caller has an actual node mutation ensure we store it even if the // node is already marked. If the caller is just marking the node dirty @@ -161,14 +173,15 @@ func ServiceHealthEventsFromChanges(tx ReadTxn, changes Changes) ([]stream.Event for _, change := range changes.Changes { switch change.Table { - case "nodes": + case tableNodes: // Node changed in some way, if it's not a delete, we'll need to // re-deliver CheckServiceNode results for all services on that node but // we mark it anyway because if it _is_ a delete then we need to know that // later to avoid trying to deliver events when node level checks mark the // node as "changed". 
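For illustration, the partition check added to MatchesKey above follows the same rule as the existing key and namespace checks: an empty filter component is a wildcard, and non-empty components are compared case-insensitively. A minimal standalone sketch of that rule (a hypothetical helper, not code from this change; it assumes the standard library "strings" package is imported):

	func matchesSubscription(filterKey, filterNS, filterPartition, name, ns, ap string) bool {
		// Empty filter components act as wildcards; otherwise the comparison
		// is case-insensitive, mirroring EventPayloadCheckServiceNode.MatchesKey.
		return (filterKey == "" || strings.EqualFold(filterKey, name)) &&
			(filterNS == "" || strings.EqualFold(filterNS, ns)) &&
			(filterPartition == "" || strings.EqualFold(filterPartition, ap))
	}
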
n := changeObject(change).(*structs.Node) - markNode(n.Node, changeTypeFromChange(change)) + tuple := newNodeTupleFromNode(n) + markNode(tuple, changeTypeFromChange(change)) case tableServices: sn := changeObject(change).(*structs.ServiceNode) @@ -187,7 +200,8 @@ func ServiceHealthEventsFromChanges(tx ReadTxn, changes Changes) ([]stream.Event after := change.After.(*structs.HealthCheck) if after.ServiceID == "" || before.ServiceID == "" { // check before and/or after is node-scoped - markNode(after.Node, changeIndirect) + nt := newNodeTupleFromHealthCheck(after) + markNode(nt, changeIndirect) } else { // Check changed which means we just need to emit for the linked // service. @@ -206,7 +220,8 @@ func ServiceHealthEventsFromChanges(tx ReadTxn, changes Changes) ([]stream.Event obj := changeObject(change).(*structs.HealthCheck) if obj.ServiceID == "" { // Node level check - markNode(obj.Node, changeIndirect) + nt := newNodeTupleFromHealthCheck(obj) + markNode(nt, changeIndirect) } else { markService(newNodeServiceTupleFromServiceHealthCheck(obj), serviceChangeIndirect) } @@ -250,7 +265,8 @@ func ServiceHealthEventsFromChanges(tx ReadTxn, changes Changes) ([]stream.Event continue } // Rebuild events for all services on this node - es, err := newServiceHealthEventsForNode(tx, changes.Index, node) + es, err := newServiceHealthEventsForNode(tx, changes.Index, node.Node, + structs.WildcardEnterpriseMetaInPartition(node.Partition)) if err != nil { return nil, err } @@ -286,7 +302,7 @@ func ServiceHealthEventsFromChanges(tx ReadTxn, changes Changes) ([]stream.Event } } - if _, ok := nodeChanges[tuple.Node]; ok { + if _, ok := nodeChanges[tuple.nodeTuple()]; ok { // We already rebuilt events for everything on this node, no need to send // a duplicate. continue @@ -303,7 +319,10 @@ func ServiceHealthEventsFromChanges(tx ReadTxn, changes Changes) ([]stream.Event for serviceName, gsChange := range serviceChanges { gs := changeObject(gsChange.change).(*structs.GatewayService) - q := Query{Value: gs.Gateway.Name, EnterpriseMeta: gatewayName.EnterpriseMeta} + q := Query{ + Value: gs.Gateway.Name, + EnterpriseMeta: gatewayName.EnterpriseMeta, + } _, nodes, err := serviceNodesTxn(tx, nil, indexService, q) if err != nil { return nil, err @@ -320,6 +339,9 @@ func ServiceHealthEventsFromChanges(tx ReadTxn, changes Changes) ([]stream.Event if gatewayName.EnterpriseMeta.NamespaceOrDefault() != serviceName.EnterpriseMeta.NamespaceOrDefault() { payload.overrideNamespace = serviceName.EnterpriseMeta.NamespaceOrDefault() } + if gatewayName.EnterpriseMeta.PartitionOrDefault() != serviceName.EnterpriseMeta.PartitionOrDefault() { + payload.overridePartition = serviceName.EnterpriseMeta.PartitionOrDefault() + } e.Payload = payload events = append(events, e) @@ -344,6 +366,9 @@ func ServiceHealthEventsFromChanges(tx ReadTxn, changes Changes) ([]stream.Event if gatewayName.EnterpriseMeta.NamespaceOrDefault() != serviceName.EnterpriseMeta.NamespaceOrDefault() { payload.overrideNamespace = serviceName.EnterpriseMeta.NamespaceOrDefault() } + if gatewayName.EnterpriseMeta.PartitionOrDefault() != serviceName.EnterpriseMeta.PartitionOrDefault() { + payload.overridePartition = serviceName.EnterpriseMeta.PartitionOrDefault() + } e.Payload = payload events = append(events, e) @@ -480,6 +505,9 @@ func copyEventForService(event stream.Event, service structs.ServiceName) stream if payload.Value.Service.EnterpriseMeta.NamespaceOrDefault() != service.EnterpriseMeta.NamespaceOrDefault() { payload.overrideNamespace = 
service.EnterpriseMeta.NamespaceOrDefault() } + if payload.Value.Service.EnterpriseMeta.PartitionOrDefault() != service.EnterpriseMeta.PartitionOrDefault() { + payload.overridePartition = service.EnterpriseMeta.PartitionOrDefault() + } event.Payload = payload return event @@ -497,13 +525,16 @@ func getPayloadCheckServiceNode(payload stream.Payload) *structs.CheckServiceNod // given node. This mirrors some of the the logic in the oddly-named // parseCheckServiceNodes but is more efficient since we know they are all on // the same node. -func newServiceHealthEventsForNode(tx ReadTxn, idx uint64, node string) ([]stream.Event, error) { - services, err := tx.Get(tableServices, indexNode, Query{Value: node}) +func newServiceHealthEventsForNode(tx ReadTxn, idx uint64, node string, entMeta *structs.EnterpriseMeta) ([]stream.Event, error) { + services, err := tx.Get(tableServices, indexNode, Query{ + Value: node, + EnterpriseMeta: *entMeta, + }) if err != nil { return nil, err } - n, checksFunc, err := getNodeAndChecks(tx, node) + n, checksFunc, err := getNodeAndChecks(tx, node, entMeta) if err != nil { return nil, err } @@ -521,9 +552,12 @@ func newServiceHealthEventsForNode(tx ReadTxn, idx uint64, node string) ([]strea // getNodeAndNodeChecks returns a the node structure and a function that returns // the full list of checks for a specific service on that node. -func getNodeAndChecks(tx ReadTxn, node string) (*structs.Node, serviceChecksFunc, error) { +func getNodeAndChecks(tx ReadTxn, node string, entMeta *structs.EnterpriseMeta) (*structs.Node, serviceChecksFunc, error) { // Fetch the node - nodeRaw, err := tx.First(tableNodes, indexID, Query{Value: node}) + nodeRaw, err := tx.First(tableNodes, indexID, Query{ + Value: node, + EnterpriseMeta: *entMeta, + }) if err != nil { return nil, nil, err } @@ -532,7 +566,10 @@ func getNodeAndChecks(tx ReadTxn, node string) (*structs.Node, serviceChecksFunc } n := nodeRaw.(*structs.Node) - iter, err := tx.Get(tableChecks, indexNode, Query{Value: node}) + iter, err := tx.Get(tableChecks, indexNode, Query{ + Value: node, + EnterpriseMeta: *entMeta, + }) if err != nil { return nil, nil, err } @@ -566,12 +603,16 @@ func getNodeAndChecks(tx ReadTxn, node string) (*structs.Node, serviceChecksFunc type serviceChecksFunc func(serviceID string) structs.HealthChecks func newServiceHealthEventForService(tx ReadTxn, idx uint64, tuple nodeServiceTuple) (stream.Event, error) { - n, checksFunc, err := getNodeAndChecks(tx, tuple.Node) + n, checksFunc, err := getNodeAndChecks(tx, tuple.Node, &tuple.EntMeta) if err != nil { return stream.Event{}, err } - svc, err := tx.Get(tableServices, indexID, NodeServiceQuery{EnterpriseMeta: tuple.EntMeta, Node: tuple.Node, Service: tuple.ServiceID}) + svc, err := tx.Get(tableServices, indexID, NodeServiceQuery{ + EnterpriseMeta: tuple.EntMeta, + Node: tuple.Node, + Service: tuple.ServiceID, + }) if err != nil { return stream.Event{}, err } @@ -615,9 +656,14 @@ func newServiceHealthEventDeregister(idx uint64, sn *structs.ServiceNode) stream // This is also important because if the service was deleted as part of a // whole node deregistering then the node record won't actually exist now // anyway and we'd have to plumb it through from the changeset above. 
+ + entMeta := sn.EnterpriseMeta + entMeta.Normalize() + csn := &structs.CheckServiceNode{ Node: &structs.Node{ - Node: sn.Node, + Node: sn.Node, + Partition: entMeta.PartitionOrEmpty(), }, Service: sn.ToNodeService(), } diff --git a/agent/consul/state/catalog_events_oss.go b/agent/consul/state/catalog_events_oss.go new file mode 100644 index 0000000000..cf5231dc9a --- /dev/null +++ b/agent/consul/state/catalog_events_oss.go @@ -0,0 +1,23 @@ +// +build !consulent + +package state + +import "github.com/hashicorp/consul/agent/structs" + +func (nst nodeServiceTuple) nodeTuple() nodeTuple { + return nodeTuple{Node: nst.Node, Partition: ""} +} + +func newNodeTupleFromNode(node *structs.Node) nodeTuple { + return nodeTuple{ + Node: node.Node, + Partition: "", + } +} + +func newNodeTupleFromHealthCheck(hc *structs.HealthCheck) nodeTuple { + return nodeTuple{ + Node: hc.Node, + Partition: "", + } +} diff --git a/agent/consul/state/catalog_events_test.go b/agent/consul/state/catalog_events_test.go index 558f63e427..277dec11c3 100644 --- a/agent/consul/state/catalog_events_test.go +++ b/agent/consul/state/catalog_events_test.go @@ -1605,9 +1605,9 @@ func (tc eventsTestCase) run(t *testing.T) { assertDeepEqual(t, tc.WantEvents, got, cmpPartialOrderEvents, cmpopts.EquateEmpty()) } -func runCase(t *testing.T, name string, fn func(t *testing.T)) { +func runCase(t *testing.T, name string, fn func(t *testing.T)) bool { t.Helper() - t.Run(name, func(t *testing.T) { + return t.Run(name, func(t *testing.T) { t.Helper() t.Log("case:", name) fn(t) @@ -1680,7 +1680,11 @@ var cmpPartialOrderEvents = cmp.Options{ if payload.overrideNamespace != "" { ns = payload.overrideNamespace } - return fmt.Sprintf("%s/%s/%s/%s", e.Topic, csn.Node.Node, ns, name) + ap := csn.Service.EnterpriseMeta.PartitionOrDefault() + if payload.overridePartition != "" { + ap = payload.overridePartition + } + return fmt.Sprintf("%s/%s/%s/%s/%s", e.Topic, ap, csn.Node.Node, ns, name) } return key(i) < key(j) }), @@ -2172,6 +2176,7 @@ func newTestEventServiceHealthRegister(index uint64, nodeNum int, svc string) st Node: node, Address: addr, Datacenter: "dc1", + Partition: structs.NodeEnterpriseMetaInDefaultPartition().PartitionOrEmpty(), RaftIndex: structs.RaftIndex{ CreateIndex: index, ModifyIndex: index, @@ -2238,7 +2243,8 @@ func newTestEventServiceHealthDeregister(index uint64, nodeNum int, svc string) Op: pbsubscribe.CatalogOp_Deregister, Value: &structs.CheckServiceNode{ Node: &structs.Node{ - Node: fmt.Sprintf("node%d", nodeNum), + Node: fmt.Sprintf("node%d", nodeNum), + Partition: structs.NodeEnterpriseMetaInDefaultPartition().PartitionOrEmpty(), }, Service: &structs.NodeService{ ID: svc, @@ -2270,6 +2276,7 @@ func TestEventPayloadCheckServiceNode_FilterByKey(t *testing.T) { payload EventPayloadCheckServiceNode key string namespace string + partition string // TODO(partitions): create test cases for this being set expected bool } @@ -2278,7 +2285,7 @@ func TestEventPayloadCheckServiceNode_FilterByKey(t *testing.T) { t.Skip("cant test namespace matching without namespace support") } - require.Equal(t, tc.expected, tc.payload.MatchesKey(tc.key, tc.namespace)) + require.Equal(t, tc.expected, tc.payload.MatchesKey(tc.key, tc.namespace, tc.partition)) } var testCases = []testCase{ diff --git a/agent/consul/state/catalog_test.go b/agent/consul/state/catalog_test.go index d307ef56ba..c4b09aeae3 100644 --- a/agent/consul/state/catalog_test.go +++ b/agent/consul/state/catalog_test.go @@ -28,6 +28,7 @@ func makeRandomNodeID(t *testing.T) 
types.NodeID { func TestStateStore_GetNodeID(t *testing.T) { s := testStateStore(t) + _, out, err := s.GetNodeID(types.NodeID("wrongId")) if err == nil || out != nil || !strings.Contains(err.Error(), "node lookup by ID failed, wrong UUID") { t.Fatalf("want an error, nil value, err:=%q ; out:=%q", err.Error(), out) @@ -53,30 +54,53 @@ func TestStateStore_GetNodeID(t *testing.T) { Node: "node1", Address: "1.2.3.4", } - if err := s.EnsureRegistration(1, req); err != nil { - t.Fatalf("err: %s", err) - } + require.NoError(t, s.EnsureRegistration(1, req)) _, out, err = s.GetNodeID(nodeID) - if err != nil { - t.Fatalf("got err %s want nil", err) - } + require.NoError(t, err) if out == nil || out.ID != nodeID { t.Fatalf("out should not be nil and contain nodeId, but was:=%#v", out) } + // Case insensitive lookup should work as well _, out, err = s.GetNodeID(types.NodeID("00a916bC-a357-4a19-b886-59419fceeAAA")) - if err != nil { - t.Fatalf("got err %s want nil", err) - } + require.NoError(t, err) if out == nil || out.ID != nodeID { t.Fatalf("out should not be nil and contain nodeId, but was:=%#v", out) } } +func TestStateStore_GetNode(t *testing.T) { + s := testStateStore(t) + + // initially does not exist + idx, out, err := s.GetNode("node1", nil) + require.NoError(t, err) + require.Nil(t, out) + require.Equal(t, uint64(0), idx) + + // Create it + testRegisterNode(t, s, 1, "node1") + + // now exists + idx, out, err = s.GetNode("node1", nil) + require.NoError(t, err) + require.NotNil(t, out) + require.Equal(t, uint64(1), idx) + require.Equal(t, "node1", out.Node) + + // Case insensitive lookup should work as well + idx, out, err = s.GetNode("NoDe1", nil) + require.NoError(t, err) + require.NotNil(t, out) + require.Equal(t, uint64(1), idx) + require.Equal(t, "node1", out.Node) +} + func TestStateStore_ensureNoNodeWithSimilarNameTxn(t *testing.T) { t.Parallel() s := testStateStore(t) + nodeID := makeRandomNodeID(t) req := &structs.RegisterRequest{ ID: nodeID, @@ -90,9 +114,7 @@ func TestStateStore_ensureNoNodeWithSimilarNameTxn(t *testing.T) { Status: api.HealthPassing, }, } - if err := s.EnsureRegistration(1, req); err != nil { - t.Fatalf("err: %s", err) - } + require.NoError(t, s.EnsureRegistration(1, req)) req = &structs.RegisterRequest{ ID: types.NodeID(""), Node: "node2", @@ -103,31 +125,29 @@ func TestStateStore_ensureNoNodeWithSimilarNameTxn(t *testing.T) { Status: api.HealthPassing, }, } - if err := s.EnsureRegistration(2, req); err != nil { - t.Fatalf("err: %s", err) - } + require.NoError(t, s.EnsureRegistration(2, req)) + tx := s.db.WriteTxnRestore() defer tx.Abort() + node := &structs.Node{ ID: makeRandomNodeID(t), Node: "NOdE1", // Name is similar but case is different Address: "2.3.4.5", } + // Lets conflict with node1 (has an ID) - if err := ensureNoNodeWithSimilarNameTxn(tx, node, false); err == nil { - t.Fatalf("Should return an error since another name with similar name exists") - } - if err := ensureNoNodeWithSimilarNameTxn(tx, node, true); err == nil { - t.Fatalf("Should return an error since another name with similar name exists") - } + require.Error(t, ensureNoNodeWithSimilarNameTxn(tx, node, false), + "Should return an error since another name with similar name exists") + require.Error(t, ensureNoNodeWithSimilarNameTxn(tx, node, true), + "Should return an error since another name with similar name exists") + // Lets conflict with node without ID node.Node = "NoDe2" - if err := ensureNoNodeWithSimilarNameTxn(tx, node, false); err == nil { - t.Fatalf("Should return an error since 
another name with similar name exists") - } - if err := ensureNoNodeWithSimilarNameTxn(tx, node, true); err != nil { - t.Fatalf("Should not clash with another similar node name without ID, err:=%q", err) - } + require.Error(t, ensureNoNodeWithSimilarNameTxn(tx, node, false), + "Should return an error since another name with similar name exists") + require.NoError(t, ensureNoNodeWithSimilarNameTxn(tx, node, true), + "Should not clash with another similar node name without ID") // Set node1's Serf health to failing and replace it. newNode := &structs.Node{ @@ -135,17 +155,15 @@ func TestStateStore_ensureNoNodeWithSimilarNameTxn(t *testing.T) { Node: "node1", Address: "2.3.4.5", } - if err := ensureNoNodeWithSimilarNameTxn(tx, newNode, false); err == nil { - t.Fatalf("Should return an error since the previous node is still healthy") - } - s.ensureCheckTxn(tx, 5, false, &structs.HealthCheck{ + require.Error(t, ensureNoNodeWithSimilarNameTxn(tx, newNode, false), + "Should return an error since the previous node is still healthy") + + require.NoError(t, s.ensureCheckTxn(tx, 5, false, &structs.HealthCheck{ Node: "node1", CheckID: structs.SerfCheckID, Status: api.HealthCritical, - }) - if err := ensureNoNodeWithSimilarNameTxn(tx, newNode, false); err != nil { - t.Fatal(err) - } + })) + require.NoError(t, ensureNoNodeWithSimilarNameTxn(tx, newNode, false)) } func TestStateStore_EnsureRegistration(t *testing.T) { diff --git a/agent/consul/state/state_store_test.go b/agent/consul/state/state_store_test.go index 8e7da5bcab..68e2e08fea 100644 --- a/agent/consul/state/state_store_test.go +++ b/agent/consul/state/state_store_test.go @@ -94,7 +94,10 @@ func testRegisterNodeOpts(t *testing.T, s *Store, idx uint64, nodeID string, opt tx := s.db.Txn(false) defer tx.Abort() - n, err := tx.First(tableNodes, indexID, Query{Value: nodeID}) + n, err := tx.First(tableNodes, indexID, Query{ + Value: nodeID, + EnterpriseMeta: *node.GetEnterpriseMeta(), + }) if err != nil { t.Fatalf("err: %s", err) } diff --git a/agent/consul/state/store_integration_test.go b/agent/consul/state/store_integration_test.go index e31c4158f0..dc6cee8690 100644 --- a/agent/consul/state/store_integration_test.go +++ b/agent/consul/state/store_integration_test.go @@ -422,7 +422,19 @@ type nodePayload struct { node *structs.ServiceNode } -func (p nodePayload) MatchesKey(key, _ string) bool { +func (p nodePayload) MatchesKey(key, _, partition string) bool { + if key == "" && partition == "" { + return true + } + + if p.node == nil { + return false + } + + if structs.PartitionOrDefault(partition) != p.node.PartitionOrDefault() { + return false + } + return p.key == key } diff --git a/agent/consul/stream/event.go b/agent/consul/stream/event.go index 74df46b5e1..285710543e 100644 --- a/agent/consul/stream/event.go +++ b/agent/consul/stream/event.go @@ -26,12 +26,13 @@ type Event struct { // should not modify the state of the payload if the Event is being submitted to // EventPublisher.Publish. type Payload interface { - // MatchesKey must return true if the Payload should be included in a subscription - // requested with the key and namespace. - // Generally this means that the payload matches the key and namespace or - // the payload is a special framing event that should be returned to every - // subscription. - MatchesKey(key, namespace string) bool + // MatchesKey must return true if the Payload should be included in a + // subscription requested with the key, namespace, and partition. 
+ // + // Generally this means that the payload matches the key, namespace, and + // partition or the payload is a special framing event that should be + // returned to every subscription. + MatchesKey(key, namespace, partition string) bool // HasReadPermission uses the acl.Authorizer to determine if the items in the // Payload are visible to the request. It returns true if the payload is @@ -80,10 +81,11 @@ func (p *PayloadEvents) filter(f func(Event) bool) bool { return true } -// MatchesKey filters the PayloadEvents to those which match the key and namespace. -func (p *PayloadEvents) MatchesKey(key, namespace string) bool { +// MatchesKey filters the PayloadEvents to those which match the key, +// namespace, and partition. +func (p *PayloadEvents) MatchesKey(key, namespace, partition string) bool { return p.filter(func(event Event) bool { - return event.Payload.MatchesKey(key, namespace) + return event.Payload.MatchesKey(key, namespace, partition) }) } @@ -115,7 +117,7 @@ func (e Event) IsNewSnapshotToFollow() bool { type framingEvent struct{} -func (framingEvent) MatchesKey(string, string) bool { +func (framingEvent) MatchesKey(string, string, string) bool { return true } @@ -135,7 +137,7 @@ type closeSubscriptionPayload struct { tokensSecretIDs []string } -func (closeSubscriptionPayload) MatchesKey(string, string) bool { +func (closeSubscriptionPayload) MatchesKey(string, string, string) bool { return false } diff --git a/agent/consul/stream/event_publisher.go b/agent/consul/stream/event_publisher.go index bfa3858b96..163fa81094 100644 --- a/agent/consul/stream/event_publisher.go +++ b/agent/consul/stream/event_publisher.go @@ -291,5 +291,5 @@ func (e *EventPublisher) setCachedSnapshotLocked(req *SubscribeRequest, snap *ev } func snapCacheKey(req *SubscribeRequest) string { - return fmt.Sprintf(req.Namespace + "/" + req.Key) + return req.Partition + "/" + req.Namespace + "/" + req.Key } diff --git a/agent/consul/stream/event_publisher_test.go b/agent/consul/stream/event_publisher_test.go index 2967ef8d3d..af7fc3c288 100644 --- a/agent/consul/stream/event_publisher_test.go +++ b/agent/consul/stream/event_publisher_test.go @@ -70,7 +70,7 @@ type simplePayload struct { noReadPerm bool } -func (p simplePayload) MatchesKey(key, _ string) bool { +func (p simplePayload) MatchesKey(key, _, _ string) bool { if key == "" { return true } diff --git a/agent/consul/stream/event_test.go b/agent/consul/stream/event_test.go index 8b36ee8d15..a3187017ca 100644 --- a/agent/consul/stream/event_test.go +++ b/agent/consul/stream/event_test.go @@ -35,7 +35,7 @@ func TestPayloadEvents_FilterByKey(t *testing.T) { events = append(events, tc.events...) 
pe := &PayloadEvents{Items: events} - ok := pe.MatchesKey(tc.req.Key, tc.req.Namespace) + ok := pe.MatchesKey(tc.req.Key, tc.req.Namespace, tc.req.Partition) require.Equal(t, tc.expectEvent, ok) if !tc.expectEvent { return @@ -133,6 +133,7 @@ func TestPayloadEvents_FilterByKey(t *testing.T) { } } +// TODO(partitions) func newNSEvent(key, namespace string) Event { return Event{Index: 22, Payload: nsPayload{key: key, namespace: namespace}} } @@ -141,11 +142,14 @@ type nsPayload struct { framingEvent key string namespace string + partition string value string } -func (p nsPayload) MatchesKey(key, namespace string) bool { - return (key == "" || key == p.key) && (namespace == "" || namespace == p.namespace) +func (p nsPayload) MatchesKey(key, namespace, partition string) bool { + return (key == "" || key == p.key) && + (namespace == "" || namespace == p.namespace) && + (partition == "" || partition == p.partition) } func TestPayloadEvents_HasReadPermission(t *testing.T) { diff --git a/agent/consul/stream/subscription.go b/agent/consul/stream/subscription.go index 03069ea931..9f47cd2ee3 100644 --- a/agent/consul/stream/subscription.go +++ b/agent/consul/stream/subscription.go @@ -62,6 +62,9 @@ type SubscribeRequest struct { // Namespace used to filter events in the topic. Only events matching the // namespace will be returned by the subscription. Namespace string + // Partition used to filter events in the topic. Only events matching the + // partition will be returned by the subscription. + Partition string // TODO(partitions): make this work // Token that was used to authenticate the request. If any ACL policy // changes impact the token the subscription will be forcefully closed. Token string @@ -102,7 +105,7 @@ func (s *Subscription) Next(ctx context.Context) (Event, error) { continue } event := newEventFromBatch(s.req, next.Events) - if !event.Payload.MatchesKey(s.req.Key, s.req.Namespace) { + if !event.Payload.MatchesKey(s.req.Key, s.req.Namespace, s.req.Partition) { continue } return event, nil From 8252a2691c03d38319c0832b5d6da6f29bae1dc7 Mon Sep 17 00:00:00 2001 From: Daniel Nephin Date: Fri, 13 Aug 2021 11:53:19 -0400 Subject: [PATCH 02/28] xds: document how authorization works --- agent/xds/delta.go | 2 +- agent/xds/server.go | 17 +++++++++++++---- contributing/service-mesh/README.md | 9 ++++++--- contributing/service-mesh/xds.md | 25 +++++++++++++++++++++++++ 4 files changed, 45 insertions(+), 8 deletions(-) create mode 100644 contributing/service-mesh/xds.md diff --git a/agent/xds/delta.go b/agent/xds/delta.go index 762ea31c48..b2564dacfc 100644 --- a/agent/xds/delta.go +++ b/agent/xds/delta.go @@ -138,7 +138,7 @@ func (s *Server) processDelta(stream ADSDeltaStream, reqCh <-chan *envoy_discove } checkStreamACLs := func(cfgSnap *proxycfg.ConfigSnapshot) error { - return s.checkStreamACLs(stream.Context(), cfgSnap) + return s.authorize(stream.Context(), cfgSnap) } for { diff --git a/agent/xds/server.go b/agent/xds/server.go index d2e2b23081..a9b4c220b3 100644 --- a/agent/xds/server.go +++ b/agent/xds/server.go @@ -327,7 +327,7 @@ func (s *Server) process(stream ADSStream, reqCh <-chan *envoy_discovery_v3.Disc } checkStreamACLs := func(cfgSnap *proxycfg.ConfigSnapshot) error { - return s.checkStreamACLs(stream.Context(), cfgSnap) + return s.authorize(stream.Context(), cfgSnap) } for { @@ -564,13 +564,22 @@ func NewGRPCServer(s *Server, tlsConfigurator *tlsutil.Configurator) *grpc.Serve return srv } -func (s *Server) checkStreamACLs(streamCtx context.Context, cfgSnap 
*proxycfg.ConfigSnapshot) error { +// authorize the xDS request using the token stored in ctx. This authorization is +// a bit different from most interfaces. Instead of explicitly authorizing or +// filtering each piece of data in the response, the request is authorized +// by checking the token has `service:write` for the service ID of the destination +// service (for kind=ConnectProxy), or the gateway service (for other kinds). +// This authorization strategy requires that agent/proxycfg only fetches data +// using a token with the same permissions, and that it stores the data by +// proxy ID. We assume that any data in the snapshot was already filtered, +// which allows this authorization to be a shallow authorization check +// for all the data in a ConfigSnapshot. +func (s *Server) authorize(ctx context.Context, cfgSnap *proxycfg.ConfigSnapshot) error { if cfgSnap == nil { return status.Errorf(codes.Unauthenticated, "unauthenticated: no config snapshot") } - authz, err := s.ResolveToken(tokenFromContext(streamCtx)) - + authz, err := s.ResolveToken(tokenFromContext(ctx)) if acl.IsErrNotFound(err) { return status.Errorf(codes.Unauthenticated, "unauthenticated: %v", err) } else if acl.IsErrPermissionDenied(err) { diff --git a/contributing/service-mesh/README.md b/contributing/service-mesh/README.md index 3c770a88cc..7bc16f7f77 100644 --- a/contributing/service-mesh/README.md +++ b/contributing/service-mesh/README.md @@ -1,12 +1,15 @@ # Service Mesh (Connect) - call out: envoy/proxy is the data plane, Consul is the control plane -- agent/xds - gRPC service that implements - [xDS](https://www.envoyproxy.io/docs/envoy/latest/api-docs/xds_protocol) -- [agent/proxycfg](https://github.com/hashicorp/consul/blob/master/agent/proxycfg/proxycfg.go) +- [xDS Server] - a gRPC service that implements [xDS] and handles requests from an [envoy proxy]. +- [agent/proxycfg] - CA Manager - certificate authority - command/connect/envoy - bootstrapping and running envoy - command/connect/proxy - built-in proxy that is dev-only and not supported for production. - `connect/` - "Native" service mesh +[xDS Server]: ./xds.md +[xDS]: https://www.envoyproxy.io/docs/envoy/latest/api-docs/xds_protocol +[envoy proxy]: https://www.consul.io/docs/connect/proxies/envoy +[agent/proxycfg]: https://github.com/hashicorp/consul/blob/main/agent/proxycfg diff --git a/contributing/service-mesh/xds.md b/contributing/service-mesh/xds.md new file mode 100644 index 0000000000..73aeefb52b --- /dev/null +++ b/contributing/service-mesh/xds.md @@ -0,0 +1,25 @@ +# xDS Server + +The xDS Server is a gRPC service that implements [xDS] and handles requests from +an [envoy proxy]. + +[xDS]: https://www.envoyproxy.io/docs/envoy/latest/api-docs/xds_protocol +[envoy proxy]: https://www.consul.io/docs/connect/proxies/envoy + + +## Authorization + +Requests to the xDS server are authorized based on an assumption of how +`proxycfg.ConfigSnapshot` are constructed. Most interfaces (HTTP, DNS, RPC) +authorize requests by authorizing the data in the response, or by filtering +out data that the requester is not authorized to view. The xDS server authorizes +requests by looking at the proxy ID in the request and ensuring the ACL token has +`service:write` access to either the destination service (for kind=ConnectProxy), or +the gateway service (for other kinds). + +This authorization strategy requires that [agent/proxycfg] only fetches data using a +token with the same permissions, and that it only stores data by proxy ID. 
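In rough terms the check amounts to the following sketch (the shape of the logic only, not the exact body of `authorize`; the population of `authzContext` is elided here):

	// Pick the service name the token must be able to write to: the
	// destination service for a connect proxy, otherwise the gateway's
	// own service.
	target := cfgSnap.Service
	if cfgSnap.Kind == structs.ServiceKindConnectProxy {
		target = cfgSnap.Proxy.DestinationServiceName
	}
	var authzContext acl.AuthorizerContext
	if authz.ServiceWrite(target, &authzContext) != acl.Allow {
		return status.Error(codes.PermissionDenied, "permission denied")
	}
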
We assume +that any data in the snapshot was already filtered, which allows this authorization to +only perform a shallow check against the proxy ID. + +[agent/proxycfg]: https://github.com/hashicorp/consul/blob/main/agent/proxycfg From e43cf462679b6fdd8b15ac7891747e970029ac4a Mon Sep 17 00:00:00 2001 From: Roopak Venkatakrishnan Date: Wed, 18 Aug 2021 02:56:30 +0000 Subject: [PATCH 03/28] Update x/sys to support go 1.17 --- go.mod | 2 +- go.sum | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/go.mod b/go.mod index 68a28aa550..9c7bb865f3 100644 --- a/go.mod +++ b/go.mod @@ -85,7 +85,7 @@ require ( golang.org/x/net v0.0.0-20210410081132-afb366fc7cd1 golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45 golang.org/x/sync v0.0.0-20210220032951-036812b2e83c - golang.org/x/sys v0.0.0-20210330210617-4fbd30eecc44 + golang.org/x/sys v0.0.0-20210817190340-bfb29a6856f2 golang.org/x/time v0.0.0-20200630173020-3af7569d3a1e google.golang.org/api v0.9.0 // indirect google.golang.org/appengine v1.6.0 // indirect diff --git a/go.sum b/go.sum index c573f6aed4..35862eacef 100644 --- a/go.sum +++ b/go.sum @@ -625,8 +625,9 @@ golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20201024232916-9f70ab9862d5/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210303074136-134d130e1a04/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210330210617-4fbd30eecc44 h1:Bli41pIlzTzf3KEY06n+xnzK/BESIg2ze4Pgfh/aI8c= golang.org/x/sys v0.0.0-20210330210617-4fbd30eecc44/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210817190340-bfb29a6856f2 h1:c8PlLMqBbOHoqtjteWm5/kbe6rNY2pbRfbIMVnepueo= +golang.org/x/sys v0.0.0-20210817190340-bfb29a6856f2/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1 h1:v+OssWQX+hTHEmOBgwxdZxK4zHq3yOs8F9J7mk0PY8E= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/text v0.0.0-20160726164857-2910a502d2bf/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= From e44bce3c4f944dac21d1c328822b08a861db2dd6 Mon Sep 17 00:00:00 2001 From: "R.B. 
Boyer" <4903+rboyer@users.noreply.github.com> Date: Wed, 18 Aug 2021 09:27:15 -0500 Subject: [PATCH 04/28] state: partition the usage metrics subsystem (#10867) --- agent/consul/fsm/snapshot_oss_test.go | 4 +- agent/consul/state/usage.go | 57 +++-- agent/consul/state/usage_oss.go | 12 +- agent/consul/state/usage_oss_test.go | 25 -- agent/consul/state/usage_test.go | 40 +++- agent/consul/usagemetrics/usagemetrics.go | 67 ++---- agent/consul/usagemetrics/usagemetrics_oss.go | 38 +++ .../usagemetrics/usagemetrics_oss_test.go | 223 ++++++++++++++---- .../consul/usagemetrics/usagemetrics_test.go | 118 ++------- 9 files changed, 320 insertions(+), 264 deletions(-) delete mode 100644 agent/consul/state/usage_oss_test.go diff --git a/agent/consul/fsm/snapshot_oss_test.go b/agent/consul/fsm/snapshot_oss_test.go index 4675d87ca1..13d8bb90b9 100644 --- a/agent/consul/fsm/snapshot_oss_test.go +++ b/agent/consul/fsm/snapshot_oss_test.go @@ -692,9 +692,9 @@ func TestFSM_SnapshotRestore_OSS(t *testing.T) { require.Equal(t, fedState2, fedStateLoaded2) // Verify usage data is correctly updated - idx, nodeCount, err := fsm2.state.NodeCount() + idx, nodeUsage, err := fsm2.state.NodeUsage() require.NoError(t, err) - require.Equal(t, len(nodes), nodeCount) + require.Equal(t, len(nodes), nodeUsage.Nodes) require.NotZero(t, idx) // Verify system metadata is restored. diff --git a/agent/consul/state/usage.go b/agent/consul/state/usage.go index 536c49a6ca..5e1509766b 100644 --- a/agent/consul/state/usage.go +++ b/agent/consul/state/usage.go @@ -10,16 +10,18 @@ import ( const ( serviceNamesUsageTable = "service-names" + + tableUsage = "usage" ) // usageTableSchema returns a new table schema used for tracking various indexes // for the Raft log. func usageTableSchema() *memdb.TableSchema { return &memdb.TableSchema{ - Name: "usage", + Name: tableUsage, Indexes: map[string]*memdb.IndexSchema{ - "id": { - Name: "id", + indexID: { + Name: indexID, AllowMissing: false, Unique: true, Indexer: &memdb.StringFieldIndex{ @@ -46,6 +48,12 @@ type ServiceUsage struct { EnterpriseServiceUsage } +// NodeUsage contains all of the usage data related to nodes +type NodeUsage struct { + Nodes int + EnterpriseNodeUsage +} + type uniqueServiceState int const ( @@ -68,8 +76,10 @@ func updateUsage(tx WriteTxn, changes Changes) error { } switch change.Table { - case "nodes": + case tableNodes: usageDeltas[change.Table] += delta + addEnterpriseNodeUsage(usageDeltas, change) + case tableServices: svc := changeObject(change).(*structs.ServiceNode) usageDeltas[change.Table] += delta @@ -98,7 +108,8 @@ func updateUsage(tx WriteTxn, changes Changes) error { // This will happen when restoring from a snapshot, just take the max index // of the tables we are tracking. if idx == 0 { - idx = maxIndexTxn(tx, "nodes", tableServices) + // TODO(partitions? namespaces?) 
+ idx = maxIndexTxn(tx, tableNodes, tableServices) } return writeUsageDeltas(tx, idx, usageDeltas) @@ -107,7 +118,10 @@ func updateUsage(tx WriteTxn, changes Changes) error { func updateServiceNameUsage(tx WriteTxn, usageDeltas map[string]int, serviceNameChanges map[structs.ServiceName]int) (map[structs.ServiceName]uniqueServiceState, error) { serviceStates := make(map[structs.ServiceName]uniqueServiceState, len(serviceNameChanges)) for svc, delta := range serviceNameChanges { - q := Query{Value: svc.Name, EnterpriseMeta: svc.EnterpriseMeta} + q := Query{ + Value: svc.Name, + EnterpriseMeta: svc.EnterpriseMeta, + } serviceIter, err := tx.Get(tableServices, indexService, q) if err != nil { return nil, err @@ -162,7 +176,7 @@ func serviceNameChanged(change memdb.Change) bool { // passed in will be recorded on the entry as well. func writeUsageDeltas(tx WriteTxn, idx uint64, usageDeltas map[string]int) error { for id, delta := range usageDeltas { - u, err := tx.First("usage", "id", id) + u, err := tx.First(tableUsage, indexID, id) if err != nil { return fmt.Errorf("failed to retrieve existing usage entry: %s", err) } @@ -175,7 +189,7 @@ func writeUsageDeltas(tx WriteTxn, idx uint64, usageDeltas map[string]int) error // large numbers. delta = 0 } - err := tx.Insert("usage", &UsageEntry{ + err := tx.Insert(tableUsage, &UsageEntry{ ID: id, Count: delta, Index: idx, @@ -192,7 +206,7 @@ func writeUsageDeltas(tx WriteTxn, idx uint64, usageDeltas map[string]int) error // large numbers. updated = 0 } - err := tx.Insert("usage", &UsageEntry{ + err := tx.Insert(tableUsage, &UsageEntry{ ID: id, Count: updated, Index: idx, @@ -205,17 +219,26 @@ func writeUsageDeltas(tx WriteTxn, idx uint64, usageDeltas map[string]int) error return nil } -// NodeCount returns the latest seen Raft index, a count of the number of nodes -// registered, and any errors. -func (s *Store) NodeCount() (uint64, int, error) { +// NodeUsage returns the latest seen Raft index, a compiled set of node usage +// data, and any errors. 
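Once this lands, reading the compiled entry is a single call; a minimal usage sketch (illustrative only, assuming s is a *Store and "fmt" is imported):

	idx, usage, err := s.NodeUsage()
	if err != nil {
		return err
	}
	// idx is the Raft index at which the usage entry was last updated, and
	// usage.Nodes is the current count of registered nodes.
	fmt.Printf("index=%d nodes=%d\n", idx, usage.Nodes)
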
+func (s *Store) NodeUsage() (uint64, NodeUsage, error) { tx := s.db.ReadTxn() defer tx.Abort() - nodeUsage, err := firstUsageEntry(tx, "nodes") + nodes, err := firstUsageEntry(tx, tableNodes) if err != nil { - return 0, 0, fmt.Errorf("failed nodes lookup: %s", err) + return 0, NodeUsage{}, fmt.Errorf("failed nodes lookup: %s", err) } - return nodeUsage.Index, nodeUsage.Count, nil + + usage := NodeUsage{ + Nodes: nodes.Count, + } + results, err := compileEnterpriseNodeUsage(tx, usage) + if err != nil { + return 0, NodeUsage{}, fmt.Errorf("failed nodes lookup: %s", err) + } + + return nodes.Index, results, nil } // ServiceUsage returns the latest seen Raft index, a compiled set of service @@ -238,7 +261,7 @@ func (s *Store) ServiceUsage() (uint64, ServiceUsage, error) { ServiceInstances: serviceInstances.Count, Services: services.Count, } - results, err := compileEnterpriseUsage(tx, usage) + results, err := compileEnterpriseServiceUsage(tx, usage) if err != nil { return 0, ServiceUsage{}, fmt.Errorf("failed services lookup: %s", err) } @@ -247,7 +270,7 @@ func (s *Store) ServiceUsage() (uint64, ServiceUsage, error) { } func firstUsageEntry(tx ReadTxn, id string) (*UsageEntry, error) { - usage, err := tx.First("usage", "id", id) + usage, err := tx.First(tableUsage, indexID, id) if err != nil { return nil, err } diff --git a/agent/consul/state/usage_oss.go b/agent/consul/state/usage_oss.go index 355f1fbcc4..c45bf74d56 100644 --- a/agent/consul/state/usage_oss.go +++ b/agent/consul/state/usage_oss.go @@ -3,16 +3,24 @@ package state import ( - "github.com/hashicorp/consul/agent/structs" memdb "github.com/hashicorp/go-memdb" + + "github.com/hashicorp/consul/agent/structs" ) type EnterpriseServiceUsage struct{} +type EnterpriseNodeUsage struct{} + +func addEnterpriseNodeUsage(map[string]int, memdb.Change) {} func addEnterpriseServiceInstanceUsage(map[string]int, memdb.Change) {} func addEnterpriseServiceUsage(map[string]int, map[structs.ServiceName]uniqueServiceState) {} -func compileEnterpriseUsage(tx ReadTxn, usage ServiceUsage) (ServiceUsage, error) { +func compileEnterpriseServiceUsage(tx ReadTxn, usage ServiceUsage) (ServiceUsage, error) { + return usage, nil +} + +func compileEnterpriseNodeUsage(tx ReadTxn, usage NodeUsage) (NodeUsage, error) { return usage, nil } diff --git a/agent/consul/state/usage_oss_test.go b/agent/consul/state/usage_oss_test.go deleted file mode 100644 index b441c71635..0000000000 --- a/agent/consul/state/usage_oss_test.go +++ /dev/null @@ -1,25 +0,0 @@ -// +build !consulent - -package state - -import ( - "testing" - - "github.com/stretchr/testify/require" -) - -func TestStateStore_Usage_ServiceUsage(t *testing.T) { - s := testStateStore(t) - - testRegisterNode(t, s, 0, "node1") - testRegisterNode(t, s, 1, "node2") - testRegisterService(t, s, 8, "node1", "service1") - testRegisterService(t, s, 9, "node2", "service1") - testRegisterService(t, s, 10, "node2", "service2") - - idx, usage, err := s.ServiceUsage() - require.NoError(t, err) - require.Equal(t, idx, uint64(10)) - require.Equal(t, 2, usage.Services) - require.Equal(t, 3, usage.ServiceInstances) -} diff --git a/agent/consul/state/usage_test.go b/agent/consul/state/usage_test.go index 1a0e8c13f5..2d650a5910 100644 --- a/agent/consul/state/usage_test.go +++ b/agent/consul/state/usage_test.go @@ -9,40 +9,40 @@ import ( "github.com/hashicorp/consul/agent/structs" ) -func TestStateStore_Usage_NodeCount(t *testing.T) { +func TestStateStore_Usage_NodeUsage(t *testing.T) { s := testStateStore(t) // No nodes have been 
registered, and thus no usage entry exists - idx, count, err := s.NodeCount() + idx, usage, err := s.NodeUsage() require.NoError(t, err) require.Equal(t, idx, uint64(0)) - require.Equal(t, count, 0) + require.Equal(t, usage.Nodes, 0) testRegisterNode(t, s, 0, "node1") testRegisterNode(t, s, 1, "node2") - idx, count, err = s.NodeCount() + idx, usage, err = s.NodeUsage() require.NoError(t, err) require.Equal(t, idx, uint64(1)) - require.Equal(t, count, 2) + require.Equal(t, usage.Nodes, 2) } -func TestStateStore_Usage_NodeCount_Delete(t *testing.T) { +func TestStateStore_Usage_NodeUsage_Delete(t *testing.T) { s := testStateStore(t) testRegisterNode(t, s, 0, "node1") testRegisterNode(t, s, 1, "node2") - idx, count, err := s.NodeCount() + idx, usage, err := s.NodeUsage() require.NoError(t, err) require.Equal(t, idx, uint64(1)) - require.Equal(t, count, 2) + require.Equal(t, usage.Nodes, 2) require.NoError(t, s.DeleteNode(2, "node2", nil)) - idx, count, err = s.NodeCount() + idx, usage, err = s.NodeUsage() require.NoError(t, err) require.Equal(t, idx, uint64(2)) - require.Equal(t, count, 1) + require.Equal(t, usage.Nodes, 1) } func TestStateStore_Usage_ServiceUsageEmpty(t *testing.T) { @@ -56,6 +56,22 @@ func TestStateStore_Usage_ServiceUsageEmpty(t *testing.T) { require.Equal(t, usage.ServiceInstances, 0) } +func TestStateStore_Usage_ServiceUsage(t *testing.T) { + s := testStateStore(t) + + testRegisterNode(t, s, 0, "node1") + testRegisterNode(t, s, 1, "node2") + testRegisterService(t, s, 8, "node1", "service1") + testRegisterService(t, s, 9, "node2", "service1") + testRegisterService(t, s, 10, "node2", "service2") + + idx, usage, err := s.ServiceUsage() + require.NoError(t, err) + require.Equal(t, idx, uint64(10)) + require.Equal(t, 2, usage.Services) + require.Equal(t, 3, usage.ServiceInstances) +} + func TestStateStore_Usage_ServiceUsage_DeleteNode(t *testing.T) { s := testStateStore(t) testRegisterNode(t, s, 1, "node1") @@ -116,10 +132,10 @@ func TestStateStore_Usage_Restore(t *testing.T) { }) require.NoError(t, restore.Commit()) - idx, count, err := s.NodeCount() + idx, nodeUsage, err := s.NodeUsage() require.NoError(t, err) require.Equal(t, idx, uint64(9)) - require.Equal(t, count, 1) + require.Equal(t, nodeUsage.Nodes, 1) idx, usage, err := s.ServiceUsage() require.NoError(t, err) diff --git a/agent/consul/usagemetrics/usagemetrics.go b/agent/consul/usagemetrics/usagemetrics.go index 353e9a45df..9669125c66 100644 --- a/agent/consul/usagemetrics/usagemetrics.go +++ b/agent/consul/usagemetrics/usagemetrics.go @@ -8,10 +8,11 @@ import ( "github.com/armon/go-metrics/prometheus" "github.com/armon/go-metrics" - "github.com/hashicorp/consul/agent/consul/state" - "github.com/hashicorp/consul/logging" "github.com/hashicorp/go-hclog" "github.com/hashicorp/serf/serf" + + "github.com/hashicorp/consul/agent/consul/state" + "github.com/hashicorp/consul/logging" ) var Gauges = []prometheus.GaugeDefinition{ @@ -145,15 +146,13 @@ func (u *UsageMetricsReporter) Run(ctx context.Context) { func (u *UsageMetricsReporter) runOnce() { state := u.stateProvider.State() - _, nodes, err := state.NodeCount() + + _, nodeUsage, err := state.NodeUsage() if err != nil { u.logger.Warn("failed to retrieve nodes from state store", "error", err) } - metrics.SetGaugeWithLabels( - []string{"consul", "state", "nodes"}, - float32(nodes), - u.metricLabels, - ) + + u.emitNodeUsage(nodeUsage) _, serviceUsage, err := state.ServiceUsage() if err != nil { @@ -162,65 +161,27 @@ func (u *UsageMetricsReporter) runOnce() { 
u.emitServiceUsage(serviceUsage) - servers, clients := u.memberUsage() - u.emitMemberUsage(servers, clients) + members := u.memberUsage() + u.emitMemberUsage(members) } -func (u *UsageMetricsReporter) memberUsage() (int, map[string]int) { +func (u *UsageMetricsReporter) memberUsage() []serf.Member { if u.getMembersFunc == nil { - return 0, nil + return nil } mems := u.getMembersFunc() if len(mems) <= 0 { u.logger.Warn("cluster reported zero members") - return 0, nil } - servers := 0 - clients := make(map[string]int) - + out := make([]serf.Member, 0, len(mems)) for _, m := range mems { if m.Status != serf.StatusAlive { continue } - - switch m.Tags["role"] { - case "node": - clients[m.Tags["segment"]]++ - case "consul": - servers++ - } + out = append(out, m) } - return servers, clients -} - -func (u *UsageMetricsReporter) emitMemberUsage(servers int, clients map[string]int) { - totalClients := 0 - - for seg, c := range clients { - segmentLabel := metrics.Label{Name: "segment", Value: seg} - labels := append([]metrics.Label{segmentLabel}, u.metricLabels...) - - metrics.SetGaugeWithLabels( - []string{"consul", "members", "clients"}, - float32(c), - labels, - ) - - totalClients += c - } - - metrics.SetGaugeWithLabels( - []string{"consul", "members", "clients"}, - float32(totalClients), - u.metricLabels, - ) - - metrics.SetGaugeWithLabels( - []string{"consul", "members", "servers"}, - float32(servers), - u.metricLabels, - ) + return out } diff --git a/agent/consul/usagemetrics/usagemetrics_oss.go b/agent/consul/usagemetrics/usagemetrics_oss.go index 1cf7e6bda2..68c8852117 100644 --- a/agent/consul/usagemetrics/usagemetrics_oss.go +++ b/agent/consul/usagemetrics/usagemetrics_oss.go @@ -4,9 +4,47 @@ package usagemetrics import ( "github.com/armon/go-metrics" + + "github.com/hashicorp/serf/serf" + "github.com/hashicorp/consul/agent/consul/state" ) +func (u *UsageMetricsReporter) emitNodeUsage(nodeUsage state.NodeUsage) { + metrics.SetGaugeWithLabels( + []string{"consul", "state", "nodes"}, + float32(nodeUsage.Nodes), + u.metricLabels, + ) +} + +func (u *UsageMetricsReporter) emitMemberUsage(members []serf.Member) { + var ( + servers int + clients int + ) + for _, m := range members { + switch m.Tags["role"] { + case "node": + clients++ + case "consul": + servers++ + } + } + + metrics.SetGaugeWithLabels( + []string{"consul", "members", "clients"}, + float32(clients), + u.metricLabels, + ) + + metrics.SetGaugeWithLabels( + []string{"consul", "members", "servers"}, + float32(servers), + u.metricLabels, + ) +} + func (u *UsageMetricsReporter) emitServiceUsage(serviceUsage state.ServiceUsage) { metrics.SetGaugeWithLabels( []string{"consul", "state", "services"}, diff --git a/agent/consul/usagemetrics/usagemetrics_oss_test.go b/agent/consul/usagemetrics/usagemetrics_oss_test.go index e232014358..810c05f3cb 100644 --- a/agent/consul/usagemetrics/usagemetrics_oss_test.go +++ b/agent/consul/usagemetrics/usagemetrics_oss_test.go @@ -9,16 +9,151 @@ import ( "github.com/armon/go-metrics" "github.com/stretchr/testify/require" + "github.com/hashicorp/serf/serf" + "github.com/hashicorp/consul/agent/consul/state" "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/sdk/testutil" - "github.com/hashicorp/serf/serf" ) func newStateStore() (*state.Store, error) { return state.NewStateStore(nil), nil } +func TestUsageReporter_emitNodeUsage_OSS(t *testing.T) { + type testCase struct { + modfiyStateStore func(t *testing.T, s *state.Store) + getMembersFunc getMembersFunc + expectedGauges 
map[string]metrics.GaugeValue + } + cases := map[string]testCase{ + "empty-state": { + expectedGauges: map[string]metrics.GaugeValue{ + // --- node --- + "consul.usage.test.consul.state.nodes;datacenter=dc1": { + Name: "consul.usage.test.consul.state.nodes", + Value: 0, + Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}}, + }, + // --- member --- + "consul.usage.test.consul.members.clients;datacenter=dc1": { + Name: "consul.usage.test.consul.members.clients", + Value: 0, + Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}}, + }, + "consul.usage.test.consul.members.servers;datacenter=dc1": { + Name: "consul.usage.test.consul.members.servers", + Value: 0, + Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}}, + }, + // --- service --- + "consul.usage.test.consul.state.services;datacenter=dc1": { + Name: "consul.usage.test.consul.state.services", + Value: 0, + Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}}, + }, + "consul.usage.test.consul.state.service_instances;datacenter=dc1": { + Name: "consul.usage.test.consul.state.service_instances", + Value: 0, + Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}}, + }, + }, + getMembersFunc: func() []serf.Member { return []serf.Member{} }, + }, + "nodes": { + modfiyStateStore: func(t *testing.T, s *state.Store) { + require.NoError(t, s.EnsureNode(1, &structs.Node{Node: "foo", Address: "127.0.0.1"})) + require.NoError(t, s.EnsureNode(2, &structs.Node{Node: "bar", Address: "127.0.0.2"})) + require.NoError(t, s.EnsureNode(3, &structs.Node{Node: "baz", Address: "127.0.0.2"})) + }, + getMembersFunc: func() []serf.Member { + return []serf.Member{ + { + Name: "foo", + Tags: map[string]string{"role": "consul"}, + Status: serf.StatusAlive, + }, + { + Name: "bar", + Tags: map[string]string{"role": "consul"}, + Status: serf.StatusAlive, + }, + { + Name: "baz", + Tags: map[string]string{"role": "node"}, + Status: serf.StatusAlive, + }, + } + }, + expectedGauges: map[string]metrics.GaugeValue{ + // --- node --- + "consul.usage.test.consul.state.nodes;datacenter=dc1": { + Name: "consul.usage.test.consul.state.nodes", + Value: 3, + Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}}, + }, + // --- member --- + "consul.usage.test.consul.members.servers;datacenter=dc1": { + Name: "consul.usage.test.consul.members.servers", + Value: 2, + Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}}, + }, + "consul.usage.test.consul.members.clients;datacenter=dc1": { + Name: "consul.usage.test.consul.members.clients", + Value: 1, + Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}}, + }, + // --- service --- + "consul.usage.test.consul.state.services;datacenter=dc1": { + Name: "consul.usage.test.consul.state.services", + Value: 0, + Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}}, + }, + "consul.usage.test.consul.state.service_instances;datacenter=dc1": { + Name: "consul.usage.test.consul.state.service_instances", + Value: 0, + Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}}, + }, + }, + }, + } + + for name, tcase := range cases { + t.Run(name, func(t *testing.T) { + // Only have a single interval for the test + sink := metrics.NewInmemSink(1*time.Minute, 1*time.Minute) + cfg := metrics.DefaultConfig("consul.usage.test") + cfg.EnableHostname = false + metrics.NewGlobal(cfg, sink) + + mockStateProvider := &mockStateProvider{} + s, err := newStateStore() + require.NoError(t, err) + if tcase.modfiyStateStore != nil { + tcase.modfiyStateStore(t, s) + } + 
mockStateProvider.On("State").Return(s) + + reporter, err := NewUsageMetricsReporter( + new(Config). + WithStateProvider(mockStateProvider). + WithLogger(testutil.Logger(t)). + WithDatacenter("dc1"). + WithGetMembersFunc(tcase.getMembersFunc), + ) + require.NoError(t, err) + + reporter.runOnce() + + intervals := sink.Data() + require.Len(t, intervals, 1) + intv := intervals[0] + + assertEqualGaugeMaps(t, tcase.expectedGauges, intv.Gauges) + }) + } +} + func TestUsageReporter_emitServiceUsage_OSS(t *testing.T) { type testCase struct { modfiyStateStore func(t *testing.T, s *state.Store) @@ -28,11 +163,28 @@ func TestUsageReporter_emitServiceUsage_OSS(t *testing.T) { cases := map[string]testCase{ "empty-state": { expectedGauges: map[string]metrics.GaugeValue{ + // --- node --- "consul.usage.test.consul.state.nodes;datacenter=dc1": { Name: "consul.usage.test.consul.state.nodes", Value: 0, Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}}, }, + // --- member --- + "consul.usage.test.consul.members.servers;datacenter=dc1": { + Name: "consul.usage.test.consul.members.servers", + Value: 0, + Labels: []metrics.Label{ + {Name: "datacenter", Value: "dc1"}, + }, + }, + "consul.usage.test.consul.members.clients;datacenter=dc1": { + Name: "consul.usage.test.consul.members.clients", + Value: 0, + Labels: []metrics.Label{ + {Name: "datacenter", Value: "dc1"}, + }, + }, + // --- service --- "consul.usage.test.consul.state.services;datacenter=dc1": { Name: "consul.usage.test.consul.state.services", Value: 0, @@ -47,35 +199,21 @@ func TestUsageReporter_emitServiceUsage_OSS(t *testing.T) { {Name: "datacenter", Value: "dc1"}, }, }, - "consul.usage.test.consul.members.clients;datacenter=dc1": { - Name: "consul.usage.test.consul.members.clients", - Value: 0, - Labels: []metrics.Label{ - {Name: "datacenter", Value: "dc1"}, - }, - }, - "consul.usage.test.consul.members.servers;datacenter=dc1": { - Name: "consul.usage.test.consul.members.servers", - Value: 0, - Labels: []metrics.Label{ - {Name: "datacenter", Value: "dc1"}, - }, - }, }, getMembersFunc: func() []serf.Member { return []serf.Member{} }, }, "nodes-and-services": { modfiyStateStore: func(t *testing.T, s *state.Store) { - require.Nil(t, s.EnsureNode(1, &structs.Node{Node: "foo", Address: "127.0.0.1"})) - require.Nil(t, s.EnsureNode(2, &structs.Node{Node: "bar", Address: "127.0.0.2"})) - require.Nil(t, s.EnsureNode(3, &structs.Node{Node: "baz", Address: "127.0.0.2"})) - require.Nil(t, s.EnsureNode(4, &structs.Node{Node: "qux", Address: "127.0.0.3"})) + require.NoError(t, s.EnsureNode(1, &structs.Node{Node: "foo", Address: "127.0.0.1"})) + require.NoError(t, s.EnsureNode(2, &structs.Node{Node: "bar", Address: "127.0.0.2"})) + require.NoError(t, s.EnsureNode(3, &structs.Node{Node: "baz", Address: "127.0.0.2"})) + require.NoError(t, s.EnsureNode(4, &structs.Node{Node: "qux", Address: "127.0.0.3"})) // Typical services and some consul services spread across two nodes - require.Nil(t, s.EnsureService(5, "foo", &structs.NodeService{ID: "db", Service: "db", Tags: nil, Address: "", Port: 5000})) - require.Nil(t, s.EnsureService(6, "bar", &structs.NodeService{ID: "api", Service: "api", Tags: nil, Address: "", Port: 5000})) - require.Nil(t, s.EnsureService(7, "foo", &structs.NodeService{ID: "consul", Service: "consul", Tags: nil})) - require.Nil(t, s.EnsureService(8, "bar", &structs.NodeService{ID: "consul", Service: "consul", Tags: nil})) + require.NoError(t, s.EnsureService(5, "foo", &structs.NodeService{ID: "db", Service: "db", Tags: nil, Address: "", 
Port: 5000})) + require.NoError(t, s.EnsureService(6, "bar", &structs.NodeService{ID: "api", Service: "api", Tags: nil, Address: "", Port: 5000})) + require.NoError(t, s.EnsureService(7, "foo", &structs.NodeService{ID: "consul", Service: "consul", Tags: nil})) + require.NoError(t, s.EnsureService(8, "bar", &structs.NodeService{ID: "consul", Service: "consul", Tags: nil})) }, getMembersFunc: func() []serf.Member { return []serf.Member{ @@ -102,21 +240,16 @@ func TestUsageReporter_emitServiceUsage_OSS(t *testing.T) { } }, expectedGauges: map[string]metrics.GaugeValue{ + // --- node --- "consul.usage.test.consul.state.nodes;datacenter=dc1": { Name: "consul.usage.test.consul.state.nodes", Value: 4, Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}}, }, - "consul.usage.test.consul.state.services;datacenter=dc1": { - Name: "consul.usage.test.consul.state.services", - Value: 3, - Labels: []metrics.Label{ - {Name: "datacenter", Value: "dc1"}, - }, - }, - "consul.usage.test.consul.state.service_instances;datacenter=dc1": { - Name: "consul.usage.test.consul.state.service_instances", - Value: 4, + // --- member --- + "consul.usage.test.consul.members.servers;datacenter=dc1": { + Name: "consul.usage.test.consul.members.servers", + Value: 2, Labels: []metrics.Label{ {Name: "datacenter", Value: "dc1"}, }, @@ -128,26 +261,18 @@ func TestUsageReporter_emitServiceUsage_OSS(t *testing.T) { {Name: "datacenter", Value: "dc1"}, }, }, - "consul.usage.test.consul.members.servers;datacenter=dc1": { - Name: "consul.usage.test.consul.members.servers", - Value: 2, + // --- service --- + "consul.usage.test.consul.state.services;datacenter=dc1": { + Name: "consul.usage.test.consul.state.services", + Value: 3, Labels: []metrics.Label{ {Name: "datacenter", Value: "dc1"}, }, }, - "consul.usage.test.consul.members.clients;segment=a;datacenter=dc1": { - Name: "consul.usage.test.consul.members.clients", - Value: 1, + "consul.usage.test.consul.state.service_instances;datacenter=dc1": { + Name: "consul.usage.test.consul.state.service_instances", + Value: 4, Labels: []metrics.Label{ - {Name: "segment", Value: "a"}, - {Name: "datacenter", Value: "dc1"}, - }, - }, - "consul.usage.test.consul.members.clients;segment=b;datacenter=dc1": { - Name: "consul.usage.test.consul.members.clients", - Value: 1, - Labels: []metrics.Label{ - {Name: "segment", Value: "b"}, {Name: "datacenter", Value: "dc1"}, }, }, @@ -185,7 +310,7 @@ func TestUsageReporter_emitServiceUsage_OSS(t *testing.T) { require.Len(t, intervals, 1) intv := intervals[0] - require.Equal(t, tcase.expectedGauges, intv.Gauges) + assertEqualGaugeMaps(t, tcase.expectedGauges, intv.Gauges) }) } } diff --git a/agent/consul/usagemetrics/usagemetrics_test.go b/agent/consul/usagemetrics/usagemetrics_test.go index 1c4be1d5b1..cf0cd01e01 100644 --- a/agent/consul/usagemetrics/usagemetrics_test.go +++ b/agent/consul/usagemetrics/usagemetrics_test.go @@ -2,16 +2,12 @@ package usagemetrics import ( "testing" - "time" "github.com/armon/go-metrics" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/mock" - "github.com/stretchr/testify/require" "github.com/hashicorp/consul/agent/consul/state" - "github.com/hashicorp/consul/agent/structs" - "github.com/hashicorp/consul/sdk/testutil" - "github.com/hashicorp/serf/serf" ) type mockStateProvider struct { @@ -23,106 +19,20 @@ func (m *mockStateProvider) State() *state.Store { return retValues.Get(0).(*state.Store) } -func TestUsageReporter_Run_Nodes(t *testing.T) { - type testCase struct { - modfiyStateStore func(t 
*testing.T, s *state.Store) - getMembersFunc getMembersFunc - expectedGauges map[string]metrics.GaugeValue - } - cases := map[string]testCase{ - "empty-state": { - expectedGauges: map[string]metrics.GaugeValue{ - "consul.usage.test.consul.state.nodes;datacenter=dc1": { - Name: "consul.usage.test.consul.state.nodes", - Value: 0, - Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}}, - }, - }, - getMembersFunc: func() []serf.Member { return []serf.Member{} }, - }, - "nodes": { - modfiyStateStore: func(t *testing.T, s *state.Store) { - require.Nil(t, s.EnsureNode(1, &structs.Node{Node: "foo", Address: "127.0.0.1"})) - require.Nil(t, s.EnsureNode(2, &structs.Node{Node: "bar", Address: "127.0.0.2"})) - require.Nil(t, s.EnsureNode(3, &structs.Node{Node: "baz", Address: "127.0.0.2"})) - }, - getMembersFunc: func() []serf.Member { - return []serf.Member{ - { - Name: "foo", - Tags: map[string]string{"role": "consul"}, - Status: serf.StatusAlive, - }, - { - Name: "bar", - Tags: map[string]string{"role": "consul"}, - Status: serf.StatusAlive, - }, - { - Name: "baz", - Tags: map[string]string{"role": "node"}, - Status: serf.StatusAlive, - }, - } - }, - expectedGauges: map[string]metrics.GaugeValue{ - "consul.usage.test.consul.state.nodes;datacenter=dc1": { - Name: "consul.usage.test.consul.state.nodes", - Value: 3, - Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}}, - }, - "consul.usage.test.consul.members.clients;datacenter=dc1": { - Name: "consul.usage.test.consul.members.clients", - Value: 1, - Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}}, - }, - "consul.usage.test.consul.members.servers;datacenter=dc1": { - Name: "consul.usage.test.consul.members.servers", - Value: 2, - Labels: []metrics.Label{{Name: "datacenter", Value: "dc1"}}, - }, - }, - }, +func assertEqualGaugeMaps(t *testing.T, expectedMap, foundMap map[string]metrics.GaugeValue) { + t.Helper() + + for key := range foundMap { + if _, ok := expectedMap[key]; !ok { + t.Errorf("found unexpected gauge key: %s", key) + } } - for name, tcase := range cases { - t.Run(name, func(t *testing.T) { - // Only have a single interval for the test - sink := metrics.NewInmemSink(1*time.Minute, 1*time.Minute) - cfg := metrics.DefaultConfig("consul.usage.test") - cfg.EnableHostname = false - metrics.NewGlobal(cfg, sink) - - mockStateProvider := &mockStateProvider{} - s, err := newStateStore() - require.NoError(t, err) - if tcase.modfiyStateStore != nil { - tcase.modfiyStateStore(t, s) - } - mockStateProvider.On("State").Return(s) - - reporter, err := NewUsageMetricsReporter( - new(Config). - WithStateProvider(mockStateProvider). - WithLogger(testutil.Logger(t)). - WithDatacenter("dc1"). - WithGetMembersFunc(tcase.getMembersFunc), - ) - require.NoError(t, err) - - reporter.runOnce() - - intervals := sink.Data() - require.Len(t, intervals, 1) - intv := intervals[0] - - // Range over the expected values instead of just doing an Equal - // comparison on the maps because of different metrics emitted between - // OSS and Ent. The enterprise and OSS tests have a full equality - // comparison on the maps. 
- for key, expected := range tcase.expectedGauges { - require.Equal(t, expected, intv.Gauges[key]) - } - }) + for key, expected := range expectedMap { + if _, ok := foundMap[key]; !ok { + t.Errorf("did not find expected gauge key: %s", key) + continue + } + assert.Equal(t, expected, foundMap[key], "gauge key mismatch on %q", key) } } From 251026e374c3a0d1ab606a18573b004333815d84 Mon Sep 17 00:00:00 2001 From: Daniel Nephin Date: Fri, 6 Aug 2021 13:07:13 -0400 Subject: [PATCH 05/28] debug: remove unused --- command/debug/debug.go | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/command/debug/debug.go b/command/debug/debug.go index a0ec146f5f..dcb03fca0f 100644 --- a/command/debug/debug.go +++ b/command/debug/debug.go @@ -679,30 +679,17 @@ func (c *cmd) createArchiveTemp(path string) (tempName string, err error) { // defaultTargets specifies the list of all targets that // will be captured by default func (c *cmd) defaultTargets() []string { - return append(c.dynamicTargets(), c.staticTargets()...) -} - -// dynamicTargets returns all the supported targets -// that are retrieved at the interval specified -func (c *cmd) dynamicTargets() []string { - return []string{"metrics", "logs", "pprof"} -} - -// staticTargets returns all the supported targets -// that are retrieved at the start of the command execution -func (c *cmd) staticTargets() []string { - return []string{"host", "agent", "cluster"} + return []string{"metrics", "logs", "pprof", "host", "agent", "cluster"} } func (c *cmd) Synopsis() string { - return synopsis + return "Records a debugging archive for operators" } func (c *cmd) Help() string { return c.help } -const synopsis = "Records a debugging archive for operators" const help = ` Usage: consul debug [options] From bbf6a94c9af79b7d0d1281979ba8ec7edcf51a41 Mon Sep 17 00:00:00 2001 From: Daniel Nephin Date: Fri, 6 Aug 2021 13:30:32 -0400 Subject: [PATCH 06/28] debug: rename cluster target to members The API is called members. Using the same name as the API should help describe the contents of the file. --- command/debug/debug.go | 117 ++++++++++++++++++++---------------- command/debug/debug_test.go | 8 +-- 2 files changed, 69 insertions(+), 56 deletions(-) diff --git a/command/debug/debug.go b/command/debug/debug.go index dcb03fca0f..ccb2607889 100644 --- a/command/debug/debug.go +++ b/command/debug/debug.go @@ -114,7 +114,7 @@ func (c *cmd) init() { fmt.Sprintf("One or more types of information to capture. This can be used "+ "to capture a subset of information, and defaults to capturing "+ "everything available. Possible information for capture: %s. "+ - "This can be repeated multiple times.", strings.Join(c.defaultTargets(), ", "))) + "This can be repeated multiple times.", strings.Join(defaultTargets, ", "))) c.flags.DurationVar(&c.interval, "interval", debugInterval, fmt.Sprintf("The interval in which to capture dynamic information such as "+ "telemetry, and profiling. 
Defaults to %s.", debugInterval)) @@ -195,7 +195,7 @@ func (c *cmd) Run(args []string) int { } // Capture dynamic information from the target agent, blocking for duration - if c.configuredTarget("metrics") || c.configuredTarget("logs") || c.configuredTarget("pprof") { + if c.captureTarget(targetMetrics) || c.captureTarget(targetLogs) || c.captureTarget(targetProfiles) { g := new(errgroup.Group) g.Go(c.captureInterval) g.Go(c.captureLongRunning) @@ -264,11 +264,11 @@ func (c *cmd) prepare() (version string, err error) { // If none are specified we will collect information from // all by default if len(c.capture) == 0 { - c.capture = c.defaultTargets() + c.capture = defaultTargets } for _, t := range c.capture { - if !c.allowedTarget(t) { + if !allowedTarget(t) { return version, fmt.Errorf("target not found: %s", t) } } @@ -288,56 +288,48 @@ func (c *cmd) prepare() (version string, err error) { // captureStatic captures static target information and writes it // to the output path func (c *cmd) captureStatic() error { - // Collect errors via multierror as we want to gracefully - // fail if an API is inaccessible var errs error - // Collect the named outputs here - outputs := make(map[string]interface{}) - - // Capture host information - if c.configuredTarget("host") { + if c.captureTarget(targetHost) { host, err := c.client.Agent().Host() if err != nil { errs = multierror.Append(errs, err) } - outputs["host"] = host + if err := writeJSONFile(filepath.Join(c.output, targetHost+".json"), host); err != nil { + errs = multierror.Append(errs, err) + } } - // Capture agent information - if c.configuredTarget("agent") { + if c.captureTarget(targetAgent) { agent, err := c.client.Agent().Self() if err != nil { errs = multierror.Append(errs, err) } - outputs["agent"] = agent + if err := writeJSONFile(filepath.Join(c.output, targetAgent+".json"), agent); err != nil { + errs = multierror.Append(errs, err) + } } - // Capture cluster members information, including WAN - if c.configuredTarget("cluster") { + if c.captureTarget(targetMembers) { members, err := c.client.Agent().Members(true) if err != nil { errs = multierror.Append(errs, err) } - outputs["cluster"] = members - } - - // Write all outputs to disk as JSON - for output, v := range outputs { - marshaled, err := json.MarshalIndent(v, "", "\t") - if err != nil { - errs = multierror.Append(errs, err) - } - - err = ioutil.WriteFile(fmt.Sprintf("%s/%s.json", c.output, output), marshaled, 0644) - if err != nil { + if err := writeJSONFile(filepath.Join(c.output, targetMembers+".json"), members); err != nil { errs = multierror.Append(errs, err) } } - return errs } +func writeJSONFile(filename string, content interface{}) error { + marshaled, err := json.MarshalIndent(content, "", "\t") + if err != nil { + return err + } + return ioutil.WriteFile(filename, marshaled, 0644) +} + // captureInterval blocks for the duration of the command // specified by the duration flag, capturing the dynamic // targets at the interval specified @@ -380,7 +372,7 @@ func captureShortLived(c *cmd) error { if err != nil { return err } - if c.configuredTarget("pprof") { + if c.captureTarget(targetProfiles) { g.Go(func() error { return c.captureHeap(timestampDir) }) @@ -417,7 +409,7 @@ func (c *cmd) captureLongRunning() error { if s < 1 { s = 1 } - if c.configuredTarget("pprof") { + if c.captureTarget(targetProfiles) { g.Go(func() error { return c.captureProfile(s, timestampDir) }) @@ -426,12 +418,12 @@ func (c *cmd) captureLongRunning() error { return c.captureTrace(s, 
timestampDir) }) } - if c.configuredTarget("logs") { + if c.captureTarget(targetLogs) { g.Go(func() error { return c.captureLogs(timestampDir) }) } - if c.configuredTarget("metrics") { + if c.captureTarget(targetMetrics) { // TODO: pass in context from caller ctx, cancel := context.WithTimeout(context.Background(), c.duration) defer cancel() @@ -450,8 +442,7 @@ func (c *cmd) captureGoRoutines(timestampDir string) error { return fmt.Errorf("failed to collect goroutine profile: %w", err) } - err = ioutil.WriteFile(fmt.Sprintf("%s/goroutine.prof", timestampDir), gr, 0644) - return err + return ioutil.WriteFile(fmt.Sprintf("%s/goroutine.prof", timestampDir), gr, 0644) } func (c *cmd) captureTrace(s float64, timestampDir string) error { @@ -460,8 +451,7 @@ func (c *cmd) captureTrace(s float64, timestampDir string) error { return fmt.Errorf("failed to collect trace: %w", err) } - err = ioutil.WriteFile(fmt.Sprintf("%s/trace.out", timestampDir), trace, 0644) - return err + return ioutil.WriteFile(fmt.Sprintf("%s/trace.out", timestampDir), trace, 0644) } func (c *cmd) captureProfile(s float64, timestampDir string) error { @@ -470,8 +460,7 @@ func (c *cmd) captureProfile(s float64, timestampDir string) error { return fmt.Errorf("failed to collect cpu profile: %w", err) } - err = ioutil.WriteFile(fmt.Sprintf("%s/profile.prof", timestampDir), prof, 0644) - return err + return ioutil.WriteFile(fmt.Sprintf("%s/profile.prof", timestampDir), prof, 0644) } func (c *cmd) captureHeap(timestampDir string) error { @@ -480,8 +469,7 @@ func (c *cmd) captureHeap(timestampDir string) error { return fmt.Errorf("failed to collect heap profile: %w", err) } - err = ioutil.WriteFile(fmt.Sprintf("%s/heap.prof", timestampDir), heap, 0644) - return err + return ioutil.WriteFile(fmt.Sprintf("%s/heap.prof", timestampDir), heap, 0644) } func (c *cmd) captureLogs(timestampDir string) error { @@ -538,23 +526,31 @@ func (c *cmd) captureMetrics(ctx context.Context, timestampDir string) error { return nil } -// allowedTarget returns a boolean if the target is able to be captured -func (c *cmd) allowedTarget(target string) bool { - for _, dt := range c.defaultTargets() { +// allowedTarget returns true if the target is a recognized name of a capture +// target. +func allowedTarget(target string) bool { + for _, dt := range defaultTargets { if dt == target { return true } } + for _, t := range deprecatedTargets { + if t == target { + return true + } + } return false } -// configuredTarget returns a boolean if the target is configured to be -// captured in the command -func (c *cmd) configuredTarget(target string) bool { +// captureTarget returns true if the target capture type is enabled. 
+func (c *cmd) captureTarget(target string) bool { for _, dt := range c.capture { if dt == target { return true } + if target == targetMembers && dt == targetCluster { + return true + } } return false } @@ -676,12 +672,29 @@ func (c *cmd) createArchiveTemp(path string) (tempName string, err error) { return tempName, nil } -// defaultTargets specifies the list of all targets that -// will be captured by default -func (c *cmd) defaultTargets() []string { - return []string{"metrics", "logs", "pprof", "host", "agent", "cluster"} +const ( + targetMetrics = "metrics" + targetLogs = "logs" + targetProfiles = "pprof" + targetHost = "host" + targetAgent = "agent" + targetMembers = "members" + // targetCluster is the now deprecated name for targetMembers + targetCluster = "cluster" +) + +// defaultTargets specifies the list of targets that will be captured by default +var defaultTargets = []string{ + targetMetrics, + targetLogs, + targetProfiles, + targetHost, + targetAgent, + targetMembers, } +var deprecatedTargets = []string{targetCluster} + func (c *cmd) Synopsis() string { return "Records a debugging archive for operators" } diff --git a/command/debug/debug_test.go b/command/debug/debug_test.go index 1fa331f81d..fee8113b66 100644 --- a/command/debug/debug_test.go +++ b/command/debug/debug_test.go @@ -258,17 +258,17 @@ func TestDebugCommand_CaptureTargets(t *testing.T) { "single": { []string{"agent"}, []string{"agent.json"}, - []string{"host.json", "cluster.json"}, + []string{"host.json", "members.json"}, }, "static": { []string{"agent", "host", "cluster"}, - []string{"agent.json", "host.json", "cluster.json"}, + []string{"agent.json", "host.json", "members.json"}, []string{"*/metrics.json"}, }, "metrics-only": { []string{"metrics"}, []string{"*/metrics.json"}, - []string{"agent.json", "host.json", "cluster.json"}, + []string{"agent.json", "host.json", "members.json"}, }, "all-but-pprof": { []string{ @@ -281,7 +281,7 @@ func TestDebugCommand_CaptureTargets(t *testing.T) { []string{ "host.json", "agent.json", - "cluster.json", + "members.json", "*/metrics.json", "*/consul.log", }, From 31bcd805288a047b0a4ac1deda44e99489c94677 Mon Sep 17 00:00:00 2001 From: Daniel Nephin Date: Fri, 6 Aug 2021 13:46:01 -0400 Subject: [PATCH 07/28] debug: improve a couple of the test cases Use gotest.tools/v3/fs to make better assertions about the files Remove the TestAgent from TestDebugCommand_Prepare_ValidateTiming, since we can test that validation without making any API calls. 
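For context, a minimal self-contained sketch of the gotest.tools/v3/fs manifest style that the reworked assertions below rely on; the directory and file names here are illustrative only, not part of the patch.

```go
package debug

import (
	"os"
	"path/filepath"
	"testing"

	"gotest.tools/v3/assert"
	"gotest.tools/v3/fs"
)

// TestManifestSketch builds an expected manifest of an output directory and
// compares it against what was actually written, ignoring any extra files.
func TestManifestSketch(t *testing.T) {
	testDir := t.TempDir()

	// Stand-ins for files the debug command would produce.
	outDir := filepath.Join(testDir, "debug")
	assert.NilError(t, os.MkdirAll(outDir, 0755))
	assert.NilError(t, os.WriteFile(filepath.Join(outDir, "agent.json"), []byte(`{}`), 0644))

	expected := fs.Expected(t,
		fs.WithDir("debug",
			// Only the file's presence is asserted; any content matches.
			fs.WithFile("agent.json", "", fs.MatchAnyFileContent),
			// Tolerate files that are not listed in the manifest.
			fs.MatchExtraFiles))

	assert.Assert(t, fs.Equal(testDir, expected))
}
```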
--- command/debug/debug_test.go | 93 +++++++++++++++---------------------- go.mod | 1 + go.sum | 3 ++ 3 files changed, 42 insertions(+), 55 deletions(-) diff --git a/command/debug/debug_test.go b/command/debug/debug_test.go index fee8113b66..215475706c 100644 --- a/command/debug/debug_test.go +++ b/command/debug/debug_test.go @@ -14,19 +14,18 @@ import ( "testing" "time" + "github.com/google/pprof/profile" "github.com/mitchellh/cli" "github.com/stretchr/testify/require" - - "github.com/google/pprof/profile" + "gotest.tools/v3/assert" + "gotest.tools/v3/fs" "github.com/hashicorp/consul/agent" "github.com/hashicorp/consul/sdk/testutil" "github.com/hashicorp/consul/testrpc" ) -func TestDebugCommand_noTabs(t *testing.T) { - t.Parallel() - +func TestDebugCommand_Help_TextContainsNoTabs(t *testing.T) { if strings.ContainsRune(New(cli.NewMockUi(), nil).Help(), '\t') { t.Fatal("help has tabs") } @@ -63,6 +62,16 @@ func TestDebugCommand(t *testing.T) { require.Equal(t, 0, code) require.Equal(t, "", ui.ErrorWriter.String()) + expected := fs.Expected(t, + fs.WithDir("debug", + fs.WithFile("agent.json", "", fs.MatchAnyFileContent), + fs.WithFile("host.json", "", fs.MatchAnyFileContent), + fs.WithFile("index.json", "", fs.MatchAnyFileContent), + fs.WithFile("members.json", "", fs.MatchAnyFileContent), + // TODO: make the sub-directory names predictable) + fs.MatchExtraFiles)) + assert.Assert(t, fs.Equal(testDir, expected)) + metricsFiles, err := filepath.Glob(fmt.Sprintf("%s/*/%s", outputPath, "metrics.json")) require.NoError(t, err) require.Len(t, metricsFiles, 1) @@ -127,15 +136,10 @@ func TestDebugCommand_Archive(t *testing.T) { } func TestDebugCommand_ArgsBad(t *testing.T) { - t.Parallel() - ui := cli.NewMockUi() cmd := New(ui, nil) - args := []string{ - "foo", - "bad", - } + args := []string{"foo", "bad"} if code := cmd.Run(args); code == 0 { t.Fatalf("should exit non-zero, got code: %d", code) @@ -518,65 +522,44 @@ func TestDebugCommand_ProfilesExist(t *testing.T) { } } -func TestDebugCommand_ValidateTiming(t *testing.T) { - if testing.Short() { - t.Skip("too slow for testing.Short") - } - +func TestDebugCommand_Prepare_ValidateTiming(t *testing.T) { cases := map[string]struct { duration string interval string - output string - code int + expected string }{ "both": { - "20ms", - "10ms", - "duration must be longer", - 1, + duration: "20ms", + interval: "10ms", + expected: "duration must be longer", }, "short interval": { - "10s", - "10ms", - "interval must be longer", - 1, + duration: "10s", + interval: "10ms", + expected: "interval must be longer", }, "lower duration": { - "20s", - "30s", - "must be longer than interval", - 1, + duration: "20s", + interval: "30s", + expected: "must be longer than interval", }, } for name, tc := range cases { - // Because we're only testng validation, we want to shut down - // the valid duration test to avoid hanging - shutdownCh := make(chan struct{}) + t.Run(name, func(t *testing.T) { + ui := cli.NewMockUi() + cmd := New(ui, nil) - a := agent.NewTestAgent(t, "") - defer a.Shutdown() - testrpc.WaitForLeader(t, a.RPC, "dc1") + args := []string{ + "-duration=" + tc.duration, + "-interval=" + tc.interval, + } + err := cmd.flags.Parse(args) + require.NoError(t, err) - ui := cli.NewMockUi() - cmd := New(ui, shutdownCh) - - args := []string{ - "-http-addr=" + a.HTTPAddr(), - "-duration=" + tc.duration, - "-interval=" + tc.interval, - "-capture=agent", - } - code := cmd.Run(args) - - if code != tc.code { - t.Errorf("%s: should exit %d, got code: %d", name, tc.code, 
code) - } - - errOutput := ui.ErrorWriter.String() - if !strings.Contains(errOutput, tc.output) { - t.Errorf("%s: expected error output '%s', got '%q'", name, tc.output, errOutput) - } + _, err = cmd.prepare() + testutil.RequireErrorContains(t, err, tc.expected) + }) } } diff --git a/go.mod b/go.mod index 9c7bb865f3..b2531f76e5 100644 --- a/go.mod +++ b/go.mod @@ -92,6 +92,7 @@ require ( google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55 google.golang.org/grpc v1.25.1 gopkg.in/square/go-jose.v2 v2.5.1 + gotest.tools/v3 v3.0.3 k8s.io/api v0.16.9 k8s.io/apimachinery v0.16.9 k8s.io/client-go v0.16.9 diff --git a/go.sum b/go.sum index 35862eacef..c8c2ff4fc4 100644 --- a/go.sum +++ b/go.sum @@ -652,6 +652,7 @@ golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3 golang.org/x/tools v0.0.0-20190312170243-e65039ee4138/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190506145303-2d16b83fe98c/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= +golang.org/x/tools v0.0.0-20190624222133-a101b041ded4/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= golang.org/x/tools v0.0.0-20190907020128-2ca718005c18/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191108193012-7d206e10da11/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= @@ -710,6 +711,8 @@ gopkg.in/yaml.v2 v2.2.8 h1:obN1ZagJSUGI0Ek/LBmuj4SNLPfIny3KsKFopxRdj10= gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gotest.tools/v3 v3.0.3 h1:4AuOwCGf4lLR9u3YOe2awrHygurzhO/HeQ6laiA6Sx0= +gotest.tools/v3 v3.0.3/go.mod h1:Z7Lb0S5l+klDB31fvDQX8ss/FlKDxtlFlw3Oa8Ymbl8= honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190106161140-3f1c8253044a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190418001031-e561f6794a2a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= From 4359e3811424b062d2e751045f99be98cc6bd4eb Mon Sep 17 00:00:00 2001 From: Daniel Nephin Date: Fri, 6 Aug 2021 14:24:12 -0400 Subject: [PATCH 08/28] debug: restore cancel on SigInt Some previous changes broke interrupting the debug on SigInterupt. This change restores the original behaviour by passing a context to requests. Since a new API client function was required to pass the context, I had it also return an io.ReadCloser, so that output can be streamed to files instead of fully buffering in process memory. 
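A standard-library-only sketch of the two patterns this commit combines: a signal-aware context that cancels on SIGINT/SIGTERM, and streaming a response body to disk instead of buffering it in memory. The URL and file name are placeholders, not the real client code.

```go
package main

import (
	"context"
	"fmt"
	"io"
	"net/http"
	"os"
	"os/signal"
	"syscall"
)

// fetchToFile streams the response body straight to a file and stops early if
// the context is cancelled (for example by Ctrl-C via the context in main).
func fetchToFile(ctx context.Context, url, path string) error {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		return err
	}
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	f, err := os.Create(path)
	if err != nil {
		return err
	}
	defer f.Close()

	// Stream to disk; the whole body is never held in memory.
	_, err = io.Copy(f, resp.Body)
	return err
}

func main() {
	// Cancel the context on SIGINT/SIGTERM so in-flight captures stop cleanly.
	ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
	defer cancel()

	// Placeholder endpoint; the real command talks to the agent's debug endpoints.
	if err := fetchToFile(ctx, "http://127.0.0.1:8500/debug/pprof/profile?seconds=5", "profile.prof"); err != nil {
		fmt.Fprintln(os.Stderr, "capture failed:", err)
	}
}
```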
--- api/debug.go | 22 +++++++++ command/commands_oss.go | 2 +- command/debug/debug.go | 92 +++++++++++++++++++++++-------------- command/debug/debug_test.go | 24 +++++----- command/registry.go | 1 + 5 files changed, 93 insertions(+), 48 deletions(-) diff --git a/api/debug.go b/api/debug.go index 56dcc9bcd2..2e7bb94b5b 100644 --- a/api/debug.go +++ b/api/debug.go @@ -1,7 +1,9 @@ package api import ( + "context" "fmt" + "io" "io/ioutil" "strconv" ) @@ -70,6 +72,26 @@ func (d *Debug) Profile(seconds int) ([]byte, error) { return body, nil } +// PProf returns a pprof profile for the specified number of seconds. The caller +// is responsible for closing the returned io.ReadCloser once all bytes are read. +func (d *Debug) PProf(ctx context.Context, name string, seconds int) (io.ReadCloser, error) { + r := d.c.newRequest("GET", "/debug/pprof/"+name) + r.ctx = ctx + + // Capture a profile for the specified number of seconds + r.params.Set("seconds", strconv.Itoa(seconds)) + + _, resp, err := d.c.doRequest(r) + if err != nil { + return nil, fmt.Errorf("error making request: %s", err) + } + + if resp.StatusCode != 200 { + return nil, generateUnexpectedResponseCodeError(resp) + } + return resp.Body, nil +} + // Trace returns an execution trace func (d *Debug) Trace(seconds int) ([]byte, error) { r := d.c.newRequest("GET", "/debug/pprof/trace") diff --git a/command/commands_oss.go b/command/commands_oss.go index cfa8a7e621..92b49e4106 100644 --- a/command/commands_oss.go +++ b/command/commands_oss.go @@ -166,7 +166,7 @@ func init() { Register("connect envoy pipe-bootstrap", func(ui cli.Ui) (cli.Command, error) { return pipebootstrap.New(ui), nil }) Register("connect expose", func(ui cli.Ui) (cli.Command, error) { return expose.New(ui), nil }) Register("connect redirect-traffic", func(ui cli.Ui) (cli.Command, error) { return redirecttraffic.New(ui), nil }) - Register("debug", func(ui cli.Ui) (cli.Command, error) { return debug.New(ui, MakeShutdownCh()), nil }) + Register("debug", func(ui cli.Ui) (cli.Command, error) { return debug.New(ui), nil }) Register("event", func(ui cli.Ui) (cli.Command, error) { return event.New(ui), nil }) Register("exec", func(ui cli.Ui) (cli.Command, error) { return exec.New(ui, MakeShutdownCh()), nil }) Register("force-leave", func(ui cli.Ui) (cli.Command, error) { return forceleave.New(ui), nil }) diff --git a/command/debug/debug.go b/command/debug/debug.go index ccb2607889..869161dadd 100644 --- a/command/debug/debug.go +++ b/command/debug/debug.go @@ -12,8 +12,10 @@ import ( "io" "io/ioutil" "os" + "os/signal" "path/filepath" "strings" + "syscall" "time" "golang.org/x/sync/errgroup" @@ -55,7 +57,7 @@ const ( debugProtocolVersion = 1 ) -func New(ui cli.Ui, shutdownCh <-chan struct{}) *cmd { +func New(ui cli.Ui) *cmd { ui = &cli.PrefixedUi{ OutputPrefix: "==> ", InfoPrefix: " ", @@ -63,7 +65,7 @@ func New(ui cli.Ui, shutdownCh <-chan struct{}) *cmd { Ui: ui, } - c := &cmd{UI: ui, shutdownCh: shutdownCh} + c := &cmd{UI: ui} c.init() return c } @@ -74,8 +76,6 @@ type cmd struct { http *flags.HTTPFlags help string - shutdownCh <-chan struct{} - // flags interval time.Duration duration time.Duration @@ -136,6 +136,9 @@ func (c *cmd) init() { } func (c *cmd) Run(args []string) int { + ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM) + defer cancel() + if err := c.flags.Parse(args); err != nil { c.UI.Error(fmt.Sprintf("Error parsing flags: %s", err)) return 1 @@ -197,8 +200,12 @@ func (c *cmd) Run(args []string) int { // Capture dynamic 
information from the target agent, blocking for duration if c.captureTarget(targetMetrics) || c.captureTarget(targetLogs) || c.captureTarget(targetProfiles) { g := new(errgroup.Group) - g.Go(c.captureInterval) - g.Go(c.captureLongRunning) + g.Go(func() error { + return c.captureInterval(ctx) + }) + g.Go(func() error { + return c.captureLongRunning(ctx) + }) err = g.Wait() if err != nil { c.UI.Error(fmt.Sprintf("Error encountered during collection: %v", err)) @@ -333,7 +340,7 @@ func writeJSONFile(filename string, content interface{}) error { // captureInterval blocks for the duration of the command // specified by the duration flag, capturing the dynamic // targets at the interval specified -func (c *cmd) captureInterval() error { +func (c *cmd) captureInterval(ctx context.Context) error { intervalChn := time.NewTicker(c.interval) defer intervalChn.Stop() durationChn := time.After(c.duration) @@ -358,7 +365,7 @@ func (c *cmd) captureInterval() error { case <-durationChn: intervalChn.Stop() return nil - case <-c.shutdownCh: + case <-ctx.Done(): return errors.New("stopping collection due to shutdown signal") } } @@ -395,7 +402,7 @@ func (c *cmd) createTimestampDir(timestamp int64) (string, error) { return timestampDir, nil } -func (c *cmd) captureLongRunning() error { +func (c *cmd) captureLongRunning(ctx context.Context) error { timestamp := time.Now().Local().Unix() timestampDir, err := c.createTimestampDir(timestamp) @@ -411,24 +418,27 @@ func (c *cmd) captureLongRunning() error { } if c.captureTarget(targetProfiles) { g.Go(func() error { - return c.captureProfile(s, timestampDir) + // use ctx without a timeout to allow the profile to finish sending + return c.captureProfile(ctx, s, timestampDir) }) g.Go(func() error { - return c.captureTrace(s, timestampDir) + // use ctx without a timeout to allow the trace to finish sending + return c.captureTrace(ctx, s, timestampDir) }) } if c.captureTarget(targetLogs) { g.Go(func() error { - return c.captureLogs(timestampDir) + ctx, cancel := context.WithTimeout(ctx, c.duration) + defer cancel() + return c.captureLogs(ctx, timestampDir) }) } if c.captureTarget(targetMetrics) { - // TODO: pass in context from caller - ctx, cancel := context.WithTimeout(context.Background(), c.duration) - defer cancel() - g.Go(func() error { + + ctx, cancel := context.WithTimeout(ctx, c.duration) + defer cancel() return c.captureMetrics(ctx, timestampDir) }) } @@ -445,22 +455,38 @@ func (c *cmd) captureGoRoutines(timestampDir string) error { return ioutil.WriteFile(fmt.Sprintf("%s/goroutine.prof", timestampDir), gr, 0644) } -func (c *cmd) captureTrace(s float64, timestampDir string) error { - trace, err := c.client.Debug().Trace(int(s)) - if err != nil { - return fmt.Errorf("failed to collect trace: %w", err) - } - - return ioutil.WriteFile(fmt.Sprintf("%s/trace.out", timestampDir), trace, 0644) -} - -func (c *cmd) captureProfile(s float64, timestampDir string) error { - prof, err := c.client.Debug().Profile(int(s)) +func (c *cmd) captureTrace(ctx context.Context, s float64, timestampDir string) error { + prof, err := c.client.Debug().PProf(ctx, "trace", int(s)) if err != nil { return fmt.Errorf("failed to collect cpu profile: %w", err) } + defer prof.Close() - return ioutil.WriteFile(fmt.Sprintf("%s/profile.prof", timestampDir), prof, 0644) + r := bufio.NewReader(prof) + fh, err := os.Create(fmt.Sprintf("%s/trace.out", timestampDir)) + if err != nil { + return err + } + defer fh.Close() + _, err = r.WriteTo(fh) + return err +} + +func (c *cmd) captureProfile(ctx 
context.Context, s float64, timestampDir string) error { + prof, err := c.client.Debug().PProf(ctx, "profile", int(s)) + if err != nil { + return fmt.Errorf("failed to collect cpu profile: %w", err) + } + defer prof.Close() + + r := bufio.NewReader(prof) + fh, err := os.Create(fmt.Sprintf("%s/profile.prof", timestampDir)) + if err != nil { + return err + } + defer fh.Close() + _, err = r.WriteTo(fh) + return err } func (c *cmd) captureHeap(timestampDir string) error { @@ -472,15 +498,11 @@ func (c *cmd) captureHeap(timestampDir string) error { return ioutil.WriteFile(fmt.Sprintf("%s/heap.prof", timestampDir), heap, 0644) } -func (c *cmd) captureLogs(timestampDir string) error { - endLogChn := make(chan struct{}) - timeIsUp := time.After(c.duration) - logCh, err := c.client.Agent().Monitor("DEBUG", endLogChn, nil) +func (c *cmd) captureLogs(ctx context.Context, timestampDir string) error { + logCh, err := c.client.Agent().Monitor("DEBUG", ctx.Done(), nil) if err != nil { return err } - // Close the log stream - defer close(endLogChn) // Create the log file for writing f, err := os.Create(fmt.Sprintf("%s/%s", timestampDir, "consul.log")) @@ -498,7 +520,7 @@ func (c *cmd) captureLogs(timestampDir string) error { if _, err = f.WriteString(log + "\n"); err != nil { return err } - case <-timeIsUp: + case <-ctx.Done(): return nil } } diff --git a/command/debug/debug_test.go b/command/debug/debug_test.go index 215475706c..22820e5cfb 100644 --- a/command/debug/debug_test.go +++ b/command/debug/debug_test.go @@ -26,7 +26,7 @@ import ( ) func TestDebugCommand_Help_TextContainsNoTabs(t *testing.T) { - if strings.ContainsRune(New(cli.NewMockUi(), nil).Help(), '\t') { + if strings.ContainsRune(New(cli.NewMockUi()).Help(), '\t') { t.Fatal("help has tabs") } } @@ -46,7 +46,7 @@ func TestDebugCommand(t *testing.T) { testrpc.WaitForLeader(t, a.RPC, "dc1") ui := cli.NewMockUi() - cmd := New(ui, nil) + cmd := New(ui) cmd.validateTiming = false outputPath := fmt.Sprintf("%s/debug", testDir) @@ -92,7 +92,7 @@ func TestDebugCommand_Archive(t *testing.T) { testrpc.WaitForLeader(t, a.RPC, "dc1") ui := cli.NewMockUi() - cmd := New(ui, nil) + cmd := New(ui) cmd.validateTiming = false outputPath := fmt.Sprintf("%s/debug", testDir) @@ -137,7 +137,7 @@ func TestDebugCommand_Archive(t *testing.T) { func TestDebugCommand_ArgsBad(t *testing.T) { ui := cli.NewMockUi() - cmd := New(ui, nil) + cmd := New(ui) args := []string{"foo", "bad"} @@ -153,7 +153,7 @@ func TestDebugCommand_ArgsBad(t *testing.T) { func TestDebugCommand_InvalidFlags(t *testing.T) { ui := cli.NewMockUi() - cmd := New(ui, nil) + cmd := New(ui) cmd.validateTiming = false outputPath := "" @@ -186,7 +186,7 @@ func TestDebugCommand_OutputPathBad(t *testing.T) { testrpc.WaitForLeader(t, a.RPC, "dc1") ui := cli.NewMockUi() - cmd := New(ui, nil) + cmd := New(ui) cmd.validateTiming = false outputPath := "" @@ -219,7 +219,7 @@ func TestDebugCommand_OutputPathExists(t *testing.T) { testrpc.WaitForLeader(t, a.RPC, "dc1") ui := cli.NewMockUi() - cmd := New(ui, nil) + cmd := New(ui) cmd.validateTiming = false outputPath := fmt.Sprintf("%s/debug", testDir) @@ -304,7 +304,7 @@ func TestDebugCommand_CaptureTargets(t *testing.T) { testrpc.WaitForLeader(t, a.RPC, "dc1") ui := cli.NewMockUi() - cmd := New(ui, nil) + cmd := New(ui) cmd.validateTiming = false outputPath := fmt.Sprintf("%s/debug-%s", testDir, name) @@ -387,7 +387,7 @@ func TestDebugCommand_CaptureLogs(t *testing.T) { testrpc.WaitForLeader(t, a.RPC, "dc1") ui := cli.NewMockUi() - cmd := New(ui, nil) + cmd := 
New(ui) cmd.validateTiming = false outputPath := fmt.Sprintf("%s/debug-%s", testDir, name) @@ -480,7 +480,7 @@ func TestDebugCommand_ProfilesExist(t *testing.T) { testrpc.WaitForLeader(t, a.RPC, "dc1") ui := cli.NewMockUi() - cmd := New(ui, nil) + cmd := New(ui) cmd.validateTiming = false outputPath := fmt.Sprintf("%s/debug", testDir) @@ -548,7 +548,7 @@ func TestDebugCommand_Prepare_ValidateTiming(t *testing.T) { for name, tc := range cases { t.Run(name, func(t *testing.T) { ui := cli.NewMockUi() - cmd := New(ui, nil) + cmd := New(ui) args := []string{ "-duration=" + tc.duration, @@ -579,7 +579,7 @@ func TestDebugCommand_DebugDisabled(t *testing.T) { testrpc.WaitForLeader(t, a.RPC, "dc1") ui := cli.NewMockUi() - cmd := New(ui, nil) + cmd := New(ui) cmd.validateTiming = false outputPath := fmt.Sprintf("%s/debug", testDir) diff --git a/command/registry.go b/command/registry.go index a96818c2f3..b400a92dfb 100644 --- a/command/registry.go +++ b/command/registry.go @@ -47,6 +47,7 @@ var registry map[string]Factory // MakeShutdownCh returns a channel that can be used for shutdown notifications // for commands. This channel will send a message for every interrupt or SIGTERM // received. +// Deprecated: use signal.NotifyContext func MakeShutdownCh() <-chan struct{} { resultCh := make(chan struct{}) signalCh := make(chan os.Signal, 4) From 26ef0df458578597b6c73c19cac391e92fde8ed9 Mon Sep 17 00:00:00 2001 From: Daniel Nephin Date: Wed, 18 Aug 2021 12:17:20 -0400 Subject: [PATCH 09/28] docs: update CLI reference docs for debug the cluster target was renamed to members. --- website/content/commands/debug.mdx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/website/content/commands/debug.mdx b/website/content/commands/debug.mdx index dc4282d1b9..ded40e58e6 100644 --- a/website/content/commands/debug.mdx +++ b/website/content/commands/debug.mdx @@ -8,13 +8,13 @@ page_title: 'Commands: Debug' Command: `consul debug` The `consul debug` command monitors a Consul agent for the specified period of -time, recording information about the agent, cluster, and environment to an archive +time, recording information about the agent, cluster membership, and environment to an archive written to the current directory. Providing support for complex issues encountered by Consul operators often requires a large amount of debugging information to be retrieved. This command aims to shortcut that coordination and provide a simple workflow for accessing -data about Consul agent, cluster, and environment to enable faster +data about Consul agent, cluster membership, and environment to enable faster isolation and debugging of issues. This command requires an `operator:read` ACL token in order to retrieve the @@ -75,7 +75,7 @@ information when `debug` is running. By default, it captures all information. | --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `agent` | Version and configuration information about the agent. | | `host` | Information about resources on the host running the target agent such as CPU, memory, and disk. | -| `cluster` | A list of all the WAN and LAN members in the cluster. 
| +| `members` | A list of all the WAN and LAN members in the cluster. | | `metrics` | Metrics from the in-memory metrics endpoint in the target, captured at the interval. | | `logs` | `DEBUG` level logs for the target agent, captured for the duration. | | `pprof` | Golang heap, CPU, goroutine, and trace profiling. CPU and traces are captured for `duration` in a single file while heap and goroutine are separate snapshots for each `interval`. This information is not retrieved unless [`enable_debug`](/docs/agent/options#enable_debug) is set to `true` on the target agent or ACLs are enable and an ACL token with `operator:read` is provided. | From 2f8d0e12cf23a42c4a4ffe9b1ca3e22a8ed68d7d Mon Sep 17 00:00:00 2001 From: Daniel Nephin Date: Fri, 6 Aug 2021 14:32:36 -0400 Subject: [PATCH 10/28] debug: small cleanup Use the new WriteJsonFile function to write index.json Remove .String() from time.local() since that is done by %s Remove an unused field. --- command/debug/debug.go | 37 ++++++++++++------------------------- 1 file changed, 12 insertions(+), 25 deletions(-) diff --git a/command/debug/debug.go b/command/debug/debug.go index 869161dadd..a9227e957b 100644 --- a/command/debug/debug.go +++ b/command/debug/debug.go @@ -18,10 +18,9 @@ import ( "syscall" "time" - "golang.org/x/sync/errgroup" - "github.com/hashicorp/go-multierror" "github.com/mitchellh/cli" + "golang.org/x/sync/errgroup" "github.com/hashicorp/consul/api" "github.com/hashicorp/consul/command/flags" @@ -86,8 +85,6 @@ type cmd struct { // validateTiming can be used to skip validation of interval, duration. This // is primarily useful for testing validateTiming bool - - index *debugIndex } // debugIndex is used to manage the summary of all data recorded @@ -178,15 +175,6 @@ func (c *cmd) Run(args []string) int { c.UI.Info(fmt.Sprintf(" Output: '%s'", archiveName)) c.UI.Info(fmt.Sprintf(" Capture: '%s'", strings.Join(c.capture, ", "))) - // Record some information for the index at the root of the archive - index := &debugIndex{ - Version: debugProtocolVersion, - AgentVersion: version, - Interval: c.interval.String(), - Duration: c.duration.String(), - Targets: c.capture, - } - // Add the extra grace period to ensure // all intervals will be captured within the time allotted c.duration = c.duration + debugDurationGrace @@ -212,15 +200,15 @@ func (c *cmd) Run(args []string) int { } } - // Write the index document - idxMarshalled, err := json.MarshalIndent(index, "", "\t") - if err != nil { - c.UI.Error(fmt.Sprintf("Error marshalling index document: %v", err)) - return 1 + // Record some information for the index at the root of the archive + index := &debugIndex{ + Version: debugProtocolVersion, + AgentVersion: version, + Interval: c.interval.String(), + Duration: c.duration.String(), + Targets: c.capture, } - - err = ioutil.WriteFile(fmt.Sprintf("%s/index.json", c.output), idxMarshalled, 0644) - if err != nil { + if err := writeJSONFile(fmt.Sprintf("%s/index.json", c.output), index); err != nil { c.UI.Error(fmt.Sprintf("Error creating index document: %v", err)) return 1 } @@ -346,13 +334,13 @@ func (c *cmd) captureInterval(ctx context.Context) error { durationChn := time.After(c.duration) intervalCount := 0 - c.UI.Output(fmt.Sprintf("Beginning capture interval %s (%d)", time.Now().Local().String(), intervalCount)) + c.UI.Output(fmt.Sprintf("Beginning capture interval %s (%d)", time.Now().Local(), intervalCount)) err := captureShortLived(c) if err != nil { return err } - c.UI.Output(fmt.Sprintf("Capture successful %s (%d)", 
time.Now().Local().String(), intervalCount)) + c.UI.Output(fmt.Sprintf("Capture successful %s (%d)", time.Now().Local(), intervalCount)) for { select { case t := <-intervalChn.C: @@ -361,7 +349,7 @@ func (c *cmd) captureInterval(ctx context.Context) error { if err != nil { return err } - c.UI.Output(fmt.Sprintf("Capture successful %s (%d)", t.Local().String(), intervalCount)) + c.UI.Output(fmt.Sprintf("Capture successful %s (%d)", t.Local(), intervalCount)) case <-durationChn: intervalChn.Stop() return nil @@ -436,7 +424,6 @@ func (c *cmd) captureLongRunning(ctx context.Context) error { } if c.captureTarget(targetMetrics) { g.Go(func() error { - ctx, cancel := context.WithTimeout(ctx, c.duration) defer cancel() return c.captureMetrics(ctx, timestampDir) From 049c4e9623e84e7fbcfca54cebeeb5c814c212c5 Mon Sep 17 00:00:00 2001 From: Daniel Nephin Date: Wed, 18 Aug 2021 12:54:11 -0400 Subject: [PATCH 11/28] add changelog --- .changelog/10804.txt | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .changelog/10804.txt diff --git a/.changelog/10804.txt b/.changelog/10804.txt new file mode 100644 index 0000000000..ed4323fd7d --- /dev/null +++ b/.changelog/10804.txt @@ -0,0 +1,3 @@ +```release-note:improvement +debug: rename cluster capture target to members, to be more consistent with the terms used by the API. +``` From 797ee061e4445f8f813b33d6bd6aee95959ab2d4 Mon Sep 17 00:00:00 2001 From: Daniel Nephin Date: Fri, 6 Aug 2021 17:17:25 -0400 Subject: [PATCH 12/28] debug: use human readable dates for filenames The unix timestamps that were used make the debug data a little bit more difficult to consume. By using human readable dates we can easily see when the profile data was collected. This commit also improves the test coverage. Two test cases are removed and the assertions from those cases are moved to TestDebugCommand. Now TestDebugCommand is able to validate the contents of all files. This change reduces the test runtime of the command/debug package by almost 50%. It also makes much more strict assertions about the contents by using gotest.tools/v3/fs. --- command/debug/debug.go | 82 ++++++------ command/debug/debug_test.go | 245 ++++++++++++------------------------ 2 files changed, 121 insertions(+), 206 deletions(-) diff --git a/command/debug/debug.go b/command/debug/debug.go index a9227e957b..1749d76b29 100644 --- a/command/debug/debug.go +++ b/command/debug/debug.go @@ -85,6 +85,9 @@ type cmd struct { // validateTiming can be used to skip validation of interval, duration. This // is primarily useful for testing validateTiming bool + // timeNow is a shim for testing, it is used to generate the time used in + // file paths. + timeNow func() time.Time } // debugIndex is used to manage the summary of all data recorded @@ -102,10 +105,16 @@ type debugIndex struct { Targets []string } +// timeDateformat is a modified version of time.RFC3339 which replaces colons with +// hyphens. This is to make it more convenient to untar these files, because +// tar assumes colons indicate the file is on a remote host, unless --force-local +// is used. +const timeDateFormat = "2006-01-02T15-04-05Z0700" + func (c *cmd) init() { c.flags = flag.NewFlagSet("", flag.ContinueOnError) - defaultFilename := fmt.Sprintf("consul-debug-%d", time.Now().Unix()) + defaultFilename := fmt.Sprintf("consul-debug-%v", time.Now().Format(timeDateFormat)) c.flags.Var((*flags.AppendSliceValue)(&c.capture), "capture", fmt.Sprintf("One or more types of information to capture. 
This can be used "+ @@ -130,6 +139,9 @@ func (c *cmd) init() { c.help = flags.Usage(help, c.flags) c.validateTiming = true + c.timeNow = func() time.Time { + return time.Now().UTC() + } } func (c *cmd) Run(args []string) int { @@ -208,7 +220,7 @@ func (c *cmd) Run(args []string) int { Duration: c.duration.String(), Targets: c.capture, } - if err := writeJSONFile(fmt.Sprintf("%s/index.json", c.output), index); err != nil { + if err := writeJSONFile(filepath.Join(c.output, "index.json"), index); err != nil { c.UI.Error(fmt.Sprintf("Error creating index document: %v", err)) return 1 } @@ -334,13 +346,13 @@ func (c *cmd) captureInterval(ctx context.Context) error { durationChn := time.After(c.duration) intervalCount := 0 - c.UI.Output(fmt.Sprintf("Beginning capture interval %s (%d)", time.Now().Local(), intervalCount)) + c.UI.Output(fmt.Sprintf("Beginning capture interval %s (%d)", time.Now().UTC(), intervalCount)) err := captureShortLived(c) if err != nil { return err } - c.UI.Output(fmt.Sprintf("Capture successful %s (%d)", time.Now().Local(), intervalCount)) + c.UI.Output(fmt.Sprintf("Capture successful %s (%d)", time.Now().UTC(), intervalCount)) for { select { case t := <-intervalChn.C: @@ -349,7 +361,7 @@ func (c *cmd) captureInterval(ctx context.Context) error { if err != nil { return err } - c.UI.Output(fmt.Sprintf("Capture successful %s (%d)", t.Local(), intervalCount)) + c.UI.Output(fmt.Sprintf("Capture successful %s (%d)", t.UTC(), intervalCount)) case <-durationChn: intervalChn.Stop() return nil @@ -361,43 +373,32 @@ func (c *cmd) captureInterval(ctx context.Context) error { func captureShortLived(c *cmd) error { g := new(errgroup.Group) - timestamp := time.Now().Local().Unix() - timestampDir, err := c.createTimestampDir(timestamp) + dir, err := makeIntervalDir(c.output, c.timeNow()) if err != nil { return err } if c.captureTarget(targetProfiles) { g.Go(func() error { - return c.captureHeap(timestampDir) + return c.captureHeap(dir) }) g.Go(func() error { - return c.captureGoRoutines(timestampDir) + return c.captureGoRoutines(dir) }) } return g.Wait() } -func (c *cmd) createTimestampDir(timestamp int64) (string, error) { - // Make the directory that will store all captured data - // for this interval - timestampDir := fmt.Sprintf("%s/%d", c.output, timestamp) - err := os.MkdirAll(timestampDir, 0755) - if err != nil { - return "", err +func makeIntervalDir(base string, now time.Time) (string, error) { + dir := filepath.Join(base, now.Format(timeDateFormat)) + if err := os.MkdirAll(dir, 0755); err != nil { + return "", fmt.Errorf("failed to create output directory %v: %w", dir, err) } - return timestampDir, nil + return dir, nil } func (c *cmd) captureLongRunning(ctx context.Context) error { - timestamp := time.Now().Local().Unix() - - timestampDir, err := c.createTimestampDir(timestamp) - if err != nil { - return err - } - g := new(errgroup.Group) // Capture a profile/trace with a minimum of 1s s := c.duration.Seconds() @@ -407,42 +408,42 @@ func (c *cmd) captureLongRunning(ctx context.Context) error { if c.captureTarget(targetProfiles) { g.Go(func() error { // use ctx without a timeout to allow the profile to finish sending - return c.captureProfile(ctx, s, timestampDir) + return c.captureProfile(ctx, s) }) g.Go(func() error { // use ctx without a timeout to allow the trace to finish sending - return c.captureTrace(ctx, s, timestampDir) + return c.captureTrace(ctx, s) }) } if c.captureTarget(targetLogs) { g.Go(func() error { ctx, cancel := context.WithTimeout(ctx, c.duration) defer 
cancel() - return c.captureLogs(ctx, timestampDir) + return c.captureLogs(ctx) }) } if c.captureTarget(targetMetrics) { g.Go(func() error { ctx, cancel := context.WithTimeout(ctx, c.duration) defer cancel() - return c.captureMetrics(ctx, timestampDir) + return c.captureMetrics(ctx) }) } return g.Wait() } -func (c *cmd) captureGoRoutines(timestampDir string) error { +func (c *cmd) captureGoRoutines(outputDir string) error { gr, err := c.client.Debug().Goroutine() if err != nil { return fmt.Errorf("failed to collect goroutine profile: %w", err) } - return ioutil.WriteFile(fmt.Sprintf("%s/goroutine.prof", timestampDir), gr, 0644) + return ioutil.WriteFile(filepath.Join(outputDir, "goroutine.prof"), gr, 0644) } -func (c *cmd) captureTrace(ctx context.Context, s float64, timestampDir string) error { +func (c *cmd) captureTrace(ctx context.Context, s float64) error { prof, err := c.client.Debug().PProf(ctx, "trace", int(s)) if err != nil { return fmt.Errorf("failed to collect cpu profile: %w", err) @@ -450,7 +451,7 @@ func (c *cmd) captureTrace(ctx context.Context, s float64, timestampDir string) defer prof.Close() r := bufio.NewReader(prof) - fh, err := os.Create(fmt.Sprintf("%s/trace.out", timestampDir)) + fh, err := os.Create(filepath.Join(c.output, "trace.out")) if err != nil { return err } @@ -459,7 +460,7 @@ func (c *cmd) captureTrace(ctx context.Context, s float64, timestampDir string) return err } -func (c *cmd) captureProfile(ctx context.Context, s float64, timestampDir string) error { +func (c *cmd) captureProfile(ctx context.Context, s float64) error { prof, err := c.client.Debug().PProf(ctx, "profile", int(s)) if err != nil { return fmt.Errorf("failed to collect cpu profile: %w", err) @@ -467,7 +468,7 @@ func (c *cmd) captureProfile(ctx context.Context, s float64, timestampDir string defer prof.Close() r := bufio.NewReader(prof) - fh, err := os.Create(fmt.Sprintf("%s/profile.prof", timestampDir)) + fh, err := os.Create(filepath.Join(c.output, "profile.prof")) if err != nil { return err } @@ -476,23 +477,23 @@ func (c *cmd) captureProfile(ctx context.Context, s float64, timestampDir string return err } -func (c *cmd) captureHeap(timestampDir string) error { +func (c *cmd) captureHeap(outputDir string) error { heap, err := c.client.Debug().Heap() if err != nil { return fmt.Errorf("failed to collect heap profile: %w", err) } - return ioutil.WriteFile(fmt.Sprintf("%s/heap.prof", timestampDir), heap, 0644) + return ioutil.WriteFile(filepath.Join(outputDir, "heap.prof"), heap, 0644) } -func (c *cmd) captureLogs(ctx context.Context, timestampDir string) error { +func (c *cmd) captureLogs(ctx context.Context) error { logCh, err := c.client.Agent().Monitor("DEBUG", ctx.Done(), nil) if err != nil { return err } // Create the log file for writing - f, err := os.Create(fmt.Sprintf("%s/%s", timestampDir, "consul.log")) + f, err := os.Create(filepath.Join(c.output, "consul.log")) if err != nil { return err } @@ -513,15 +514,14 @@ func (c *cmd) captureLogs(ctx context.Context, timestampDir string) error { } } -func (c *cmd) captureMetrics(ctx context.Context, timestampDir string) error { +func (c *cmd) captureMetrics(ctx context.Context) error { stream, err := c.client.Agent().MetricsStream(ctx) if err != nil { return err } defer stream.Close() - filename := fmt.Sprintf("%s/%s.json", timestampDir, "metrics") - fh, err := os.Create(filename) + fh, err := os.Create(filepath.Join(c.output, "metrics.json")) if err != nil { return fmt.Errorf("failed to create metrics file: %w", err) } diff --git 
a/command/debug/debug_test.go b/command/debug/debug_test.go index 22820e5cfb..e36cc58b15 100644 --- a/command/debug/debug_test.go +++ b/command/debug/debug_test.go @@ -3,10 +3,11 @@ package debug import ( "archive/tar" "bufio" + "bytes" "compress/gzip" + "encoding/json" "fmt" "io" - "io/ioutil" "os" "path/filepath" "regexp" @@ -18,6 +19,7 @@ import ( "github.com/mitchellh/cli" "github.com/stretchr/testify/require" "gotest.tools/v3/assert" + "gotest.tools/v3/assert/cmp" "gotest.tools/v3/fs" "github.com/hashicorp/consul/agent" @@ -49,12 +51,17 @@ func TestDebugCommand(t *testing.T) { cmd := New(ui) cmd.validateTiming = false + it := &incrementalTime{ + base: time.Date(2021, 7, 8, 9, 10, 11, 0, time.UTC), + } + cmd.timeNow = it.Now + outputPath := fmt.Sprintf("%s/debug", testDir) args := []string{ "-http-addr=" + a.HTTPAddr(), "-output=" + outputPath, - "-duration=100ms", - "-interval=50ms", + "-duration=2s", + "-interval=1s", "-archive=false", } @@ -64,17 +71,75 @@ func TestDebugCommand(t *testing.T) { expected := fs.Expected(t, fs.WithDir("debug", - fs.WithFile("agent.json", "", fs.MatchAnyFileContent), - fs.WithFile("host.json", "", fs.MatchAnyFileContent), - fs.WithFile("index.json", "", fs.MatchAnyFileContent), - fs.WithFile("members.json", "", fs.MatchAnyFileContent), - // TODO: make the sub-directory names predictable) + fs.WithFile("index.json", "", fs.MatchFileContent(validIndexJSON)), + fs.WithFile("agent.json", "", fs.MatchFileContent(validJSON)), + fs.WithFile("host.json", "", fs.MatchFileContent(validJSON)), + fs.WithFile("members.json", "", fs.MatchFileContent(validJSON)), + fs.WithFile("metrics.json", "", fs.MatchAnyFileContent), + fs.WithFile("consul.log", "", fs.MatchFileContent(validLogFile)), + fs.WithFile("profile.prof", "", fs.MatchFileContent(validProfileData)), + fs.WithFile("trace.out", "", fs.MatchAnyFileContent), + fs.WithDir("2021-07-08T09-10-12Z", + fs.WithFile("goroutine.prof", "", fs.MatchFileContent(validProfileData)), + fs.WithFile("heap.prof", "", fs.MatchFileContent(validProfileData))), + fs.WithDir("2021-07-08T09-10-13Z", + fs.WithFile("goroutine.prof", "", fs.MatchFileContent(validProfileData)), + fs.WithFile("heap.prof", "", fs.MatchFileContent(validProfileData))), + // Ignore the extra directories, they should be the same as the first two fs.MatchExtraFiles)) assert.Assert(t, fs.Equal(testDir, expected)) - metricsFiles, err := filepath.Glob(fmt.Sprintf("%s/*/%s", outputPath, "metrics.json")) - require.NoError(t, err) - require.Len(t, metricsFiles, 1) + require.Equal(t, "", ui.ErrorWriter.String(), "expected no error output") +} + +func validLogFile(raw []byte) fs.CompareResult { + scanner := bufio.NewScanner(bytes.NewReader(raw)) + for scanner.Scan() { + logLine := scanner.Text() + if !validateLogLine([]byte(logLine)) { + return cmp.ResultFailure(fmt.Sprintf("log line is not valid %s", logLine)) + } + } + if scanner.Err() != nil { + return cmp.ResultFailure(scanner.Err().Error()) + } + return cmp.ResultSuccess +} + +func validIndexJSON(raw []byte) fs.CompareResult { + var target debugIndex + decoder := json.NewDecoder(bytes.NewReader(raw)) + decoder.DisallowUnknownFields() + if err := decoder.Decode(&target); err != nil { + return cmp.ResultFailure(err.Error()) + } + return cmp.ResultSuccess +} + +func validJSON(raw []byte) fs.CompareResult { + var target interface{} + decoder := json.NewDecoder(bytes.NewReader(raw)) + if err := decoder.Decode(&target); err != nil { + return cmp.ResultFailure(err.Error()) + } + return cmp.ResultSuccess +} + +func 
validProfileData(raw []byte) fs.CompareResult { + if _, err := profile.ParseData(raw); err != nil { + return cmp.ResultFailure(err.Error()) + } + return cmp.ResultSuccess +} + +type incrementalTime struct { + base time.Time + next uint64 +} + +func (t *incrementalTime) Now() time.Time { + t.next++ + return t.base.Add(time.Duration(t.next) * time.Second) } func TestDebugCommand_Archive(t *testing.T) { @@ -267,11 +332,11 @@ func TestDebugCommand_CaptureTargets(t *testing.T) { "static": { []string{"agent", "host", "cluster"}, []string{"agent.json", "host.json", "members.json"}, - []string{"*/metrics.json"}, + []string{"metrics.json"}, }, "metrics-only": { []string{"metrics"}, - []string{"*/metrics.json"}, + []string{"metrics.json"}, []string{"agent.json", "host.json", "members.json"}, }, "all-but-pprof": { @@ -286,8 +351,8 @@ func TestDebugCommand_CaptureTargets(t *testing.T) { "host.json", "agent.json", "members.json", - "*/metrics.json", - "*/consul.log", + "metrics.json", + "consul.log", }, []string{}, }, @@ -356,100 +421,6 @@ func TestDebugCommand_CaptureTargets(t *testing.T) { } } -func TestDebugCommand_CaptureLogs(t *testing.T) { - if testing.Short() { - t.Skip("too slow for testing.Short") - } - - cases := map[string]struct { - // used in -target param - targets []string - // existence verified after execution - files []string - // non-existence verified after execution - excludedFiles []string - }{ - "logs-only": { - []string{"logs"}, - []string{"*/consul.log"}, - []string{"agent.json", "host.json", "cluster.json", "*/metrics.json"}, - }, - } - - for name, tc := range cases { - testDir := testutil.TempDir(t, "debug") - - a := agent.NewTestAgent(t, ` - enable_debug = true - `) - - defer a.Shutdown() - testrpc.WaitForLeader(t, a.RPC, "dc1") - - ui := cli.NewMockUi() - cmd := New(ui) - cmd.validateTiming = false - - outputPath := fmt.Sprintf("%s/debug-%s", testDir, name) - args := []string{ - "-http-addr=" + a.HTTPAddr(), - "-output=" + outputPath, - "-archive=false", - "-duration=1000ms", - "-interval=50ms", - } - for _, t := range tc.targets { - args = append(args, "-capture="+t) - } - - if code := cmd.Run(args); code != 0 { - t.Fatalf("should exit 0, got code: %d", code) - } - - errOutput := ui.ErrorWriter.String() - if errOutput != "" { - t.Errorf("expected no error output, got %q", errOutput) - } - - // Ensure the debug data was written - _, err := os.Stat(outputPath) - if err != nil { - t.Fatalf("output path should exist: %s", err) - } - - // Ensure the captured static files exist - for _, f := range tc.files { - path := fmt.Sprintf("%s/%s", outputPath, f) - // Glob ignores file system errors - fs, _ := filepath.Glob(path) - if len(fs) <= 0 { - t.Fatalf("%s: output data should exist for %s", name, f) - } - for _, logFile := range fs { - content, err := ioutil.ReadFile(logFile) - require.NoError(t, err) - scanner := bufio.NewScanner(strings.NewReader(string(content))) - for scanner.Scan() { - logLine := scanner.Text() - if !validateLogLine([]byte(logLine)) { - t.Fatalf("%s: log line is not valid %s", name, logLine) - } - } - } - } - - // Ensure any excluded files do not exist - for _, f := range tc.excludedFiles { - path := fmt.Sprintf("%s/%s", outputPath, f) - // Glob ignores file system errors - fs, _ := filepath.Glob(path) - if len(fs) > 0 { - t.Fatalf("%s: output data should not exist for %s", name, f) - } - } - } -} - func validateLogLine(content []byte) bool { fields := strings.SplitN(string(content), " ", 2) if len(fields) != 2 { @@ -466,62 +437,6 @@ func 
validateLogLine(content []byte) bool { return valid } -func TestDebugCommand_ProfilesExist(t *testing.T) { - if testing.Short() { - t.Skip("too slow for testing.Short") - } - - testDir := testutil.TempDir(t, "debug") - - a := agent.NewTestAgent(t, ` - enable_debug = true - `) - defer a.Shutdown() - testrpc.WaitForLeader(t, a.RPC, "dc1") - - ui := cli.NewMockUi() - cmd := New(ui) - cmd.validateTiming = false - - outputPath := fmt.Sprintf("%s/debug", testDir) - println(outputPath) - args := []string{ - "-http-addr=" + a.HTTPAddr(), - "-output=" + outputPath, - // CPU profile has a minimum of 1s - "-archive=false", - "-duration=2s", - "-interval=1s", - "-capture=pprof", - } - - if code := cmd.Run(args); code != 0 { - t.Fatalf("should exit 0, got code: %d", code) - } - - profiles := []string{"heap.prof", "profile.prof", "goroutine.prof", "trace.out"} - // Glob ignores file system errors - for _, v := range profiles { - fs, _ := filepath.Glob(fmt.Sprintf("%s/*/%s", outputPath, v)) - if len(fs) < 1 { - t.Errorf("output data should exist for %s", v) - } - for _, f := range fs { - if !strings.Contains(f, "trace.out") { - content, err := ioutil.ReadFile(f) - require.NoError(t, err) - _, err = profile.ParseData(content) - require.NoError(t, err) - } - } - } - - errOutput := ui.ErrorWriter.String() - if errOutput != "" { - t.Errorf("expected no error output, got %s", errOutput) - } -} - func TestDebugCommand_Prepare_ValidateTiming(t *testing.T) { cases := map[string]struct { duration string From 39b3c3e2bddc519ac1b79ca471967afd0c6d1641 Mon Sep 17 00:00:00 2001 From: Mike Wickett Date: Wed, 18 Aug 2021 16:39:16 -0400 Subject: [PATCH 13/28] chore: update alert banner (#10816) --- website/data/alert-banner.js | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/website/data/alert-banner.js b/website/data/alert-banner.js index df30c0ab06..62ffbfca2e 100644 --- a/website/data/alert-banner.js +++ b/website/data/alert-banner.js @@ -2,11 +2,11 @@ export const ALERT_BANNER_ACTIVE = true // https://github.com/hashicorp/web-components/tree/master/packages/alert-banner export default { - tag: 'Thank you', - url: 'https://hashiconf.com/europe', - text: 'HashiConf Europe is a wrap. Watch this year’s sessions on-demand.', - linkText: 'Watch Now', - // Set the `expirationDate prop with a datetime string (e.g. `2020-01-31T12:00:00-07:00`) + tag: 'Event', + url: 'https://hashiconf.com/global/?utm_campaign=22Q3_WW_HASHICONFGLOBAL_EVENT-USER&utm_source=CorpBanner&utm_medium=EVT&utm_offer=EVENT-USER', + text: 'Join us for HashiConf Global - product updates, technical sessions, workshops & more', + linkText: 'Register now', + // Set the expirationDate prop with a datetime string (e.g. '2020-01-31T12:00:00-07:00') // if you'd like the component to stop showing at or after a certain date - expirationDate: `2021-06-20T12:00:00-07:00`, + expirationDate: '2021-10-01T12:00:00-07:00', } From e62b1d05d887d6f98cdfa6801e9594b1773d123b Mon Sep 17 00:00:00 2001 From: Blake Covarrubias Date: Thu, 19 Aug 2021 11:18:55 -0700 Subject: [PATCH 14/28] docs: Add common CA config options to provider doc pages (#10842) Add the list of common Connect CA configuration options to the provider-specific CA docs. Previously these options were only documented under the agent configuration options. This change makes it so that all supported CA provider configuration options are available from a single location. 
Co-authored-by: Daniel Nephin --- website/content/docs/agent/options.mdx | 49 +++++----- website/content/docs/connect/ca/aws.mdx | 55 +++++++++-- website/content/docs/connect/ca/consul.mdx | 20 ++-- website/content/docs/connect/ca/index.mdx | 6 +- website/content/docs/connect/ca/vault.mdx | 98 ++++++++++++++----- .../http_api_connect_ca_common_options.mdx | 71 ++++++++++++++ 6 files changed, 233 insertions(+), 66 deletions(-) create mode 100644 website/content/partials/http_api_connect_ca_common_options.mdx diff --git a/website/content/docs/agent/options.mdx b/website/content/docs/agent/options.mdx index 946a177577..4909b3c12a 100644 --- a/website/content/docs/agent/options.mdx +++ b/website/content/docs/agent/options.mdx @@ -1165,7 +1165,7 @@ bind_addr = "{{ GetPrivateInterfaces | include \"network\" \"10.0.0.0/8\" | attr through mesh gateways. Defaults to false. This was added in Consul 1.8.0. - `ca_provider` ((#connect_ca_provider)) Controls which CA provider to - use for Connect's CA. Currently only the `consul` and `vault` providers are supported. + use for Connect's CA. Currently only the `aws-pca`, `consul`, and `vault` providers are supported. This is only used when initially bootstrapping the cluster. For an existing cluster, use the [Update CA Configuration Endpoint](/api/connect/ca#update-ca-configuration). @@ -1176,6 +1176,12 @@ bind_addr = "{{ GetPrivateInterfaces | include \"network\" \"10.0.0.0/8\" | attr The following providers are supported: + #### AWS ACM Private CA Provider (`ca_provider = "aws-pca"`) + + - `existing_arn` ((#aws_ca_existing_arn)) The Amazon Resource Name (ARN) of + an existing private CA in your ACM account. If specified, Consul will + attempt to use the existing CA to issue certificates. + #### Consul CA Provider (`ca_provider = "consul"`) - `private_key` ((#consul_ca_private_key)) The PEM contents of the @@ -1212,34 +1218,33 @@ bind_addr = "{{ GetPrivateInterfaces | include \"network\" \"10.0.0.0/8\" | attr There are also a number of common configuration options supported by all providers: - - `csr_max_concurrent` ((#ca_csr_max_concurrent)) Sets a limit on how - many Certificate Signing Requests will be processed concurrently. Defaults - to 0 (disabled). This is useful when you have more than one or two cores available - to the server. For example on an 8 core server, setting this to 1 will ensure - that even during a CA rotation no more than one server core on the leader will - be consumed at a time with generating new certificates. Setting this is recommended - **instead** of `csr_max_per_second` where you know there are multiple cores available - since it is simpler to reason about limiting CSR resources this way without - artificially slowing down rotations. Added in 1.4.1. + - `csr_max_concurrent` ((#ca_csr_max_concurrent)) Sets a limit on the number + of Certificate Signing Requests that can be processed concurrently. Defaults + to 0 (disabled). This is useful when you want to limit the number of CPU cores + available to the server for certificate signing operations. For example, on an + 8 core server, setting this to 1 will ensure that no more than one CPU core + will be consumed when generating or rotating certificates. Setting this is + recommended **instead** of `csr_max_per_second` when you want to limit the + number of cores consumed since it is simpler to reason about limiting CSR + resources this way without artificially slowing down rotations. Added in 1.4.1. 
- `csr_max_per_second` ((#ca_csr_max_per_second)) Sets a rate limit on the maximum number of Certificate Signing Requests (CSRs) the servers will accept. This is used to prevent CA rotation from causing unbounded CPU usage - on servers. It defaults to 50 which is conservative - a 2017 Macbook can process - about 100 per second using only ~40% of one CPU core - but sufficient for deployments + on servers. It defaults to 50 which is conservative – a 2017 Macbook can process + about 100 per second using only ~40% of one CPU core – but sufficient for deployments up to ~1500 service instances before the time it takes to rotate is impacted. For larger deployments we recommend increasing this based on the expected number of server instances and server resources, or use `csr_max_concurrent` instead - if servers have more than one core. Setting this to zero disables rate limiting. + if servers have more than one CPU core. Setting this to zero disables rate limiting. Added in 1.4.1. - `leaf_cert_ttl` ((#ca_leaf_cert_ttl)) The upper bound on the lease duration of a leaf certificate issued for a service. In most cases a new leaf certificate will be requested by a proxy before this limit is reached. This is also the effective limit on how long a server outage can last (with no leader) - before network connections will start being rejected, and as a result the defaults - is `72h` to last through a weekend without intervention. This value cannot - be lower than 1 hour or higher than 1 year. + before network connections will start being rejected. Defaults to `72h`. + This value cannot be lower than 1 hour or higher than 1 year. This value is also used when rotating out old root certificates from the cluster. When a root certificate has been inactive (rotated out) @@ -1252,9 +1257,9 @@ bind_addr = "{{ GetPrivateInterfaces | include \"network\" \"10.0.0.0/8\" | attr PKI paths given for Vault then this will be ignored. Currently supported options are `ec` or `rsa`. Default is `ec`. - It is required that all servers in a Datacenter have + It is required that all servers in a datacenter have the same config for the CA. It is recommended that servers in - different Datacenters have the same CA config for key type and size + different datacenters use the same key type and size, although the built-in CA and Vault provider will both allow mixed CA key types. @@ -1752,7 +1757,7 @@ bind_addr = "{{ GetPrivateInterfaces | include \"network\" \"10.0.0.0/8\" | attr failover if this is increased significantly as more logs will need to be replayed. In Consul 1.1.0 and later this defaults to 16384, and in prior versions it was set to 8192. - + Since Consul 1.10.0 this can be reloaded using `consul reload` or sending the server a `SIGHUP` to allow tuning snapshot activity without a rolling restart in emergencies. @@ -1768,7 +1773,7 @@ bind_addr = "{{ GetPrivateInterfaces | include \"network\" \"10.0.0.0/8\" | attr is increased significantly as more logs will need to be replayed. In Consul 1.1.0 and later this defaults to `30s`, and in prior versions it was set to `5s`. - + Since Consul 1.10.0 this can be reloaded using `consul reload` or sending the server a `SIGHUP` to allow tuning snapshot activity without a rolling restart in emergencies. @@ -1781,10 +1786,10 @@ bind_addr = "{{ GetPrivateInterfaces | include \"network\" \"10.0.0.0/8\" | attr consider reducing write throughput or the amount of data stored on Consul as it is likely under a load it is not designed to handle. 
The default value is 10000 which is suitable for all normal workloads. Added in Consul 1.5.3. - + Since Consul 1.10.0 this can be reloaded using `consul reload` or sending the server a `SIGHUP` to allow recovery without downtime when followers can't keep - up. + up. - `reap` This controls Consul's automatic reaping of child processes, which is useful if Consul is running as PID 1 in a Docker container. If this isn't diff --git a/website/content/docs/connect/ca/aws.mdx b/website/content/docs/connect/ca/aws.mdx index 77eca45875..46e215f87c 100644 --- a/website/content/docs/connect/ca/aws.mdx +++ b/website/content/docs/connect/ca/aws.mdx @@ -43,10 +43,19 @@ The IAM credential provided must have permission for the following actions: ## Configuration -The ACM Private CA provider is enabled by setting the `ca_provider` to -`"aws-pca"`. At this time there is only one, optional configuration value. +The ACM Private CA provider is enabled by setting the CA provider to +`"aws-pca"` in the agent's [`ca_provider`] configuration option, or via the +[`/connect/ca/configuration`] API endpoint. At this time there is only one, +optional configuration value. + +Example configurations are shown below: + + + + ```hcl +# ... connect { enabled = true ca_provider = "aws-pca" @@ -56,14 +65,33 @@ connect { } ``` -~> Note that suitable AWS IAM credentials are necessary for the provider to -work, however these are not configured in the Consul config which is typically -on disk and rely on the [standard AWS SDK configuration + + + + +```json +{ + "Provider": "aws-pca", + "Config": { + "ExistingARN": "arn:aws:acm-pca:region:account:certificate-authority/12345678-1234-1234-123456789012" + } +} +``` + + + + + +~> **Note**: Suitable AWS IAM credentials are necessary for the provider to +work. However, these are not configured in the Consul config which is typically +on disk, and instead rely on the [standard AWS SDK configuration locations](https://docs.aws.amazon.com/sdk-for-go/v1/developer-guide/configuring-sdk.html#specifying-credentials). -The configuration options are listed below. Note, the -first key is the value used in API calls and the second key (after the `/`) -is used if you're adding configuring to the agent's configuration file. +The configuration options are listed below. + +-> **Note**: The first key is the value used in API calls, and the second key + (after the `/`) is used if you are adding the configuration to the agent's + configuration file. - `ExistingARN` / `existing_arn` (`string: `) - The Amazon Resource Name (ARN) of an existing private CA in your ACM account. If specified, @@ -76,10 +104,12 @@ is used if you're adding configuring to the agent's configuration file. root, Consul will automatically create a new subordinate signed by the primary's root instead. - The default behavior with no `ExistingArn` specified is for Consul to + The default behavior with no `ExistingARN` specified is for Consul to create a new root CA in the primary datacenter and a subordinate CA in each secondary DC. +@include 'http_api_connect_ca_common_options.mdx' + ## Limitations ACM Private CA has several @@ -93,7 +123,7 @@ limitations described below. ### Unable to Cross-sign Other CAs It's not possible to cross-sign other CA provider's root certificates during a -migration. ACM Private CA is capable of doing that through a different work flow +migration. ACM Private CA is capable of doing that through a different workflow but is not able to blindly cross-sign another root certificate without a CSR being generated. 
Both Consul's built-in CA and Vault can do this and the current workflow for managing CAs relies on it. @@ -146,3 +176,8 @@ The number of certificates issued could be reduced by increasing [`leaf_cert_ttl`](/docs/agent/options#ca_leaf_cert_ttl) in the CA Provider configuration if the longer lived credentials are an acceptable risk tradeoff against the cost. + + +[`ca_config`]: /docs/agent/options#connect_ca_config +[`ca_provider`]: /docs/agent/options#connect_ca_provider +[`/connect/ca/configuration`]: /api-docs/connect/ca#update-ca-configuration diff --git a/website/content/docs/connect/ca/consul.mdx b/website/content/docs/connect/ca/consul.mdx index 72ed653aff..bad7effbb0 100644 --- a/website/content/docs/connect/ca/consul.mdx +++ b/website/content/docs/connect/ca/consul.mdx @@ -28,18 +28,25 @@ CA providers. ## Configuration The built-in CA provider has no required configuration. Enabling Connect -alone will configure the built-in CA provider and will automatically generate +alone will configure the built-in CA provider, and will automatically generate a root certificate and private key: + + ```hcl +# ... connect { enabled = true } ``` -The configuration options are listed below. Note, the -first key is the value used in API calls and the second key (after the `/`) -is used if you're adding configuring to the agent's configuration file. + + +The configuration options are listed below. + +-> **Note**: The first key is the value used in API calls, and the second key + (after the `/`) is used if you are adding the configuration to the agent's + configuration file. - `PrivateKey` / `private_key` (`string: ""`) - A PEM-encoded private key for signing operations. This must match the private key used for the root @@ -55,8 +62,7 @@ is used if you're adding configuring to the agent's configuration file. bootstrap with the ".consul" TLD. The cluster identifier can be found using the [CA List Roots endpoint](/api/connect/ca#list-ca-root-certificates). -There are also [common CA configuration options](/docs/agent/options#common-ca-config-options) -that are supported by all CA providers. +@include 'http_api_connect_ca_common_options.mdx' ## Specifying a Custom Private Key and Root Certificate @@ -89,7 +95,7 @@ There are two ways to have the Consul CA use a custom private key and root certi either through the `ca_config` section of the [Agent configuration](/docs/agent/options#connect_ca_config) (which can only be used during the cluster's initial bootstrap) or through the [Update CA Configuration endpoint](/api/connect/ca#update-ca-configuration). -Currently consul requires that root certificates are valid [SPIFFE SVID Signing certificates](https://github.com/spiffe/spiffe/blob/master/standards/X509-SVID.md) and that the URI encoded +Currently Consul requires that root certificates are valid [SPIFFE SVID Signing certificates](https://github.com/spiffe/spiffe/blob/master/standards/X509-SVID.md) and that the URI encoded in the SAN is the cluster identifier created at bootstrap with the ".consul" TLD. In this example, we will set the URI SAN to `spiffe://36cb52cd-4058-f811-0432-6798a240c5d3.consul`. diff --git a/website/content/docs/connect/ca/index.mdx b/website/content/docs/connect/ca/index.mdx index e3bd830494..94d7bf3946 100644 --- a/website/content/docs/connect/ca/index.mdx +++ b/website/content/docs/connect/ca/index.mdx @@ -16,7 +16,7 @@ The CA provider abstraction enables Consul to support multiple systems for storing and signing certificates. 
Consul ships with a [built-in CA](/docs/connect/ca/consul) which generates and stores the root certificate and private key on the Consul servers. Consul also has -built-in support for +support for using [Vault as a CA](/docs/connect/ca/vault). With Vault, the root certificate and private key material remain with the Vault cluster. @@ -24,8 +24,8 @@ and private key material remain with the Vault cluster. CA initialization happens automatically when a new Consul leader is elected as long as -[Connect is enabled](/docs/connect/configuration#agent-configuration) -and the CA system hasn't already been initialized. This initialization process +[Connect is enabled](/docs/connect/configuration#agent-configuration), +and the CA system has not already been initialized. This initialization process will generate the initial root certificates and setup the internal Consul server state. diff --git a/website/content/docs/connect/ca/vault.mdx b/website/content/docs/connect/ca/vault.mdx index 8bcb75a97d..6b96616b20 100644 --- a/website/content/docs/connect/ca/vault.mdx +++ b/website/content/docs/connect/ca/vault.mdx @@ -30,45 +30,80 @@ must be met: ## Configuration -The Vault CA is enabled by setting the `ca_provider` to `"vault"` and -setting the required configuration values. An example configuration -is shown below: +The Vault CA is enabled by setting the CA provider to `"vault"` and +setting the required configuration values. + +The configuration may either be provided in the agent's configuration file using +the [`ca_provider`] and [`ca_config`] options, or configured using the +[`/connect/ca/configuration`] API endpoint. + +Example configurations are shown below: + + + + ```hcl +# ... connect { - enabled = true - ca_provider = "vault" - ca_config { - address = "http://localhost:8200" - token = "..." - root_pki_path = "connect-root" - intermediate_pki_path = "connect-intermediate" - } + enabled = true + ca_provider = "vault" + ca_config { + address = "http://localhost:8200" + token = "..." + root_pki_path = "connect-root" + intermediate_pki_path = "connect-intermediate" + } } ``` -The configuration options are listed below. Note, the -first key is the value used in API calls and the second key (after the `/`) -is used if you're adding configuring to the agent's configuration file. + + + + +```json +{ + "Provider": "vault", + "Config": { + "Address": "http://localhost:8200", + "Token": "", + "RootPKIPath": "connect-root", + "IntermediatePKIPath": "connect-intermediate" + } +} +``` + + + + + +The configuration options are listed below. + +-> **Note**: The first key is the value used in API calls, and the second key + (after the `/`) is used if you are adding the configuration to the agent's + configuration file. - `Address` / `address` (`string: `) - The address of the Vault server. - `Token` / `token` (`string: `) - A token for accessing Vault. This is write-only and will not be exposed when reading the CA configuration. - This token must have proper privileges for the PKI paths configured. In Consul - 1.8.5 and later, if the token has the [renewable](https://www.vaultproject.io/api-docs/auth/token#renewable) + This token must have [proper privileges](#vault-acl-policies) for the PKI + paths configured. In Consul 1.8.5 and later, if the token has the [renewable](https://www.vaultproject.io/api-docs/auth/token#renewable) flag set, Consul will attempt to renew its lease periodically after half the duration has expired. 
- `RootPKIPath` / `root_pki_path` (`string: `) - The path to - a PKI secrets engine for the root certificate. If the path doesn't - exist, Consul will attempt to mount and configure this automatically. + a PKI secrets engine for the root certificate. If the path does not + exist, Consul will mount a new PKI secrets engine at the specified path with + a [`max_lease_ttl`](https://www.vaultproject.io/api/system/mounts#max_lease_ttl) + of 8760 hours, or 1 year. This TTL value specifies the expiry period of the + root certificate and is currently not configurable. - `IntermediatePKIPath` / `intermediate_pki_path` (`string: `) - The path to a PKI secrets engine for the generated intermediate certificate. This certificate will be signed by the configured root PKI path. If this - path doesn't exist, Consul will attempt to mount and configure this + path does not exist, Consul will attempt to mount and configure this automatically. - `CAFile` / `ca_file` (`string: ""`) - Specifies an optional path to the CA @@ -81,12 +116,12 @@ is used if you're adding configuring to the agent's configuration file. varies by OS and version. - `CertFile` / `cert_file` (`string: ""`) - Specifies the path to the - certificate used for Vault communication. If this is set then you need to - also set tls_key_file. + certificate used for Vault communication. If this is set, then you also need to + set `key_file`. - `KeyFile` / `key_file` (`string: ""`) - Specifies the path to the private - key used for Vault communication. If this is set then you need to also set - cert_file. + key used for Vault communication. If this is set, then you also need to set + `cert_file`. - `TLSServerName` / `tls_server_name` (`string: ""`) - Specifies an optional string used to set the SNI host when connecting to Vault via TLS. @@ -94,6 +129,8 @@ is used if you're adding configuring to the agent's configuration file. - `TLSSkipVerify` / `tls_skip_verify` (`bool: false`) - Specifies if SSL peer validation should be enforced. +@include 'http_api_connect_ca_common_options.mdx' + ## Root and Intermediate PKI Paths The Vault CA provider uses two separately configured @@ -124,6 +161,8 @@ granted full control of the Intermediate or Leaf CA for Connect clients. In this example the `RootPKIPath` is `connect_root` and the `IntermediatePKIPath` is `connect_inter`. These values should be updated for your environment. + + ```hcl # Existing PKI Mounts path "/sys/mounts" { @@ -153,12 +192,14 @@ path "/connect_inter/*" { path "auth/token/renew-self" { capabilities = [ "update" ] } - + path "auth/token/lookup-self" { capabilities = [ "read" ] } ``` + + ### Consul Managed PKI Paths The following Vault policy allows Consul to create and manage the PKI paths in Vault. @@ -168,6 +209,8 @@ control of the Root and Intermediate or Leaf CA for Connect clients. In this example the `RootPKIPath` is `connect_root` and the `IntermediatePKIPath` is `connect_inter`. These values should be updated for your environment. 
+ + ```hcl # Consul Managed PKI Mounts path "/sys/mounts" { @@ -190,3 +233,10 @@ path "/connect_inter/*" { capabilities = [ "create", "read", "update", "delete", "list" ] } ``` + + + + +[`ca_config`]: /docs/agent/options#connect_ca_config +[`ca_provider`]: /docs/agent/options#connect_ca_provider +[`/connect/ca/configuration`]: /api-docs/connect/ca#update-ca-configuration diff --git a/website/content/partials/http_api_connect_ca_common_options.mdx b/website/content/partials/http_api_connect_ca_common_options.mdx new file mode 100644 index 0000000000..d64d6ee475 --- /dev/null +++ b/website/content/partials/http_api_connect_ca_common_options.mdx @@ -0,0 +1,71 @@ +#### Common CA Config Options + +The following configuration options are supported by all CA providers: + +- `CSRMaxConcurrent` / `csr_max_concurrent` (`int: 0`) - Sets a limit on the + number of Certificate Signing Requests that can be processed concurrently. Defaults + to 0 (disabled). This is useful when you want to limit the number of CPU cores + available to the server for certificate signing operations. For example, on an + 8 core server, setting this to 1 will ensure that no more than one CPU core + will be consumed when generating or rotating certificates. Setting this is + recommended **instead** of `csr_max_per_second` when you want to limit the + number of cores consumed since it is simpler to reason about limiting CSR + resources this way without artificially slowing down rotations. Added in 1.4.1. + +- `CSRMaxPerSecond` / `csr_max_per_second` (`float: 50`) - Sets a rate limit + on the maximum number of Certificate Signing Requests (CSRs) the servers will + accept. This is used to prevent CA rotation from causing unbounded CPU usage + on servers. It defaults to 50 which is conservative – a 2017 MacBook can process + about 100 per second using only ~40% of one CPU core – but sufficient for deployments + up to ~1500 service instances before the time it takes to rotate is impacted. + For larger deployments we recommend increasing this based on the expected number + of server instances and server resources, or use `csr_max_concurrent` instead + if servers have more than one CPU core. Setting this to zero disables rate limiting. + Added in 1.4.1. + +- `LeafCertTTL` / `leaf_cert_ttl` (`duration: "72h"`) - The upper bound on the lease + duration of a leaf certificate issued for a service. In most cases a new leaf + certificate will be requested by a proxy before this limit is reached. This + is also the effective limit on how long a server outage can last (with no leader) + before network connections will start being rejected. Defaults to `72h`. This + value cannot be lower than 1 hour or higher than 1 year. + + This value is also used when rotating out old root certificates from + the cluster. When a root certificate has been inactive (rotated out) + for more than twice the _current_ `leaf_cert_ttl`, it will be removed + from the trusted list. + +- `PrivateKeyType` / `private_key_type` (`string: "ec"`) - The type of key to generate + for this CA. This is only used when the provider is generating a new key. If + `private_key` is set for the Consul provider, or existing root or intermediate + PKI paths given for Vault then this will be ignored. Currently supported options + are `ec` or `rsa`. Default is `ec`. + + It is required that all servers in a datacenter have + the same config for the CA. 
It is recommended that servers in + different datacenters use the same key type and size, although the built-in CA + and Vault provider will both allow mixed CA key types. + + Some CA providers (currently Vault) will not allow cross-signing a + new CA certificate with a different key type. This means that if you + migrate from an RSA-keyed Vault CA to an EC-keyed CA from any + provider, you may have to proceed without cross-signing which risks + temporary connection issues for workloads during the new certificate + rollout. We highly recommend testing this outside of production to + understand the impact, and suggest sticking to same key type where + possible. + + -> **Note**: This only affects _CA_ keys generated by the provider. + Leaf certificate keys are always EC 256 regardless of the CA + configuration. + +- `PrivateKeyBits` / `private_key_bits` (`string: ""`) - The length of key to + generate for this CA. This is only used when the provider is generating a new + key. If `private_key` is set for the Consul provider, or existing root or intermediate + PKI paths given for Vault then this will be ignored. + + Currently supported values are: + + - `private_key_type = ec` (default): `224, 256, 384, 521` + corresponding to the NIST P-\* curves of the same name. + - `private_key_type = rsa`: `2048, 4096` From 097e1645e32502e988463b73a4f218b1057ce2d0 Mon Sep 17 00:00:00 2001 From: "R.B. Boyer" <4903+rboyer@users.noreply.github.com> Date: Thu, 19 Aug 2021 15:09:42 -0500 Subject: [PATCH 15/28] agent: ensure that most agent behavior correctly respects partition configuration (#10880) --- agent/acl.go | 7 +- agent/agent.go | 32 +++-- agent/agent_endpoint.go | 102 ++++++++++++---- agent/agent_endpoint_oss.go | 13 ++ agent/agent_endpoint_test.go | 4 +- agent/agent_oss.go | 4 + agent/checks/alias.go | 2 +- agent/config/builder.go | 13 +- agent/config/runtime_oss.go | 3 +- agent/consul/acl.go | 5 + agent/consul/catalog_endpoint.go | 5 +- agent/consul/client_serf.go | 3 +- agent/consul/config_oss.go | 9 ++ agent/consul/coordinate_endpoint.go | 53 ++++++--- agent/consul/coordinate_endpoint_test.go | 5 - agent/consul/enterprise_server_oss.go | 2 +- agent/consul/filter.go | 1 + agent/consul/leader.go | 90 +++++++++----- agent/consul/leader_test.go | 20 +--- agent/consul/merge.go | 13 +- agent/consul/rtt.go | 12 +- agent/consul/segment_oss.go | 10 +- agent/consul/server_serf.go | 3 +- agent/consul/session_ttl.go | 2 + agent/consul/txn_endpoint_test.go | 1 - agent/coordinate_endpoint.go | 10 ++ agent/dns.go | 26 ++-- agent/http.go | 6 + agent/http_oss.go | 11 +- agent/local/state.go | 145 +++++++++++++++++++---- agent/proxycfg/connect_proxy.go | 6 +- agent/proxycfg/manager.go | 2 +- agent/proxycfg/manager_test.go | 4 +- agent/proxycfg/mesh_gateway.go | 6 +- agent/proxycfg/testing.go | 2 + agent/proxycfg/upstreams.go | 3 +- agent/rpc/subscribe/subscribe.go | 3 +- agent/rpcclient/health/view.go | 1 + agent/sidecar_service.go | 2 +- agent/structs/catalog.go | 5 - agent/txn_endpoint.go | 24 ++-- agent/ui_endpoint.go | 6 +- agent/user_event.go | 1 + api/agent_test.go | 6 +- api/coordinate_test.go | 8 +- command/rtt/rtt.go | 8 +- 46 files changed, 500 insertions(+), 199 deletions(-) create mode 100644 agent/agent_endpoint_oss.go create mode 100644 agent/consul/config_oss.go diff --git a/agent/acl.go b/agent/acl.go index 7e6a1455f7..f3600ba0b0 100644 --- a/agent/acl.go +++ b/agent/acl.go @@ -87,6 +87,8 @@ func (a *Agent) vetServiceUpdateWithAuthorizer(authz acl.Authorizer, serviceID s } func (a *Agent) 
vetCheckRegisterWithAuthorizer(authz acl.Authorizer, check *structs.HealthCheck) error { + // TODO(partitions) + var authzContext acl.AuthorizerContext check.FillAuthzContext(&authzContext) // Vet the check itself. @@ -147,7 +149,7 @@ func (a *Agent) filterMembers(token string, members *[]serf.Member) error { } var authzContext acl.AuthorizerContext - structs.DefaultEnterpriseMetaInDefaultPartition().FillAuthzContext(&authzContext) + a.agentEnterpriseMeta().FillAuthzContext(&authzContext) // Filter out members based on the node policy. m := *members for i := 0; i < len(m); i++ { @@ -188,7 +190,8 @@ func (a *Agent) filterChecksWithAuthorizer(authz acl.Authorizer, checks *map[str continue } } else { - structs.DefaultEnterpriseMetaInDefaultPartition().FillAuthzContext(&authzContext) + // TODO(partition): should this be a Default or Node flavored entmeta? + check.NodeEnterpriseMetaForPartition().FillAuthzContext(&authzContext) if authz.NodeRead(a.config.NodeName, &authzContext) == acl.Allow { continue } diff --git a/agent/agent.go b/agent/agent.go index b54af8b55f..5b9e4a8b89 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -434,6 +434,7 @@ func LocalConfig(cfg *config.RuntimeConfig) local.Config { DiscardCheckOutput: cfg.DiscardCheckOutput, NodeID: cfg.NodeID, NodeName: cfg.NodeName, + Partition: cfg.PartitionOrDefault(), TaggedAddresses: map[string]string{}, } for k, v := range cfg.TaggedAddresses { @@ -561,8 +562,9 @@ func (a *Agent) Start(ctx context.Context) error { State: a.State, Tokens: a.baseDeps.Tokens, Source: &structs.QuerySource{ - Datacenter: a.config.Datacenter, - Segment: a.config.SegmentName, + Datacenter: a.config.Datacenter, + Segment: a.config.SegmentName, + NodePartition: a.config.PartitionOrEmpty(), }, DNSConfig: proxycfg.DNSConfig{ Domain: a.config.DNSDomain, @@ -1529,11 +1531,13 @@ func (a *Agent) LocalMember() serf.Member { // LANMembers is used to retrieve the LAN members func (a *Agent) LANMembers() []serf.Member { + // TODO(partitions): filter this by the partition? return a.delegate.LANMembers() } // WANMembers is used to retrieve the WAN members func (a *Agent) WANMembers() []serf.Member { + // TODO(partitions): filter this by the partition by omitting wan results for now? if srv, ok := a.delegate.(*consul.Server); ok { return srv.WANMembers() } @@ -1646,11 +1650,12 @@ OUTER: for segment, coord := range cs { agentToken := a.tokens.AgentToken() req := structs.CoordinateUpdateRequest{ - Datacenter: a.config.Datacenter, - Node: a.config.NodeName, - Segment: segment, - Coord: coord, - WriteRequest: structs.WriteRequest{Token: agentToken}, + Datacenter: a.config.Datacenter, + Node: a.config.NodeName, + Segment: segment, + Coord: coord, + EnterpriseMeta: *a.agentEnterpriseMeta(), + WriteRequest: structs.WriteRequest{Token: agentToken}, } var reply struct{} // todo(kit) port all of these logger calls to hclog w/ loglevel configuration @@ -1674,7 +1679,7 @@ OUTER: // reapServicesInternal does a single pass, looking for services to reap. func (a *Agent) reapServicesInternal() { reaped := make(map[structs.ServiceID]bool) - for checkID, cs := range a.State.CriticalCheckStates(structs.WildcardEnterpriseMetaInDefaultPartition()) { + for checkID, cs := range a.State.AllCriticalCheckStates() { serviceID := cs.Check.CompoundServiceID() // There's nothing to do if there's no service. @@ -2004,7 +2009,7 @@ func (a *Agent) addServiceInternal(req addServiceInternalRequest) error { // Agent.Start does not have a snapshot, and we don't want to query // State.Checks each time. 
if req.checkStateSnapshot == nil { - req.checkStateSnapshot = a.State.Checks(structs.WildcardEnterpriseMetaInDefaultPartition()) + req.checkStateSnapshot = a.State.AllChecks() } // Create an associated health check @@ -2458,6 +2463,8 @@ func (a *Agent) addCheck(check *structs.HealthCheck, chkType *structs.CheckType, // Need its config to know whether we should reroute checks to it var proxy *structs.NodeService if service != nil { + // NOTE: Both services must live in the same namespace and + // partition so this will correctly scope the results. for _, svc := range a.State.Services(&service.EnterpriseMeta) { if svc.Proxy.DestinationServiceID == service.ID { proxy = svc @@ -2719,6 +2726,7 @@ func (a *Agent) addCheck(check *structs.HealthCheck, chkType *structs.CheckType, var rpcReq structs.NodeSpecificRequest rpcReq.Datacenter = a.config.Datacenter + rpcReq.EnterpriseMeta = *a.agentEnterpriseMeta() // The token to set is really important. The behavior below follows // the same behavior as anti-entropy: we use the user-specified token @@ -3297,7 +3305,7 @@ func (a *Agent) loadServices(conf *config.RuntimeConfig, snap map[structs.CheckI // unloadServices will deregister all services. func (a *Agent) unloadServices() error { - for id := range a.State.Services(structs.WildcardEnterpriseMetaInDefaultPartition()) { + for id := range a.State.AllServices() { if err := a.removeServiceLocked(id, false); err != nil { return fmt.Errorf("Failed deregistering service '%s': %v", id, err) } @@ -3411,7 +3419,7 @@ func (a *Agent) loadChecks(conf *config.RuntimeConfig, snap map[structs.CheckID] // unloadChecks will deregister all checks known to the local agent. func (a *Agent) unloadChecks() error { - for id := range a.State.Checks(structs.WildcardEnterpriseMetaInDefaultPartition()) { + for id := range a.State.AllChecks() { if err := a.removeCheckLocked(id, false); err != nil { return fmt.Errorf("Failed deregistering check '%s': %s", id, err) } @@ -3423,7 +3431,7 @@ func (a *Agent) unloadChecks() error { // checks. This is done before we reload our checks, so that we can properly // restore into the same state. func (a *Agent) snapshotCheckState() map[structs.CheckID]*structs.HealthCheck { - return a.State.Checks(structs.WildcardEnterpriseMetaInDefaultPartition()) + return a.State.AllChecks() } // loadMetadata loads node metadata fields from the agent config and diff --git a/agent/agent_endpoint.go b/agent/agent_endpoint.go index 216b8c8593..9f846484b9 100644 --- a/agent/agent_endpoint.go +++ b/agent/agent_endpoint.go @@ -82,6 +82,7 @@ func (s *HTTPHandlers) AgentSelf(resp http.ResponseWriter, req *http.Request) (i PrimaryDatacenter string NodeName string NodeID string + Partition string `json:",omitempty"` Revision string Server bool Version string @@ -90,6 +91,7 @@ func (s *HTTPHandlers) AgentSelf(resp http.ResponseWriter, req *http.Request) (i PrimaryDatacenter: s.agent.config.PrimaryDatacenter, NodeName: s.agent.config.NodeName, NodeID: string(s.agent.config.NodeID), + Partition: s.agent.config.PartitionOrEmpty(), Revision: s.agent.config.Revision, Server: s.agent.config.ServerMode, Version: s.agent.config.Version, @@ -305,6 +307,12 @@ func (s *HTTPHandlers) AgentServices(resp http.ResponseWriter, req *http.Request return nil, err } + if !s.validateRequestPartition(resp, &entMeta) { + return nil, nil + } + + // NOTE: we're explicitly fetching things in the requested partition and + // namespace here. 
services := s.agent.State.Services(&entMeta) if err := s.agent.filterServicesWithAuthorizer(authz, &services); err != nil { return nil, err @@ -368,6 +376,10 @@ func (s *HTTPHandlers) AgentService(resp http.ResponseWriter, req *http.Request) sid := structs.NewServiceID(id, &entMeta) + if !s.validateRequestPartition(resp, &entMeta) { + return nil, nil + } + dc := s.agent.config.Datacenter resultHash, service, err := s.agent.LocalBlockingQuery(false, hash, queryOpts.MaxQueryTime, @@ -400,6 +412,7 @@ func (s *HTTPHandlers) AgentService(resp http.ResponseWriter, req *http.Request) aSvc := buildAgentService(svc, dc) reply := &aSvc + // TODO(partitions): do we need to do anything here? rawHash, err := hashstructure.Hash(reply, nil) if err != nil { return "", nil, err @@ -432,6 +445,10 @@ func (s *HTTPHandlers) AgentChecks(resp http.ResponseWriter, req *http.Request) return nil, err } + if !s.validateRequestPartition(resp, &entMeta) { + return nil, nil + } + var filterExpression string s.parseFilter(req, &filterExpression) filter, err := bexpr.CreateFilter(filterExpression, nil, nil) @@ -439,6 +456,7 @@ func (s *HTTPHandlers) AgentChecks(resp http.ResponseWriter, req *http.Request) return nil, err } + // NOTE(partitions): this works because nodes exist in ONE partition checks := s.agent.State.Checks(&entMeta) if err := s.agent.filterChecksWithAuthorizer(authz, &checks); err != nil { return nil, err @@ -485,6 +503,8 @@ func (s *HTTPHandlers) AgentMembers(resp http.ResponseWriter, req *http.Request) } } + // TODO(partitions): likely partitions+segment integration will take care of this + var members []serf.Member if wan { members = s.agent.WANMembers() @@ -521,6 +541,7 @@ func (s *HTTPHandlers) AgentJoin(resp http.ResponseWriter, req *http.Request) (i wan := false if other := req.URL.Query().Get("wan"); other != "" { wan = true + // TODO(partitions) : block wan join } // Get the address @@ -616,6 +637,10 @@ func (s *HTTPHandlers) AgentRegisterCheck(resp http.ResponseWriter, req *http.Re return nil, err } + if !s.validateRequestPartition(resp, &args.EnterpriseMeta) { + return nil, nil + } + // Construct the health check. health := args.HealthCheck(s.agent.config.NodeName) @@ -674,6 +699,10 @@ func (s *HTTPHandlers) AgentDeregisterCheck(resp http.ResponseWriter, req *http. return nil, err } + if !s.validateRequestPartition(resp, &checkID.EnterpriseMeta) { + return nil, nil + } + if err := s.agent.RemoveCheck(checkID, true); err != nil { return nil, err } @@ -740,7 +769,7 @@ func (s *HTTPHandlers) AgentCheckUpdate(resp http.ResponseWriter, req *http.Requ return s.agentCheckUpdate(resp, req, checkID, update.Status, update.Output) } -func (s *HTTPHandlers) agentCheckUpdate(_resp http.ResponseWriter, req *http.Request, checkID types.CheckID, status string, output string) (interface{}, error) { +func (s *HTTPHandlers) agentCheckUpdate(resp http.ResponseWriter, req *http.Request, checkID types.CheckID, status string, output string) (interface{}, error) { cid := structs.NewCheckID(checkID, nil) // Get the provided token, if any, and vet against any ACL policies. 
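
To make the recurring pattern in the hunks above easier to follow: `validateRequestPartition` is called near the top of most handlers in this file, and the OSS build (see `agent/agent_endpoint_oss.go` later in this patch) stubs it out to always return `true`. The sketch below is a hypothetical illustration only, not the Enterprise implementation; the helper name, comparison logic, and error text are invented for the example and simply show the general shape of guard such a hook could perform.

```go
// Hypothetical illustration only (not HashiCorp's Enterprise code): a
// partition guard of the shape that validateRequestPartition implies.
package agent

import (
	"fmt"
	"net/http"
)

// checkRequestPartition is a made-up helper for this sketch. It returns true
// when the request may proceed, and writes a 400 response otherwise.
func checkRequestPartition(resp http.ResponseWriter, requested, agentPartition string) bool {
	// An empty partition in the request is treated as "use the agent's own".
	if requested == "" || requested == agentPartition {
		return true
	}
	resp.WriteHeader(http.StatusBadRequest)
	fmt.Fprintf(resp, "Partition %q does not match the agent's partition %q", requested, agentPartition)
	return false
}
```

In the handlers above, a `false` return is paired with `return nil, nil` so the handler stops without writing a second error, which is why the hook takes the `http.ResponseWriter` rather than returning an error.
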
@@ -762,6 +791,10 @@ func (s *HTTPHandlers) agentCheckUpdate(_resp http.ResponseWriter, req *http.Req return nil, err } + if !s.validateRequestPartition(resp, &cid.EnterpriseMeta) { + return nil, nil + } + if err := s.agent.updateTTLCheck(cid, status, output); err != nil { return nil, err } @@ -833,6 +866,10 @@ func (s *HTTPHandlers) AgentHealthServiceByID(resp http.ResponseWriter, req *htt return nil, err } + if !s.validateRequestPartition(resp, &entMeta) { + return nil, nil + } + sid := structs.NewServiceID(serviceID, &entMeta) dc := s.agent.config.Datacenter @@ -891,35 +928,38 @@ func (s *HTTPHandlers) AgentHealthServiceByName(resp http.ResponseWriter, req *h return nil, acl.ErrPermissionDenied } + if !s.validateRequestPartition(resp, &entMeta) { + return nil, nil + } + dc := s.agent.config.Datacenter code := http.StatusNotFound status := fmt.Sprintf("ServiceName %s Not Found", serviceName) - services := s.agent.State.Services(&entMeta) + + services := s.agent.State.ServicesByName(structs.NewServiceName(serviceName, &entMeta)) result := make([]api.AgentServiceChecksInfo, 0, 16) for _, service := range services { - if service.Service == serviceName { - sid := structs.NewServiceID(service.ID, &entMeta) + sid := structs.NewServiceID(service.ID, &entMeta) - scode, sstatus, healthChecks := agentHealthService(sid, s) - serviceInfo := buildAgentService(service, dc) - res := api.AgentServiceChecksInfo{ - AggregatedStatus: sstatus, - Checks: healthChecks, - Service: &serviceInfo, - } - result = append(result, res) - // When service is not found, we ignore it and keep existing HTTP status - if code == http.StatusNotFound { - code = scode - status = sstatus - } - // We take the worst of all statuses, so we keep iterating - // passing: 200 < warning: 429 < critical: 503 - if code < scode { - code = scode - status = sstatus - } + scode, sstatus, healthChecks := agentHealthService(sid, s) + serviceInfo := buildAgentService(service, dc) + res := api.AgentServiceChecksInfo{ + AggregatedStatus: sstatus, + Checks: healthChecks, + Service: &serviceInfo, + } + result = append(result, res) + // When service is not found, we ignore it and keep existing HTTP status + if code == http.StatusNotFound { + code = scode + status = sstatus + } + // We take the worst of all statuses, so we keep iterating + // passing: 200 < warning: 429 < critical: 503 + if code < scode { + code = scode + status = sstatus } } if returnTextPlain(req) { @@ -965,6 +1005,10 @@ func (s *HTTPHandlers) AgentRegisterService(resp http.ResponseWriter, req *http. return nil, err } + if !s.validateRequestPartition(resp, &args.EnterpriseMeta) { + return nil, nil + } + // Get the node service. 
ns := args.NodeService() if ns.Weights != nil { @@ -1104,6 +1148,10 @@ func (s *HTTPHandlers) AgentDeregisterService(resp http.ResponseWriter, req *htt return nil, err } + if !s.validateRequestPartition(resp, &sid.EnterpriseMeta) { + return nil, nil + } + if err := s.agent.RemoveService(sid); err != nil { return nil, err } @@ -1403,6 +1451,10 @@ func (s *HTTPHandlers) AgentConnectCALeafCert(resp http.ResponseWriter, req *htt args.MaxQueryTime = qOpts.MaxQueryTime args.Token = qOpts.Token + if !s.validateRequestPartition(resp, &args.EnterpriseMeta) { + return nil, nil + } + raw, m, err := s.agent.cache.Get(req.Context(), cachetype.ConnectCALeafName, &args) if err != nil { return nil, err @@ -1442,6 +1494,10 @@ func (s *HTTPHandlers) AgentConnectAuthorize(resp http.ResponseWriter, req *http return nil, BadRequestError{fmt.Sprintf("Request decode failed: %v", err)} } + if !s.validateRequestPartition(resp, &authReq.EnterpriseMeta) { + return nil, nil + } + authz, reason, cacheMeta, err := s.agent.ConnectAuthorize(token, &authReq) if err != nil { return nil, err diff --git a/agent/agent_endpoint_oss.go b/agent/agent_endpoint_oss.go new file mode 100644 index 0000000000..1c4dd44285 --- /dev/null +++ b/agent/agent_endpoint_oss.go @@ -0,0 +1,13 @@ +// +build !consulent + +package agent + +import ( + "net/http" + + "github.com/hashicorp/consul/agent/structs" +) + +func (s *HTTPHandlers) validateRequestPartition(_ http.ResponseWriter, _ *structs.EnterpriseMeta) bool { + return true +} diff --git a/agent/agent_endpoint_test.go b/agent/agent_endpoint_test.go index 0febf1405f..95a829de29 100644 --- a/agent/agent_endpoint_test.go +++ b/agent/agent_endpoint_test.go @@ -4180,7 +4180,7 @@ func testAgent_RegisterServiceDeregisterService_Sidecar(t *testing.T, extraHCL s resp.Body.String()) // Sanity the target service registration - svcs := a.State.Services(nil) + svcs := a.State.AllServices() // Parse the expected definition into a ServiceDefinition var sd structs.ServiceDefinition @@ -4229,7 +4229,7 @@ func testAgent_RegisterServiceDeregisterService_Sidecar(t *testing.T, extraHCL s require.NoError(err) require.Nil(obj) - svcs := a.State.Services(nil) + svcs := a.State.AllServices() _, ok = svcs[structs.NewServiceID(tt.wantNS.ID, nil)] if tt.wantSidecarIDLeftAfterDereg { require.True(ok, "removed non-sidecar service at "+tt.wantNS.ID) diff --git a/agent/agent_oss.go b/agent/agent_oss.go index 4be00ed806..88496e28be 100644 --- a/agent/agent_oss.go +++ b/agent/agent_oss.go @@ -50,3 +50,7 @@ func (a *Agent) stopLicenseManager() {} func (a *Agent) enterpriseStats() map[string]map[string]string { return nil } + +func (a *Agent) agentEnterpriseMeta() *structs.EnterpriseMeta { + return structs.NodeEnterpriseMetaInDefaultPartition() +} diff --git a/agent/checks/alias.go b/agent/checks/alias.go index b2fb9c0495..78c4a32b4c 100644 --- a/agent/checks/alias.go +++ b/agent/checks/alias.go @@ -108,7 +108,7 @@ func (c *CheckAlias) runLocal(stopCh chan struct{}) { } updateStatus := func() { - checks := c.Notify.Checks(structs.WildcardEnterpriseMetaInDefaultPartition()) + checks := c.Notify.Checks(c.WildcardEnterpriseMetaForPartition()) checksList := make([]*structs.HealthCheck, 0, len(checks)) for _, chk := range checks { checksList = append(checksList, chk) diff --git a/agent/config/builder.go b/agent/config/builder.go index 849c54b0dc..daae2c2f06 100644 --- a/agent/config/builder.go +++ b/agent/config/builder.go @@ -688,7 +688,7 @@ func (b *builder) build() (rt RuntimeConfig, err error) { } autoEncryptAllowTLS := 
boolVal(c.AutoEncrypt.AllowTLS) - autoConfig := b.autoConfigVal(c.AutoConfig) + autoConfig := b.autoConfigVal(c.AutoConfig, stringVal(c.Partition)) if autoEncryptAllowTLS || autoConfig.Enabled { connectEnabled = true } @@ -2231,7 +2231,7 @@ func (b *builder) makeAddrs(pri []net.Addr, sec []*net.IPAddr, port int) []net.A return x } -func (b *builder) autoConfigVal(raw AutoConfigRaw) AutoConfig { +func (b *builder) autoConfigVal(raw AutoConfigRaw, agentPartition string) AutoConfig { var val AutoConfig val.Enabled = boolValWithDefault(raw.Enabled, false) @@ -2259,12 +2259,12 @@ func (b *builder) autoConfigVal(raw AutoConfigRaw) AutoConfig { val.IPSANs = append(val.IPSANs, ip) } - val.Authorizer = b.autoConfigAuthorizerVal(raw.Authorization) + val.Authorizer = b.autoConfigAuthorizerVal(raw.Authorization, agentPartition) return val } -func (b *builder) autoConfigAuthorizerVal(raw AutoConfigAuthorizationRaw) AutoConfigAuthorizer { +func (b *builder) autoConfigAuthorizerVal(raw AutoConfigAuthorizationRaw, agentPartition string) AutoConfigAuthorizer { // Our config file syntax wraps the static authorizer configuration in a "static" stanza. However // internally we do not support multiple configured authorization types so the RuntimeConfig just // inlines the static one. While we can and probably should extend the authorization types in the @@ -2272,13 +2272,16 @@ func (b *builder) autoConfigAuthorizerVal(raw AutoConfigAuthorizationRaw) AutoCo // needed right now so the configuration types will remain simplistic until they need to be otherwise. var val AutoConfigAuthorizer + entMeta := structs.DefaultEnterpriseMetaInPartition(agentPartition) + entMeta.Normalize() + val.Enabled = boolValWithDefault(raw.Enabled, false) val.ClaimAssertions = raw.Static.ClaimAssertions val.AllowReuse = boolValWithDefault(raw.Static.AllowReuse, false) val.AuthMethod = structs.ACLAuthMethod{ Name: "Auto Config Authorizer", Type: "jwt", - EnterpriseMeta: *structs.DefaultEnterpriseMetaInDefaultPartition(), + EnterpriseMeta: *entMeta, Config: map[string]interface{}{ "JWTSupportedAlgs": raw.Static.JWTSupportedAlgs, "BoundAudiences": raw.Static.BoundAudiences, diff --git a/agent/config/runtime_oss.go b/agent/config/runtime_oss.go index a83946f106..fcc9135dc2 100644 --- a/agent/config/runtime_oss.go +++ b/agent/config/runtime_oss.go @@ -4,4 +4,5 @@ package config type EnterpriseRuntimeConfig struct{} -func (c *RuntimeConfig) PartitionOrEmpty() string { return "" } +func (c *RuntimeConfig) PartitionOrEmpty() string { return "" } +func (c *RuntimeConfig) PartitionOrDefault() string { return "" } diff --git a/agent/consul/acl.go b/agent/consul/acl.go index 461c05b82e..910e711a07 100644 --- a/agent/consul/acl.go +++ b/agent/consul/acl.go @@ -1457,6 +1457,7 @@ func (f *aclFilter) filterNodeServices(services **structs.NodeServices) { } var authzContext acl.AuthorizerContext + // TODO(partitions): put partition into this wildcard? structs.WildcardEnterpriseMetaInDefaultPartition().FillAuthzContext(&authzContext) if !f.allowNode((*services).Node.Node, &authzContext) { *services = nil @@ -1481,6 +1482,7 @@ func (f *aclFilter) filterNodeServiceList(services **structs.NodeServiceList) { } var authzContext acl.AuthorizerContext + // TODO(partitions): put partition into this wildcard? 
structs.WildcardEnterpriseMetaInDefaultPartition().FillAuthzContext(&authzContext) if !f.allowNode((*services).Node.Node, &authzContext) { *services = nil @@ -1578,6 +1580,7 @@ func (f *aclFilter) filterSessions(sessions *structs.Sessions) { func (f *aclFilter) filterCoordinates(coords *structs.Coordinates) { c := *coords var authzContext acl.AuthorizerContext + // TODO(partitions): put partition into this wildcard? structs.WildcardEnterpriseMetaInDefaultPartition().FillAuthzContext(&authzContext) for i := 0; i < len(c); i++ { @@ -1619,6 +1622,7 @@ func (f *aclFilter) filterNodeDump(dump *structs.NodeDump) { info := nd[i] // Filter nodes + // TODO(partitions): put partition into this wildcard? structs.WildcardEnterpriseMetaInDefaultPartition().FillAuthzContext(&authzContext) if node := info.Node; !f.allowNode(node, &authzContext) { f.logger.Debug("dropping node from result due to ACLs", "node", node) @@ -1687,6 +1691,7 @@ func (f *aclFilter) filterNodes(nodes *structs.Nodes) { n := *nodes var authzContext acl.AuthorizerContext + // TODO(partitions): put partition into this wildcard? structs.WildcardEnterpriseMetaInDefaultPartition().FillAuthzContext(&authzContext) for i := 0; i < len(n); i++ { diff --git a/agent/consul/catalog_endpoint.go b/agent/consul/catalog_endpoint.go index c3f9b9daa8..a48a8133aa 100644 --- a/agent/consul/catalog_endpoint.go +++ b/agent/consul/catalog_endpoint.go @@ -474,11 +474,10 @@ func (c *Catalog) ListNodes(args *structs.DCSpecificRequest, reply *structs.Inde &reply.QueryMeta, func(ws memdb.WatchSet, state *state.Store) error { var err error - // TODO(partitions) if len(args.NodeMetaFilters) > 0 { - reply.Index, reply.Nodes, err = state.NodesByMeta(ws, args.NodeMetaFilters, nil) + reply.Index, reply.Nodes, err = state.NodesByMeta(ws, args.NodeMetaFilters, &args.EnterpriseMeta) } else { - reply.Index, reply.Nodes, err = state.Nodes(ws, nil) + reply.Index, reply.Nodes, err = state.Nodes(ws, &args.EnterpriseMeta) } if err != nil { return err diff --git a/agent/consul/client_serf.go b/agent/consul/client_serf.go index eb10498b95..f4692ef528 100644 --- a/agent/consul/client_serf.go +++ b/agent/consul/client_serf.go @@ -61,6 +61,7 @@ func (c *Client) setupSerf(conf *serf.Config, ch chan serf.Event, path string) ( nodeID: c.config.NodeID, nodeName: c.config.NodeName, segment: c.config.Segment, + server: false, } conf.SnapshotPath = filepath.Join(c.config.DataDir, path) @@ -68,7 +69,7 @@ func (c *Client) setupSerf(conf *serf.Config, ch chan serf.Event, path string) ( return nil, err } - addEnterpriseSerfTags(conf.Tags) + addEnterpriseSerfTags(conf.Tags, c.config.agentEnterpriseMeta()) conf.ReconnectTimeoutOverride = libserf.NewReconnectOverride(c.logger) diff --git a/agent/consul/config_oss.go b/agent/consul/config_oss.go new file mode 100644 index 0000000000..b2e295724b --- /dev/null +++ b/agent/consul/config_oss.go @@ -0,0 +1,9 @@ +// +build !consulent + +package consul + +import "github.com/hashicorp/consul/agent/structs" + +func (c *Config) agentEnterpriseMeta() *structs.EnterpriseMeta { + return structs.NodeEnterpriseMetaInDefaultPartition() +} diff --git a/agent/consul/coordinate_endpoint.go b/agent/consul/coordinate_endpoint.go index 503a8a6f87..ae3e169245 100644 --- a/agent/consul/coordinate_endpoint.go +++ b/agent/consul/coordinate_endpoint.go @@ -86,10 +86,13 @@ func (c *Coordinate) batchApplyUpdates() error { break } + update.EnterpriseMeta.Normalize() + updates[i] = &structs.Coordinate{ - Node: update.Node, - Segment: update.Segment, - Coord: update.Coord, + 
Node: update.Node, + Segment: update.Segment, + Coord: update.Coord, + Partition: update.PartitionOrEmpty(), } i++ } @@ -138,12 +141,17 @@ func (c *Coordinate) Update(args *structs.CoordinateUpdateRequest, reply *struct } // Fetch the ACL token, if any, and enforce the node policy if enabled. - authz, err := c.srv.ResolveToken(args.Token) + authz, err := c.srv.ResolveTokenAndDefaultMeta(args.Token, &args.EnterpriseMeta, nil) if err != nil { return err } + + if err := c.srv.validateEnterpriseRequest(&args.EnterpriseMeta, false); err != nil { + return err + } + var authzContext acl.AuthorizerContext - structs.DefaultEnterpriseMetaInDefaultPartition().FillAuthzContext(&authzContext) + args.DefaultEnterpriseMetaForPartition().FillAuthzContext(&authzContext) if authz.NodeWrite(args.Node, &authzContext) != acl.Allow { return acl.ErrPermissionDenied } @@ -166,6 +174,8 @@ func (c *Coordinate) ListDatacenters(args *struct{}, reply *[]structs.Datacenter return err } + // TODO(partitions): + var out []structs.DatacenterMap // Strip the datacenter suffixes from all the node names. @@ -194,11 +204,19 @@ func (c *Coordinate) ListNodes(args *structs.DCSpecificRequest, reply *structs.I return err } + _, err := c.srv.ResolveTokenAndDefaultMeta(args.Token, &args.EnterpriseMeta, nil) + if err != nil { + return err + } + + if err := c.srv.validateEnterpriseRequest(&args.EnterpriseMeta, false); err != nil { + return err + } + return c.srv.blockingQuery(&args.QueryOptions, &reply.QueryMeta, func(ws memdb.WatchSet, state *state.Store) error { - // TODO(partitions) - index, coords, err := state.Coordinates(ws, nil) + index, coords, err := state.Coordinates(ws, &args.EnterpriseMeta) if err != nil { return err } @@ -220,21 +238,27 @@ func (c *Coordinate) Node(args *structs.NodeSpecificRequest, reply *structs.Inde // Fetch the ACL token, if any, and enforce the node policy if enabled. - authz, err := c.srv.ResolveToken(args.Token) + authz, err := c.srv.ResolveTokenAndDefaultMeta(args.Token, &args.EnterpriseMeta, nil) if err != nil { return err } + + if err := c.srv.validateEnterpriseRequest(&args.EnterpriseMeta, false); err != nil { + return err + } + var authzContext acl.AuthorizerContext - structs.WildcardEnterpriseMetaInDefaultPartition().FillAuthzContext(&authzContext) + args.WildcardEnterpriseMetaForPartition().FillAuthzContext(&authzContext) if authz.NodeRead(args.Node, &authzContext) != acl.Allow { return acl.ErrPermissionDenied } + // TODO(partitions): do we have to add EnterpriseMeta to the reply like in Catalog.ListServices? 
+ return c.srv.blockingQuery(&args.QueryOptions, &reply.QueryMeta, func(ws memdb.WatchSet, state *state.Store) error { - // TODO(partitions) - index, nodeCoords, err := state.Coordinate(ws, args.Node, nil) + index, nodeCoords, err := state.Coordinate(ws, args.Node, &args.EnterpriseMeta) if err != nil { return err } @@ -242,9 +266,10 @@ func (c *Coordinate) Node(args *structs.NodeSpecificRequest, reply *structs.Inde var coords structs.Coordinates for segment, coord := range nodeCoords { coords = append(coords, &structs.Coordinate{ - Node: args.Node, - Segment: segment, - Coord: coord, + Node: args.Node, + Segment: segment, + Partition: args.PartitionOrEmpty(), + Coord: coord, }) } reply.Index, reply.Coordinates = index, coords diff --git a/agent/consul/coordinate_endpoint_test.go b/agent/consul/coordinate_endpoint_test.go index 5741450f7c..646c85b19e 100644 --- a/agent/consul/coordinate_endpoint_test.go +++ b/agent/consul/coordinate_endpoint_test.go @@ -84,14 +84,12 @@ func TestCoordinate_Update(t *testing.T) { // Make sure the updates did not yet apply because the update period // hasn't expired. state := s1.fsm.State() - // TODO(partitions) _, c, err := state.Coordinate(nil, "node1", nil) if err != nil { t.Fatalf("err: %v", err) } require.Equal(t, lib.CoordinateSet{}, c) - // TODO(partitions) _, c, err = state.Coordinate(nil, "node2", nil) if err != nil { t.Fatalf("err: %v", err) @@ -107,7 +105,6 @@ func TestCoordinate_Update(t *testing.T) { // Wait a while and the updates should get picked up. time.Sleep(3 * s1.config.CoordinateUpdatePeriod) - // TODO(partitions) _, c, err = state.Coordinate(nil, "node1", nil) if err != nil { t.Fatalf("err: %v", err) @@ -117,7 +114,6 @@ func TestCoordinate_Update(t *testing.T) { } require.Equal(t, expected, c) - // TODO(partitions) _, c, err = state.Coordinate(nil, "node2", nil) if err != nil { t.Fatalf("err: %v", err) @@ -157,7 +153,6 @@ func TestCoordinate_Update(t *testing.T) { time.Sleep(3 * s1.config.CoordinateUpdatePeriod) numDropped := 0 for i := 0; i < spamLen; i++ { - // TODO(partitions) _, c, err = state.Coordinate(nil, fmt.Sprintf("bogusnode%d", i), nil) if err != nil { t.Fatalf("err: %v", err) diff --git a/agent/consul/enterprise_server_oss.go b/agent/consul/enterprise_server_oss.go index 90bf131303..12ede7bffd 100644 --- a/agent/consul/enterprise_server_oss.go +++ b/agent/consul/enterprise_server_oss.go @@ -71,7 +71,7 @@ func (s *Server) validateEnterpriseIntentionNamespace(ns string, _ bool) error { return errors.New("Namespaces is a Consul Enterprise feature") } -func addEnterpriseSerfTags(_ map[string]string) { +func addEnterpriseSerfTags(_ map[string]string, _ *structs.EnterpriseMeta) { // do nothing } diff --git a/agent/consul/filter.go b/agent/consul/filter.go index f650059e9a..a85ce65eed 100644 --- a/agent/consul/filter.go +++ b/agent/consul/filter.go @@ -47,6 +47,7 @@ func (t *txnResultsFilter) Filter(i int) bool { result.KV.EnterpriseMeta.FillAuthzContext(&authzContext) return t.authorizer.KeyRead(result.KV.Key, &authzContext) != acl.Allow case result.Node != nil: + // TODO(partitions): put partition into this wildcard? 
structs.WildcardEnterpriseMetaInDefaultPartition().FillAuthzContext(&authzContext) return t.authorizer.NodeRead(result.Node.Node, &authzContext) != acl.Allow case result.Service != nil: diff --git a/agent/consul/leader.go b/agent/consul/leader.go index d5239008e9..996abea621 100644 --- a/agent/consul/leader.go +++ b/agent/consul/leader.go @@ -532,6 +532,8 @@ func (s *Server) initializeACLs(ctx context.Context, upgrade bool) error { s.logger.Info("initializing acls") + // TODO(partitions): initialize acls in all of the partitions? + // Create/Upgrade the builtin global-management policy _, policy, err := s.fsm.State().ACLPolicyGetByID(nil, structs.ACLPolicyGlobalManagementID, structs.DefaultEnterpriseMetaInDefaultPartition()) if err != nil { @@ -1110,9 +1112,13 @@ func (s *Server) bootstrapConfigEntries(entries []structs.ConfigEntry) error { // reconcileReaped is used to reconcile nodes that have failed and been reaped // from Serf but remain in the catalog. This is done by looking for unknown nodes with serfHealth checks registered. // We generate a "reap" event to cause the node to be cleaned up. -func (s *Server) reconcileReaped(known map[string]struct{}) error { +func (s *Server) reconcileReaped(known map[string]struct{}, nodeEntMeta *structs.EnterpriseMeta) error { + if nodeEntMeta == nil { + nodeEntMeta = structs.NodeEnterpriseMetaInDefaultPartition() + } + state := s.fsm.State() - _, checks, err := state.ChecksInState(nil, api.HealthAny, structs.DefaultEnterpriseMetaInDefaultPartition()) + _, checks, err := state.ChecksInState(nil, api.HealthAny, nodeEntMeta) if err != nil { return err } @@ -1128,7 +1134,7 @@ func (s *Server) reconcileReaped(known map[string]struct{}) error { } // Get the node services, look for ConsulServiceID - _, services, err := state.NodeServices(nil, check.Node, structs.DefaultEnterpriseMetaInDefaultPartition()) + _, services, err := state.NodeServices(nil, check.Node, nodeEntMeta) if err != nil { return err } @@ -1139,8 +1145,7 @@ func (s *Server) reconcileReaped(known map[string]struct{}) error { CHECKS: for _, service := range services.Services { if service.ID == structs.ConsulServiceID { - // TODO(partitions) - _, node, err := state.GetNode(check.Node, nil) + _, node, err := state.GetNode(check.Node, nodeEntMeta) if err != nil { s.logger.Error("Unable to look up node with name", "name", check.Node, "error", err) continue CHECKS @@ -1165,6 +1170,7 @@ func (s *Server) reconcileReaped(known map[string]struct{}) error { "role": "node", }, } + addEnterpriseSerfTags(member.Tags, nodeEntMeta) // Create the appropriate tags if this was a server node if serverPort > 0 { @@ -1175,7 +1181,7 @@ func (s *Server) reconcileReaped(known map[string]struct{}) error { } // Attempt to reap this member - if err := s.handleReapMember(member); err != nil { + if err := s.handleReapMember(member, nodeEntMeta); err != nil { return err } } @@ -1187,23 +1193,28 @@ func (s *Server) reconcileReaped(known map[string]struct{}) error { func (s *Server) reconcileMember(member serf.Member) error { // Check if this is a member we should handle if !s.shouldHandleMember(member) { + // TODO(partition): log the partition name s.logger.Warn("skipping reconcile of node", "member", member) return nil } defer metrics.MeasureSince([]string{"leader", "reconcileMember"}, time.Now()) + + nodeEntMeta := getSerfMemberEnterpriseMeta(member) + var err error switch member.Status { case serf.StatusAlive: - err = s.handleAliveMember(member) + err = s.handleAliveMember(member, nodeEntMeta) case 
serf.StatusFailed: - err = s.handleFailedMember(member) + err = s.handleFailedMember(member, nodeEntMeta) case serf.StatusLeft: - err = s.handleLeftMember(member) + err = s.handleLeftMember(member, nodeEntMeta) case StatusReap: - err = s.handleReapMember(member) + err = s.handleReapMember(member, nodeEntMeta) } if err != nil { s.logger.Error("failed to reconcile member", + // TODO(partition): log the partition name "member", member, "error", err, ) @@ -1231,7 +1242,11 @@ func (s *Server) shouldHandleMember(member serf.Member) bool { // handleAliveMember is used to ensure the node // is registered, with a passing health check. -func (s *Server) handleAliveMember(member serf.Member) error { +func (s *Server) handleAliveMember(member serf.Member, nodeEntMeta *structs.EnterpriseMeta) error { + if nodeEntMeta == nil { + nodeEntMeta = structs.NodeEnterpriseMetaInDefaultPartition() + } + // Register consul service if a server var service *structs.NodeService if valid, parts := metadata.IsConsulServer(member); valid { @@ -1243,6 +1258,7 @@ func (s *Server) handleAliveMember(member serf.Member) error { Passing: 1, Warning: 1, }, + EnterpriseMeta: *nodeEntMeta, Meta: map[string]string{ // DEPRECATED - remove nonvoter in favor of read_replica in a future version of consul "non_voter": strconv.FormatBool(member.Tags["nonvoter"] == "1"), @@ -1263,8 +1279,7 @@ func (s *Server) handleAliveMember(member serf.Member) error { // Check if the node exists state := s.fsm.State() - // TODO(partitions) - _, node, err := state.GetNode(member.Name, nil) + _, node, err := state.GetNode(member.Name, nodeEntMeta) if err != nil { return err } @@ -1272,7 +1287,7 @@ func (s *Server) handleAliveMember(member serf.Member) error { // Check if the associated service is available if service != nil { match := false - _, services, err := state.NodeServices(nil, member.Name, structs.DefaultEnterpriseMetaInDefaultPartition()) + _, services, err := state.NodeServices(nil, member.Name, nodeEntMeta) if err != nil { return err } @@ -1290,7 +1305,7 @@ func (s *Server) handleAliveMember(member serf.Member) error { } // Check if the serfCheck is in the passing state - _, checks, err := state.NodeChecks(nil, member.Name, structs.DefaultEnterpriseMetaInDefaultPartition()) + _, checks, err := state.NodeChecks(nil, member.Name, nodeEntMeta) if err != nil { return err } @@ -1317,6 +1332,7 @@ AFTER_CHECK: Status: api.HealthPassing, Output: structs.SerfCheckAliveOutput, }, + EnterpriseMeta: *nodeEntMeta, } if node != nil { req.TaggedAddresses = node.TaggedAddresses @@ -1329,11 +1345,14 @@ AFTER_CHECK: // handleFailedMember is used to mark the node's status // as being critical, along with all checks as unknown. 
-func (s *Server) handleFailedMember(member serf.Member) error { +func (s *Server) handleFailedMember(member serf.Member, nodeEntMeta *structs.EnterpriseMeta) error { + if nodeEntMeta == nil { + nodeEntMeta = structs.NodeEnterpriseMetaInDefaultPartition() + } + // Check if the node exists state := s.fsm.State() - // TODO(partitions) - _, node, err := state.GetNode(member.Name, nil) + _, node, err := state.GetNode(member.Name, nodeEntMeta) if err != nil { return err } @@ -1343,9 +1362,11 @@ func (s *Server) handleFailedMember(member serf.Member) error { return nil } + // TODO(partitions): get the ent meta by parsing serf tags + if node.Address == member.Addr.String() { // Check if the serfCheck is in the critical state - _, checks, err := state.NodeChecks(nil, member.Name, structs.DefaultEnterpriseMetaInDefaultPartition()) + _, checks, err := state.NodeChecks(nil, member.Name, nodeEntMeta) if err != nil { return err } @@ -1359,10 +1380,11 @@ func (s *Server) handleFailedMember(member serf.Member) error { // Register with the catalog req := structs.RegisterRequest{ - Datacenter: s.config.Datacenter, - Node: member.Name, - ID: types.NodeID(member.Tags["id"]), - Address: member.Addr.String(), + Datacenter: s.config.Datacenter, + Node: member.Name, + EnterpriseMeta: *nodeEntMeta, + ID: types.NodeID(member.Tags["id"]), + Address: member.Addr.String(), Check: &structs.HealthCheck{ Node: member.Name, CheckID: structs.SerfCheckID, @@ -1381,18 +1403,22 @@ func (s *Server) handleFailedMember(member serf.Member) error { // handleLeftMember is used to handle members that gracefully // left. They are deregistered if necessary. -func (s *Server) handleLeftMember(member serf.Member) error { - return s.handleDeregisterMember("left", member) +func (s *Server) handleLeftMember(member serf.Member, nodeEntMeta *structs.EnterpriseMeta) error { + return s.handleDeregisterMember("left", member, nodeEntMeta) } // handleReapMember is used to handle members that have been // reaped after a prolonged failure. They are deregistered. -func (s *Server) handleReapMember(member serf.Member) error { - return s.handleDeregisterMember("reaped", member) +func (s *Server) handleReapMember(member serf.Member, nodeEntMeta *structs.EnterpriseMeta) error { + return s.handleDeregisterMember("reaped", member, nodeEntMeta) } // handleDeregisterMember is used to deregister a member of a given reason -func (s *Server) handleDeregisterMember(reason string, member serf.Member) error { +func (s *Server) handleDeregisterMember(reason string, member serf.Member, nodeEntMeta *structs.EnterpriseMeta) error { + if nodeEntMeta == nil { + nodeEntMeta = structs.NodeEnterpriseMetaInDefaultPartition() + } + // Do not deregister ourself. This can only happen if the current leader // is leaving. Instead, we should allow a follower to take-over and // deregister us later. 
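A small standalone sketch of the nil-defaulting convention the leader member handlers above adopt: callers may pass a nil enterprise meta, and each handler normalizes it to the default partition before touching catalog state. The types below are trimmed stand-ins used only to show the shape of the pattern.

package main

import "fmt"

type entMeta struct{ Partition string }

func defaultNodeMeta() *entMeta { return &entMeta{Partition: "default"} }

type member struct{ Name string }

func handleDeregisterMember(reason string, m member, nodeEntMeta *entMeta) error {
	if nodeEntMeta == nil {
		nodeEntMeta = defaultNodeMeta()
	}
	// From here on all catalog lookups and the deregister request would
	// carry nodeEntMeta, so the member is removed from the partition it
	// was registered in instead of always the default one.
	fmt.Printf("deregistering %q (%s) in partition %q\n", m.Name, reason, nodeEntMeta.Partition)
	return nil
}

func main() {
	// OSS callers pass nil; an enterprise caller would pass the partition
	// parsed from the member's serf tags.
	_ = handleDeregisterMember("left", member{Name: "node1"}, nil)
	_ = handleDeregisterMember("reaped", member{Name: "node2"}, &entMeta{Partition: "team-a"})
}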
@@ -1410,8 +1436,7 @@ func (s *Server) handleDeregisterMember(reason string, member serf.Member) error // Check if the node does not exist state := s.fsm.State() - // TODO(partitions) - _, node, err := state.GetNode(member.Name, nil) + _, node, err := state.GetNode(member.Name, nodeEntMeta) if err != nil { return err } @@ -1422,8 +1447,9 @@ func (s *Server) handleDeregisterMember(reason string, member serf.Member) error // Deregister the node s.logger.Info("deregistering member", "member", member.Name, "reason", reason) req := structs.DeregisterRequest{ - Datacenter: s.config.Datacenter, - Node: member.Name, + Datacenter: s.config.Datacenter, + Node: member.Name, + EnterpriseMeta: *nodeEntMeta, } _, err = s.raftApply(structs.DeregisterRequestType, &req) return err diff --git a/agent/consul/leader_test.go b/agent/consul/leader_test.go index 7463b794df..9089815712 100644 --- a/agent/consul/leader_test.go +++ b/agent/consul/leader_test.go @@ -49,7 +49,6 @@ func TestLeader_RegisterMember(t *testing.T) { // Client should be registered state := s1.fsm.State() retry.Run(t, func(r *retry.R) { - // TODO(partitions) _, node, err := state.GetNode(c1.config.NodeName, nil) if err != nil { r.Fatalf("err: %v", err) @@ -79,7 +78,6 @@ func TestLeader_RegisterMember(t *testing.T) { // Server should be registered retry.Run(t, func(r *retry.R) { - // TODO(partitions) _, node, err := state.GetNode(s1.config.NodeName, nil) if err != nil { r.Fatalf("err: %v", err) @@ -129,7 +127,6 @@ func TestLeader_FailedMember(t *testing.T) { // Should be registered state := s1.fsm.State() retry.Run(t, func(r *retry.R) { - // TODO(partitions) _, node, err := state.GetNode(c1.config.NodeName, nil) if err != nil { r.Fatalf("err: %v", err) @@ -191,7 +188,6 @@ func TestLeader_LeftMember(t *testing.T) { // Should be registered retry.Run(t, func(r *retry.R) { - // TODO(partitions) _, node, err := state.GetNode(c1.config.NodeName, nil) if err != nil { r.Fatalf("err: %v", err) @@ -207,7 +203,6 @@ func TestLeader_LeftMember(t *testing.T) { // Should be deregistered retry.Run(t, func(r *retry.R) { - // TODO(partitions) _, node, err := state.GetNode(c1.config.NodeName, nil) if err != nil { r.Fatalf("err: %v", err) @@ -243,7 +238,6 @@ func TestLeader_ReapMember(t *testing.T) { // Should be registered retry.Run(t, func(r *retry.R) { - // TODO(partitions) _, node, err := state.GetNode(c1.config.NodeName, nil) if err != nil { r.Fatalf("err: %v", err) @@ -269,7 +263,6 @@ func TestLeader_ReapMember(t *testing.T) { // anti-entropy will put it back. 
reaped := false for start := time.Now(); time.Since(start) < 5*time.Second; { - // TODO(partitions) _, node, err := state.GetNode(c1.config.NodeName, nil) if err != nil { t.Fatalf("err: %v", err) @@ -367,7 +360,7 @@ func TestLeader_CheckServersMeta(t *testing.T) { member.Tags["nonvoter"] = "1" member.Tags["read_replica"] = "1" member.Tags["build"] = versionToExpect - err := s1.handleAliveMember(member) + err := s1.handleAliveMember(member, nil) if err != nil { r.Fatalf("Unexpected error :%v", err) } @@ -439,7 +432,6 @@ func TestLeader_ReapServer(t *testing.T) { // s3 should be registered retry.Run(t, func(r *retry.R) { - // TODO(partitions) _, node, err := state.GetNode(s3.config.NodeName, nil) if err != nil { r.Fatalf("err: %v", err) @@ -454,14 +446,13 @@ func TestLeader_ReapServer(t *testing.T) { knownMembers[s1.config.NodeName] = struct{}{} knownMembers[s2.config.NodeName] = struct{}{} - err := s1.reconcileReaped(knownMembers) + err := s1.reconcileReaped(knownMembers, nil) if err != nil { t.Fatalf("Unexpected error :%v", err) } // s3 should be deregistered retry.Run(t, func(r *retry.R) { - // TODO(partitions) _, node, err := state.GetNode(s3.config.NodeName, nil) if err != nil { r.Fatalf("err: %v", err) @@ -517,7 +508,6 @@ func TestLeader_Reconcile_ReapMember(t *testing.T) { // Node should be gone state := s1.fsm.State() - // TODO(partitions) _, node, err := state.GetNode("no-longer-around", nil) if err != nil { t.Fatalf("err: %v", err) @@ -551,7 +541,6 @@ func TestLeader_Reconcile(t *testing.T) { // Should not be registered state := s1.fsm.State() - // TODO(partitions) _, node, err := state.GetNode(c1.config.NodeName, nil) if err != nil { t.Fatalf("err: %v", err) @@ -562,7 +551,6 @@ func TestLeader_Reconcile(t *testing.T) { // Should be registered retry.Run(t, func(r *retry.R) { - // TODO(partitions) _, node, err := state.GetNode(c1.config.NodeName, nil) if err != nil { r.Fatalf("err: %v", err) @@ -595,7 +583,6 @@ func TestLeader_Reconcile_Races(t *testing.T) { state := s1.fsm.State() var nodeAddr string retry.Run(t, func(r *retry.R) { - // TODO(partitions) _, node, err := state.GetNode(c1.config.NodeName, nil) if err != nil { r.Fatalf("err: %v", err) @@ -632,7 +619,6 @@ func TestLeader_Reconcile_Races(t *testing.T) { if err := s1.reconcile(); err != nil { t.Fatalf("err: %v", err) } - // TODO(partitions) _, node, err := state.GetNode(c1.config.NodeName, nil) if err != nil { t.Fatalf("err: %v", err) @@ -657,7 +643,6 @@ func TestLeader_Reconcile_Races(t *testing.T) { }) // Make sure the metadata didn't get clobbered. 
- // TODO(partitions) _, node, err = state.GetNode(c1.config.NodeName, nil) if err != nil { t.Fatalf("err: %v", err) @@ -773,7 +758,6 @@ func TestLeader_LeftLeader(t *testing.T) { // Verify the old leader is deregistered state := remain.fsm.State() retry.Run(t, func(r *retry.R) { - // TODO(partitions) _, node, err := state.GetNode(leader.config.NodeName, nil) if err != nil { r.Fatalf("err: %v", err) diff --git a/agent/consul/merge.go b/agent/consul/merge.go index 006e41355a..16bc55ddc8 100644 --- a/agent/consul/merge.go +++ b/agent/consul/merge.go @@ -3,10 +3,11 @@ package consul import ( "fmt" - "github.com/hashicorp/consul/agent/metadata" - "github.com/hashicorp/consul/types" "github.com/hashicorp/go-version" "github.com/hashicorp/serf/serf" + + "github.com/hashicorp/consul/agent/metadata" + "github.com/hashicorp/consul/types" ) // lanMergeDelegate is used to handle a cluster merge on the LAN gossip @@ -17,6 +18,14 @@ type lanMergeDelegate struct { nodeID types.NodeID nodeName string segment string + + // TODO(partitions): use server and partition to reject gossip messages + // from nodes in the wrong partition depending upon the role the node is + // playing. For example servers will always be in the default partition, + // but all clients in all partitions should be aware of the servers so that + // general RPC routing works. + server bool + partition string } // uniqueIDMinVersion is the lowest version where we insist that nodes diff --git a/agent/consul/rtt.go b/agent/consul/rtt.go index 9a77802513..caf8116ed0 100644 --- a/agent/consul/rtt.go +++ b/agent/consul/rtt.go @@ -22,8 +22,7 @@ func (s *Server) newNodeSorter(cs lib.CoordinateSet, nodes structs.Nodes) (sort. state := s.fsm.State() vec := make([]float64, len(nodes)) for i, node := range nodes { - // TODO(partitions) - _, other, err := state.Coordinate(nil, node.Node, nil) + _, other, err := state.Coordinate(nil, node.Node, node.GetEnterpriseMeta()) if err != nil { return nil, err } @@ -63,8 +62,7 @@ func (s *Server) newServiceNodeSorter(cs lib.CoordinateSet, nodes structs.Servic state := s.fsm.State() vec := make([]float64, len(nodes)) for i, node := range nodes { - // TODO(partitions) - _, other, err := state.Coordinate(nil, node.Node, nil) + _, other, err := state.Coordinate(nil, node.Node, &node.EnterpriseMeta) if err != nil { return nil, err } @@ -104,8 +102,7 @@ func (s *Server) newHealthCheckSorter(cs lib.CoordinateSet, checks structs.Healt state := s.fsm.State() vec := make([]float64, len(checks)) for i, check := range checks { - // TODO(partitions) - _, other, err := state.Coordinate(nil, check.Node, nil) + _, other, err := state.Coordinate(nil, check.Node, &check.EnterpriseMeta) if err != nil { return nil, err } @@ -145,8 +142,7 @@ func (s *Server) newCheckServiceNodeSorter(cs lib.CoordinateSet, nodes structs.C state := s.fsm.State() vec := make([]float64, len(nodes)) for i, node := range nodes { - // TODO(partitions) - _, other, err := state.Coordinate(nil, node.Node.Node, nil) + _, other, err := state.Coordinate(nil, node.Node.Node, node.Node.GetEnterpriseMeta()) if err != nil { return nil, err } diff --git a/agent/consul/segment_oss.go b/agent/consul/segment_oss.go index 690132c347..8e8936a3c5 100644 --- a/agent/consul/segment_oss.go +++ b/agent/consul/segment_oss.go @@ -8,8 +8,9 @@ import ( "github.com/armon/go-metrics" "github.com/armon/go-metrics/prometheus" - "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/serf/serf" + + "github.com/hashicorp/consul/agent/structs" ) var SegmentOSSSummaries = 
[]prometheus.SummaryDefinition{ @@ -62,12 +63,17 @@ func (s *Server) setupSegments(config *Config, port int, rpcListeners map[string func (s *Server) floodSegments(config *Config) { } +func getSerfMemberEnterpriseMeta(member serf.Member) *structs.EnterpriseMeta { + return structs.NodeEnterpriseMetaInDefaultPartition() +} + // reconcile is used to reconcile the differences between Serf membership and // what is reflected in our strongly consistent store. Mainly we need to ensure // all live nodes are registered, all failed nodes are marked as such, and all // left nodes are deregistered. func (s *Server) reconcile() (err error) { defer metrics.MeasureSince([]string{"leader", "reconcile"}, time.Now()) + members := s.serfLAN.Members() knownMembers := make(map[string]struct{}) for _, member := range members { @@ -79,5 +85,5 @@ func (s *Server) reconcile() (err error) { // Reconcile any members that have been reaped while we were not the // leader. - return s.reconcileReaped(knownMembers) + return s.reconcileReaped(knownMembers, nil) } diff --git a/agent/consul/server_serf.go b/agent/consul/server_serf.go index f26c843028..d0c698daa5 100644 --- a/agent/consul/server_serf.go +++ b/agent/consul/server_serf.go @@ -117,6 +117,7 @@ func (s *Server) setupSerf(conf *serf.Config, ch chan serf.Event, path string, w nodeID: s.config.NodeID, nodeName: s.config.NodeName, segment: segment, + server: true, } } @@ -175,7 +176,7 @@ func (s *Server) setupSerf(conf *serf.Config, ch chan serf.Event, path string, w conf.ReconnectTimeoutOverride = libserf.NewReconnectOverride(s.logger) - addEnterpriseSerfTags(conf.Tags) + addEnterpriseSerfTags(conf.Tags, s.config.agentEnterpriseMeta()) if s.config.OverrideInitialSerfTags != nil { s.config.OverrideInitialSerfTags(conf.Tags) diff --git a/agent/consul/session_ttl.go b/agent/consul/session_ttl.go index 497fc7b540..426179d96a 100644 --- a/agent/consul/session_ttl.go +++ b/agent/consul/session_ttl.go @@ -6,6 +6,7 @@ import ( "github.com/armon/go-metrics" "github.com/armon/go-metrics/prometheus" + "github.com/hashicorp/consul/agent/structs" ) @@ -46,6 +47,7 @@ func (s *Server) initializeSessionTimers() error { // Scan all sessions and reset their timer state := s.fsm.State() + // TODO(partitions): track all session timers in all partitions _, sessions, err := state.SessionList(nil, structs.WildcardEnterpriseMetaInDefaultPartition()) if err != nil { return err diff --git a/agent/consul/txn_endpoint_test.go b/agent/consul/txn_endpoint_test.go index 53f9b36ded..4abb691402 100644 --- a/agent/consul/txn_endpoint_test.go +++ b/agent/consul/txn_endpoint_test.go @@ -234,7 +234,6 @@ func TestTxn_Apply(t *testing.T) { t.Fatalf("bad: %v", d) } - // TODO(partitions) _, n, err := state.GetNode("foo", nil) if err != nil { t.Fatalf("err: %v", err) diff --git a/agent/coordinate_endpoint.go b/agent/coordinate_endpoint.go index 596f63ae06..08f92f7d9a 100644 --- a/agent/coordinate_endpoint.go +++ b/agent/coordinate_endpoint.go @@ -82,6 +82,9 @@ func (s *HTTPHandlers) CoordinateNodes(resp http.ResponseWriter, req *http.Reque if done := s.parse(resp, req, &args.Datacenter, &args.QueryOptions); done { return nil, nil } + if err := parseEntMetaPartition(req, &args.EnterpriseMeta); err != nil { + return nil, err + } var out structs.IndexedCoordinates defer setMeta(resp, &out.QueryMeta) @@ -105,6 +108,9 @@ func (s *HTTPHandlers) CoordinateNode(resp http.ResponseWriter, req *http.Reques if done := s.parse(resp, req, &args.Datacenter, &args.QueryOptions); done { return nil, nil } + if err := 
parseEntMetaPartition(req, &args.EnterpriseMeta); err != nil { + return nil, err + } var out structs.IndexedCoordinates defer setMeta(resp, &out.QueryMeta) @@ -158,6 +164,10 @@ func (s *HTTPHandlers) CoordinateUpdate(resp http.ResponseWriter, req *http.Requ s.parseDC(req, &args.Datacenter) s.parseToken(req, &args.Token) + if err := s.parseEntMetaNoWildcard(req, &args.EnterpriseMeta); err != nil { + return nil, err + } + var reply struct{} if err := s.agent.RPC("Coordinate.Update", &args, &reply); err != nil { return nil, err diff --git a/agent/dns.go b/agent/dns.go index 8224b27306..5693178c28 100644 --- a/agent/dns.go +++ b/agent/dns.go @@ -122,6 +122,8 @@ type DNSServer struct { // recursorEnabled stores whever the recursor handler is enabled as an atomic flag. // the recursor handler is only enabled if recursors are configured. This flag is used during config hot-reloading recursorEnabled uint32 + + defaultEnterpriseMeta structs.EnterpriseMeta } func NewDNSServer(a *Agent) (*DNSServer, error) { @@ -130,10 +132,11 @@ func NewDNSServer(a *Agent) (*DNSServer, error) { altDomain := dns.Fqdn(strings.ToLower(a.config.DNSAltDomain)) srv := &DNSServer{ - agent: a, - domain: domain, - altDomain: altDomain, - logger: a.logger.Named(logging.DNS), + agent: a, + domain: domain, + altDomain: altDomain, + logger: a.logger.Named(logging.DNS), + defaultEnterpriseMeta: *a.agentEnterpriseMeta(), } cfg, err := GetDNSConfig(a.config) if err != nil { @@ -414,7 +417,7 @@ func (d *DNSServer) handlePtr(resp dns.ResponseWriter, req *dns.Msg) { AllowStale: cfg.AllowStale, }, ServiceAddress: serviceAddress, - EnterpriseMeta: *structs.WildcardEnterpriseMetaInDefaultPartition(), + EnterpriseMeta: *d.defaultEnterpriseMeta.WildcardEnterpriseMetaForPartition(), } var sout structs.IndexedServiceNodes @@ -548,7 +551,7 @@ func (d *DNSServer) nameservers(cfg *dnsConfig, maxRecursionLevel int) (ns []dns Service: structs.ConsulServiceName, Connect: false, Ingress: false, - EnterpriseMeta: *structs.DefaultEnterpriseMetaInDefaultPartition(), + EnterpriseMeta: d.defaultEnterpriseMeta, }) if err != nil { d.logger.Warn("Unable to get list of servers", "error", err) @@ -645,8 +648,8 @@ func (d *DNSServer) dispatch(remoteAddr net.Addr, req, resp *dns.Msg, maxRecursi // By default the query is in the default datacenter datacenter := d.agent.config.Datacenter - // have to deref to clone it so we don't modify - var entMeta structs.EnterpriseMeta + // have to deref to clone it so we don't modify (start from the agent's defaults) + var entMeta = d.defaultEnterpriseMeta // Get the QName without the domain suffix qName := strings.ToLower(dns.Fqdn(req.Question[0].Name)) @@ -1316,9 +1319,10 @@ func (d *DNSServer) preparedQueryLookup(cfg *dnsConfig, datacenter, query string // send the local agent's data through to allow distance sorting // relative to ourself on the server side. 
Agent: structs.QuerySource{ - Datacenter: d.agent.config.Datacenter, - Segment: d.agent.config.SegmentName, - Node: d.agent.config.NodeName, + Datacenter: d.agent.config.Datacenter, + Segment: d.agent.config.SegmentName, + Node: d.agent.config.NodeName, + NodePartition: d.agent.config.PartitionOrEmpty(), }, } diff --git a/agent/http.go b/agent/http.go index 3c854287a8..af26cdc2c6 100644 --- a/agent/http.go +++ b/agent/http.go @@ -333,6 +333,11 @@ func (s *HTTPHandlers) nodeName() string { return s.agent.config.NodeName } +// nodePartition returns the node partition of the agent +func (s *HTTPHandlers) nodePartition() string { + return s.agent.config.PartitionOrEmpty() +} + // aclEndpointRE is used to find old ACL endpoints that take tokens in the URL // so that we can redact them. The ACL endpoints that take the token in the URL // are all of the form /v1/acl//, and can optionally include query @@ -1032,6 +1037,7 @@ func (s *HTTPHandlers) parseSource(req *http.Request, source *structs.QuerySourc } else { source.Node = node } + source.NodePartition = s.agent.config.PartitionOrEmpty() } } diff --git a/agent/http_oss.go b/agent/http_oss.go index 6622404726..4b7808ab25 100644 --- a/agent/http_oss.go +++ b/agent/http_oss.go @@ -17,7 +17,8 @@ func (s *HTTPHandlers) parseEntMeta(req *http.Request, entMeta *structs.Enterpri if queryNS := req.URL.Query().Get("ns"); queryNS != "" { return BadRequestError{Reason: "Invalid query parameter: \"ns\" - Namespaces are a Consul Enterprise feature"} } - return nil + + return parseEntMetaPartition(req, entMeta) } func (s *HTTPHandlers) validateEnterpriseIntentionNamespace(logName, ns string, _ bool) error { @@ -74,7 +75,13 @@ func (s *HTTPHandlers) uiTemplateDataTransform(data map[string]interface{}) erro return nil } -// parseEntMetaPartition is a noop for the enterprise implementation. func parseEntMetaPartition(req *http.Request, meta *structs.EnterpriseMeta) error { + if headerAP := req.Header.Get("X-Consul-Partition"); headerAP != "" { + return BadRequestError{Reason: "Invalid header: \"X-Consul-Partition\" - Partitions are a Consul Enterprise feature"} + } + if queryAP := req.URL.Query().Get("partition"); queryAP != "" { + return BadRequestError{Reason: "Invalid query parameter: \"partition\" - Partitions are a Consul Enterprise feature"} + } + return nil } diff --git a/agent/local/state.go b/agent/local/state.go index 5908c4af27..862f046fe2 100644 --- a/agent/local/state.go +++ b/agent/local/state.go @@ -58,6 +58,7 @@ type Config struct { DiscardCheckOutput bool NodeID types.NodeID NodeName string + Partition string // this defaults if empty TaggedAddresses map[string]string } @@ -176,6 +177,8 @@ type State struct { // Config is the agent config config Config + agentEnterpriseMeta structs.EnterpriseMeta + // nodeInfoInSync tracks whether the server has our correct top-level // node information in sync nodeInfoInSync bool @@ -208,14 +211,15 @@ type State struct { // NewState creates a new local state for the agent. 
func NewState(c Config, logger hclog.Logger, tokens *token.Store) *State { l := &State{ - config: c, - logger: logger, - services: make(map[structs.ServiceID]*ServiceState), - checks: make(map[structs.CheckID]*CheckState), - checkAliases: make(map[structs.ServiceID]map[structs.CheckID]chan<- struct{}), - metadata: make(map[string]string), - tokens: tokens, - notifyHandlers: make(map[chan<- struct{}]struct{}), + config: c, + logger: logger, + services: make(map[structs.ServiceID]*ServiceState), + checks: make(map[structs.CheckID]*CheckState), + checkAliases: make(map[structs.ServiceID]map[structs.CheckID]chan<- struct{}), + metadata: make(map[string]string), + tokens: tokens, + notifyHandlers: make(map[chan<- struct{}]struct{}), + agentEnterpriseMeta: *structs.NodeEnterpriseMetaInPartition(c.Partition), } l.SetDiscardCheckOutput(c.DiscardCheckOutput) return l @@ -267,6 +271,10 @@ func (l *State) addServiceLocked(service *structs.NodeService, token string) err service.ID = service.Service } + if l.agentEnterpriseMeta.PartitionOrDefault() != service.PartitionOrDefault() { + return fmt.Errorf("cannot add service %q to node in partition %q", service.CompoundServiceID(), l.config.Partition) + } + l.setServiceStateLocked(&ServiceState{ Service: service, Token: token, @@ -340,8 +348,8 @@ func (l *State) removeServiceLocked(id structs.ServiceID) error { return nil } -// Service returns the locally registered service that the -// agent is aware of and are being kept in sync with the server +// Service returns the locally registered service that the agent is aware of +// with this ID and are being kept in sync with the server. func (l *State) Service(id structs.ServiceID) *structs.NodeService { l.RLock() defer l.RUnlock() @@ -353,9 +361,43 @@ func (l *State) Service(id structs.ServiceID) *structs.NodeService { return s.Service } -// Services returns the locally registered services that the +// ServicesByName returns all the locally registered service instances that the +// agent is aware of with this name and are being kept in sync with the server +func (l *State) ServicesByName(sn structs.ServiceName) []*structs.NodeService { + l.RLock() + defer l.RUnlock() + + var found []*structs.NodeService + for id, s := range l.services { + if s.Deleted { + continue + } + + if !sn.EnterpriseMeta.Matches(&id.EnterpriseMeta) { + continue + } + if s.Service.Service == sn.Name { + found = append(found, s.Service) + } + } + return found +} + +// AllServices returns the locally registered services that the // agent is aware of and are being kept in sync with the server +func (l *State) AllServices() map[structs.ServiceID]*structs.NodeService { + return l.listServices(false, nil) +} + +// Services returns the locally registered services that the agent is aware of +// and are being kept in sync with the server +// +// Results are scoped to the provided namespace and partition. 
func (l *State) Services(entMeta *structs.EnterpriseMeta) map[structs.ServiceID]*structs.NodeService { + return l.listServices(true, entMeta) +} + +func (l *State) listServices(filtered bool, entMeta *structs.EnterpriseMeta) map[structs.ServiceID]*structs.NodeService { l.RLock() defer l.RUnlock() @@ -365,7 +407,7 @@ func (l *State) Services(entMeta *structs.EnterpriseMeta) map[structs.ServiceID] continue } - if !entMeta.Matches(&id.EnterpriseMeta) { + if filtered && !entMeta.Matches(&id.EnterpriseMeta) { continue } m[id] = s.Service @@ -395,6 +437,10 @@ func (l *State) SetServiceState(s *ServiceState) { l.Lock() defer l.Unlock() + if l.agentEnterpriseMeta.PartitionOrDefault() != s.Service.PartitionOrDefault() { + return + } + l.setServiceStateLocked(s) } @@ -483,15 +529,19 @@ func (l *State) addCheckLocked(check *structs.HealthCheck, token string) error { check.Output = "" } + // hard-set the node name and partition + check.Node = l.config.NodeName + check.EnterpriseMeta = structs.NewEnterpriseMetaWithPartition( + l.agentEnterpriseMeta.PartitionOrEmpty(), + check.NamespaceOrEmpty(), + ) + // if there is a serviceID associated with the check, make sure it exists before adding it // NOTE - This logic may be moved to be handled within the Agent's Addcheck method after a refactor if _, ok := l.services[check.CompoundServiceID()]; check.ServiceID != "" && !ok { return fmt.Errorf("Check %q refers to non-existent service %q", check.CheckID, check.ServiceID) } - // hard-set the node name - check.Node = l.config.NodeName - l.setCheckStateLocked(&CheckState{ Check: check, Token: token, @@ -510,6 +560,13 @@ func (l *State) AddAliasCheck(checkID structs.CheckID, srcServiceID structs.Serv l.Lock() defer l.Unlock() + if l.agentEnterpriseMeta.PartitionOrDefault() != checkID.PartitionOrDefault() { + return fmt.Errorf("cannot add alias check %q to node in partition %q", checkID.String(), l.config.Partition) + } + if l.agentEnterpriseMeta.PartitionOrDefault() != srcServiceID.PartitionOrDefault() { + return fmt.Errorf("cannot add alias check for %q to node in partition %q", srcServiceID.String(), l.config.Partition) + } + m, ok := l.checkAliases[srcServiceID] if !ok { m = make(map[structs.CheckID]chan<- struct{}) @@ -663,11 +720,23 @@ func (l *State) Check(id structs.CheckID) *structs.HealthCheck { return c.Check } +// AllChecks returns the locally registered checks that the +// agent is aware of and are being kept in sync with the server +func (l *State) AllChecks() map[structs.CheckID]*structs.HealthCheck { + return l.listChecks(false, nil) +} + // Checks returns the locally registered checks that the // agent is aware of and are being kept in sync with the server +// +// Results are scoped to the provided namespace and partition. 
func (l *State) Checks(entMeta *structs.EnterpriseMeta) map[structs.CheckID]*structs.HealthCheck { + return l.listChecks(true, entMeta) +} + +func (l *State) listChecks(filtered bool, entMeta *structs.EnterpriseMeta) map[structs.CheckID]*structs.HealthCheck { m := make(map[structs.CheckID]*structs.HealthCheck) - for id, c := range l.CheckStates(entMeta) { + for id, c := range l.listCheckStates(filtered, entMeta) { m[id] = c.Check } return m @@ -719,6 +788,10 @@ func (l *State) SetCheckState(c *CheckState) { l.Lock() defer l.Unlock() + if l.agentEnterpriseMeta.PartitionOrDefault() != c.Check.PartitionOrDefault() { + return + } + l.setCheckStateLocked(c) } @@ -737,11 +810,25 @@ func (l *State) setCheckStateLocked(c *CheckState) { l.TriggerSyncChanges() } +// AllCheckStates returns a shallow copy of all health check state records. +// The map contains a shallow copy of the current check states. +// +// The defer timers still point to the original values and must not be modified. +func (l *State) AllCheckStates() map[structs.CheckID]*CheckState { + return l.listCheckStates(false, nil) +} + // CheckStates returns a shallow copy of all health check state records. // The map contains a shallow copy of the current check states. // // The defer timers still point to the original values and must not be modified. +// +// Results are scoped to the provided namespace and partition. func (l *State) CheckStates(entMeta *structs.EnterpriseMeta) map[structs.CheckID]*CheckState { + return l.listCheckStates(true, entMeta) +} + +func (l *State) listCheckStates(filtered bool, entMeta *structs.EnterpriseMeta) map[structs.CheckID]*CheckState { l.RLock() defer l.RUnlock() @@ -750,7 +837,7 @@ func (l *State) CheckStates(entMeta *structs.EnterpriseMeta) map[structs.CheckID if c.Deleted { continue } - if !entMeta.Matches(&id.EnterpriseMeta) { + if filtered && !entMeta.Matches(&id.EnterpriseMeta) { continue } m[id] = c.Clone() @@ -758,12 +845,27 @@ func (l *State) CheckStates(entMeta *structs.EnterpriseMeta) map[structs.CheckID return m } +// AllCriticalCheckStates returns the locally registered checks that the +// agent is aware of and are being kept in sync with the server. +// The map contains a shallow copy of the current check states. +// +// The defer timers still point to the original values and must not be modified. +func (l *State) AllCriticalCheckStates() map[structs.CheckID]*CheckState { + return l.listCriticalCheckStates(false, nil) +} + // CriticalCheckStates returns the locally registered checks that the // agent is aware of and are being kept in sync with the server. // The map contains a shallow copy of the current check states. // // The defer timers still point to the original values and must not be modified. +// +// Results are scoped to the provided namespace and partition. 
func (l *State) CriticalCheckStates(entMeta *structs.EnterpriseMeta) map[structs.CheckID]*CheckState { + return l.listCriticalCheckStates(true, entMeta) +} + +func (l *State) listCriticalCheckStates(filtered bool, entMeta *structs.EnterpriseMeta) map[structs.CheckID]*CheckState { l.RLock() defer l.RUnlock() @@ -772,7 +874,7 @@ func (l *State) CriticalCheckStates(entMeta *structs.EnterpriseMeta) map[structs if c.Deleted || !c.Critical() { continue } - if !entMeta.Matches(&id.EnterpriseMeta) { + if filtered && !entMeta.Matches(&id.EnterpriseMeta) { continue } m[id] = c.Clone() @@ -887,7 +989,7 @@ func (l *State) updateSyncState() error { AllowStale: true, MaxStaleDuration: fullSyncReadMaxStale, }, - EnterpriseMeta: *structs.WildcardEnterpriseMetaInDefaultPartition(), + EnterpriseMeta: *l.agentEnterpriseMeta.WildcardEnterpriseMetaForPartition(), } var out1 structs.IndexedNodeServiceList @@ -958,7 +1060,7 @@ func (l *State) updateSyncState() error { if ls == nil { // The consul service is managed automatically and does // not need to be deregistered - if id == structs.ConsulCompoundServiceID { + if structs.IsConsulServiceID(id) { continue } @@ -1002,7 +1104,7 @@ func (l *State) updateSyncState() error { if lc == nil { // The Serf check is created automatically and does not // need to be deregistered. - if id == structs.SerfCompoundCheckID { + if structs.IsSerfCheckID(id) { l.logger.Debug("Skipping remote check since it is managed automatically", "check", structs.SerfCheckID) continue } @@ -1366,6 +1468,7 @@ func (l *State) syncNodeInfo() error { Address: l.config.AdvertiseAddr, TaggedAddresses: l.config.TaggedAddresses, NodeMeta: l.metadata, + EnterpriseMeta: l.agentEnterpriseMeta, WriteRequest: structs.WriteRequest{Token: at}, } var out struct{} diff --git a/agent/proxycfg/connect_proxy.go b/agent/proxycfg/connect_proxy.go index 2a2e1e6842..b69a4f720f 100644 --- a/agent/proxycfg/connect_proxy.go +++ b/agent/proxycfg/connect_proxy.go @@ -86,7 +86,7 @@ func (s *handlerConnectProxy) initialize(ctx context.Context) (ConfigSnapshot, e Datacenter: s.source.Datacenter, QueryOptions: structs.QueryOptions{Token: s.token}, ServiceName: s.proxyCfg.DestinationServiceName, - EnterpriseMeta: structs.NewEnterpriseMetaInDefaultPartition(s.proxyID.NamespaceOrEmpty()), + EnterpriseMeta: s.proxyID.EnterpriseMeta, }, intentionUpstreamsID, s.ch) if err != nil { return snap, err @@ -97,7 +97,7 @@ func (s *handlerConnectProxy) initialize(ctx context.Context) (ConfigSnapshot, e Name: structs.MeshConfigMesh, Datacenter: s.source.Datacenter, QueryOptions: structs.QueryOptions{Token: s.token}, - EnterpriseMeta: *structs.DefaultEnterpriseMetaInDefaultPartition(), + EnterpriseMeta: *s.proxyID.DefaultEnterpriseMetaForPartition(), }, meshConfigEntryID, s.ch) if err != nil { return snap, err @@ -228,7 +228,7 @@ func (s *handlerConnectProxy) handleUpdate(ctx context.Context, u cache.UpdateEv // Use the centralized upstream defaults if they exist and there isn't specific configuration for this upstream // This is only relevant to upstreams from intentions because for explicit upstreams the defaulting is handled // by the ResolveServiceConfig endpoint. 
- wildcardSID := structs.NewServiceID(structs.WildcardSpecifier, structs.WildcardEnterpriseMetaInDefaultPartition()) + wildcardSID := structs.NewServiceID(structs.WildcardSpecifier, s.proxyID.WildcardEnterpriseMetaForPartition()) defaults, ok := snap.ConnectProxy.UpstreamConfig[wildcardSID.String()] if ok { u = defaults diff --git a/agent/proxycfg/manager.go b/agent/proxycfg/manager.go index aca122b0c5..083291c137 100644 --- a/agent/proxycfg/manager.go +++ b/agent/proxycfg/manager.go @@ -145,7 +145,7 @@ func (m *Manager) syncState() { defer m.mu.Unlock() // Traverse the local state and ensure all proxy services are registered - services := m.State.Services(structs.WildcardEnterpriseMetaInDefaultPartition()) + services := m.State.AllServices() for sid, svc := range services { if svc.Kind != structs.ServiceKindConnectProxy && svc.Kind != structs.ServiceKindTerminatingGateway && diff --git a/agent/proxycfg/manager_test.go b/agent/proxycfg/manager_test.go index 8cd19a5139..324098f704 100644 --- a/agent/proxycfg/manager_test.go +++ b/agent/proxycfg/manager_test.go @@ -330,6 +330,7 @@ func TestManager_BasicLifecycle(t *testing.T) { rootsCacheKey, leafCacheKey, roots, webProxyCopy.(*structs.NodeService), + local.Config{}, expectSnapCopy.(*ConfigSnapshot), ) }) @@ -349,13 +350,14 @@ func testManager_BasicLifecycle( rootsCacheKey, leafCacheKey string, roots *structs.IndexedCARoots, webProxy *structs.NodeService, + agentConfig local.Config, expectSnap *ConfigSnapshot, ) { c := TestCacheWithTypes(t, types) require := require.New(t) logger := testutil.Logger(t) - state := local.NewState(local.Config{}, logger, &token.Store{}) + state := local.NewState(agentConfig, logger, &token.Store{}) source := &structs.QuerySource{Datacenter: "dc1"} // Stub state syncing diff --git a/agent/proxycfg/mesh_gateway.go b/agent/proxycfg/mesh_gateway.go index 7a402441b6..5ffc590b4f 100644 --- a/agent/proxycfg/mesh_gateway.go +++ b/agent/proxycfg/mesh_gateway.go @@ -29,12 +29,14 @@ func (s *handlerMeshGateway) initialize(ctx context.Context) (ConfigSnapshot, er return snap, err } + wildcardEntMeta := s.proxyID.WildcardEnterpriseMetaForPartition() + // Watch for all services err = s.cache.Notify(ctx, cachetype.CatalogServiceListName, &structs.DCSpecificRequest{ Datacenter: s.source.Datacenter, QueryOptions: structs.QueryOptions{Token: s.token}, Source: *s.source, - EnterpriseMeta: *structs.WildcardEnterpriseMetaInDefaultPartition(), + EnterpriseMeta: *wildcardEntMeta, }, serviceListWatchID, s.ch) if err != nil { @@ -85,7 +87,7 @@ func (s *handlerMeshGateway) initialize(ctx context.Context) (ConfigSnapshot, er Datacenter: s.source.Datacenter, QueryOptions: structs.QueryOptions{Token: s.token}, Kind: structs.ServiceResolver, - EnterpriseMeta: *structs.WildcardEnterpriseMetaInDefaultPartition(), + EnterpriseMeta: *wildcardEntMeta, }, serviceResolversWatchID, s.ch) if err != nil { s.logger.Named(logging.MeshGateway). 
diff --git a/agent/proxycfg/testing.go b/agent/proxycfg/testing.go index 7292cb44f5..dd5465d3d8 100644 --- a/agent/proxycfg/testing.go +++ b/agent/proxycfg/testing.go @@ -127,6 +127,7 @@ func TestUpstreamNodes(t testing.T) structs.CheckServiceNodes { Node: "test1", Address: "10.10.1.1", Datacenter: "dc1", + Partition: structs.NodeEnterpriseMetaInDefaultPartition().PartitionOrEmpty(), }, Service: structs.TestNodeService(t), }, @@ -136,6 +137,7 @@ func TestUpstreamNodes(t testing.T) structs.CheckServiceNodes { Node: "test2", Address: "10.10.1.2", Datacenter: "dc1", + Partition: structs.NodeEnterpriseMetaInDefaultPartition().PartitionOrEmpty(), }, Service: structs.TestNodeService(t), }, diff --git a/agent/proxycfg/upstreams.go b/agent/proxycfg/upstreams.go index ff0f1f2cdb..310290c25e 100644 --- a/agent/proxycfg/upstreams.go +++ b/agent/proxycfg/upstreams.go @@ -229,7 +229,8 @@ func (s *handlerUpstreams) resetWatchesFromChain( // Outside of transparent mode we only watch the chain target, B, // since A is a virtual service and traffic will not be sent to it. if !watchedChainEndpoints && s.proxyCfg.Mode == structs.ProxyModeTransparent { - chainEntMeta := structs.NewEnterpriseMetaInDefaultPartition(chain.Namespace) + // TODO(partitions): add partition to the disco chain + chainEntMeta := structs.NewEnterpriseMetaWithPartition("" /*TODO*/, chain.Namespace) opts := targetWatchOpts{ upstreamID: id, diff --git a/agent/rpc/subscribe/subscribe.go b/agent/rpc/subscribe/subscribe.go index be19e9fa9e..0e7bfb24d2 100644 --- a/agent/rpc/subscribe/subscribe.go +++ b/agent/rpc/subscribe/subscribe.go @@ -51,7 +51,7 @@ func (h *Server) Subscribe(req *pbsubscribe.SubscribeRequest, serverStream pbsub logger.Trace("new subscription") defer logger.Trace("subscription closed") - entMeta := structs.NewEnterpriseMetaInDefaultPartition(req.Namespace) + entMeta := structs.NewEnterpriseMetaWithPartition(req.Partition, req.Namespace) authz, err := h.Backend.ResolveTokenAndDefaultMeta(req.Token, &entMeta, nil) if err != nil { return err @@ -94,6 +94,7 @@ func toStreamSubscribeRequest(req *pbsubscribe.SubscribeRequest, entMeta structs Token: req.Token, Index: req.Index, Namespace: entMeta.NamespaceOrEmpty(), + Partition: entMeta.PartitionOrEmpty(), } } diff --git a/agent/rpcclient/health/view.go b/agent/rpcclient/health/view.go index 2f31dde210..60af617f00 100644 --- a/agent/rpcclient/health/view.go +++ b/agent/rpcclient/health/view.go @@ -29,6 +29,7 @@ func newMaterializerRequest(srvReq structs.ServiceSpecificRequest) func(index ui Datacenter: srvReq.Datacenter, Index: index, Namespace: srvReq.EnterpriseMeta.NamespaceOrEmpty(), + Partition: srvReq.EnterpriseMeta.PartitionOrEmpty(), } if srvReq.Connect { req.Topic = pbsubscribe.Topic_ServiceHealthConnect diff --git a/agent/sidecar_service.go b/agent/sidecar_service.go index da87b6fccf..673a02252e 100644 --- a/agent/sidecar_service.go +++ b/agent/sidecar_service.go @@ -126,7 +126,7 @@ func (a *Agent) sidecarServiceFromNodeService(ns *structs.NodeService, token str // it doesn't seem to be necessary - even with thousands of services this is // not expensive to compute. 
usedPorts := make(map[int]struct{}) - for _, otherNS := range a.State.Services(structs.WildcardEnterpriseMetaInDefaultPartition()) { + for _, otherNS := range a.State.AllServices() { // Check if other port is in auto-assign range if otherNS.Port >= a.config.ConnectSidecarMinPort && otherNS.Port <= a.config.ConnectSidecarMaxPort { diff --git a/agent/structs/catalog.go b/agent/structs/catalog.go index 7f3fa4a157..b118b99352 100644 --- a/agent/structs/catalog.go +++ b/agent/structs/catalog.go @@ -19,8 +19,3 @@ const ( ConsulServiceID = "consul" ConsulServiceName = "consul" ) - -var ( - ConsulCompoundServiceID = NewServiceID(ConsulServiceID, nil) // TODO(partitions): delete this in favor of IsConsulServiceID(ServiceID) - SerfCompoundCheckID = NewCheckID(SerfCheckID, nil) // TODO(partitions): delete this in favor of IsSerfCheckID(CheckID) -) diff --git a/agent/txn_endpoint.go b/agent/txn_endpoint.go index 42c2251c6d..afe298d629 100644 --- a/agent/txn_endpoint.go +++ b/agent/txn_endpoint.go @@ -152,11 +152,14 @@ func (s *HTTPHandlers) convertOps(resp http.ResponseWriter, req *http.Request) ( KV: &structs.TxnKVOp{ Verb: verb, DirEnt: structs.DirEntry{ - Key: in.KV.Key, - Value: in.KV.Value, - Flags: in.KV.Flags, - Session: in.KV.Session, - EnterpriseMeta: structs.NewEnterpriseMetaInDefaultPartition(in.KV.Namespace), + Key: in.KV.Key, + Value: in.KV.Value, + Flags: in.KV.Flags, + Session: in.KV.Session, + EnterpriseMeta: structs.NewEnterpriseMetaWithPartition( + in.KV.Partition, + in.KV.Namespace, + ), RaftIndex: structs.RaftIndex{ ModifyIndex: in.KV.Index, }, @@ -182,6 +185,7 @@ func (s *HTTPHandlers) convertOps(resp http.ResponseWriter, req *http.Request) ( Node: structs.Node{ ID: types.NodeID(node.ID), Node: node.Node, + Partition: node.Partition, Address: node.Address, Datacenter: node.Datacenter, TaggedAddresses: node.TaggedAddresses, @@ -216,7 +220,10 @@ func (s *HTTPHandlers) convertOps(resp http.ResponseWriter, req *http.Request) ( Warning: svc.Weights.Warning, }, EnableTagOverride: svc.EnableTagOverride, - EnterpriseMeta: structs.NewEnterpriseMetaInDefaultPartition(svc.Namespace), + EnterpriseMeta: structs.NewEnterpriseMetaWithPartition( + svc.Partition, + svc.Namespace, + ), RaftIndex: structs.RaftIndex{ ModifyIndex: svc.ModifyIndex, }, @@ -274,7 +281,10 @@ func (s *HTTPHandlers) convertOps(resp http.ResponseWriter, req *http.Request) ( Timeout: timeout, DeregisterCriticalServiceAfter: deregisterCriticalServiceAfter, }, - EnterpriseMeta: structs.NewEnterpriseMetaInDefaultPartition(check.Namespace), + EnterpriseMeta: structs.NewEnterpriseMetaWithPartition( + check.Partition, + check.Namespace, + ), RaftIndex: structs.RaftIndex{ ModifyIndex: check.ModifyIndex, }, diff --git a/agent/ui_endpoint.go b/agent/ui_endpoint.go index df2505759b..92fb70d6e7 100644 --- a/agent/ui_endpoint.go +++ b/agent/ui_endpoint.go @@ -603,6 +603,9 @@ func (s *HTTPHandlers) UIMetricsProxy(resp http.ResponseWriter, req *http.Reques s.clearTokenFromHeaders(req) var entMeta structs.EnterpriseMeta + if err := parseEntMetaPartition(req, &entMeta); err != nil { + return nil, err + } authz, err := s.agent.delegate.ResolveTokenAndDefaultMeta(token, &entMeta, nil) if err != nil { return nil, err @@ -611,9 +614,8 @@ func (s *HTTPHandlers) UIMetricsProxy(resp http.ResponseWriter, req *http.Reques // This endpoint requires wildcard read on all services and all nodes. // // In enterprise it requires this _in all namespaces_ too. 
- wildMeta := structs.WildcardEnterpriseMetaInDefaultPartition() var authzContext acl.AuthorizerContext - wildMeta.FillAuthzContext(&authzContext) + entMeta.WildcardEnterpriseMetaForPartition().FillAuthzContext(&authzContext) if authz.NodeReadAll(&authzContext) != acl.Allow || authz.ServiceReadAll(&authzContext) != acl.Allow { return nil, acl.ErrPermissionDenied diff --git a/agent/user_event.go b/agent/user_event.go index d3b6224b03..3df2b6bd82 100644 --- a/agent/user_event.go +++ b/agent/user_event.go @@ -187,6 +187,7 @@ func (a *Agent) shouldProcessUserEvent(msg *UserEvent) bool { } // Scan for a match + // NOTE: this only works in the default partition and default namespace services := a.State.Services(structs.DefaultEnterpriseMetaInDefaultPartition()) found := false OUTER: diff --git a/api/agent_test.go b/api/agent_test.go index 082857a484..f875092517 100644 --- a/api/agent_test.go +++ b/api/agent_test.go @@ -15,10 +15,11 @@ import ( "testing" "time" - "github.com/hashicorp/consul/sdk/testutil" - "github.com/hashicorp/consul/sdk/testutil/retry" "github.com/hashicorp/serf/serf" "github.com/stretchr/testify/require" + + "github.com/hashicorp/consul/sdk/testutil" + "github.com/hashicorp/consul/sdk/testutil/retry" ) func TestAPI_AgentSelf(t *testing.T) { @@ -793,6 +794,7 @@ func TestAPI_AgentService(t *testing.T) { }, Meta: map[string]string{}, Namespace: defaultNamespace, + Partition: defaultPartition, Datacenter: "dc1", } require.Equal(expect, got) diff --git a/api/coordinate_test.go b/api/coordinate_test.go index 8c8e986145..071b1f99e4 100644 --- a/api/coordinate_test.go +++ b/api/coordinate_test.go @@ -5,9 +5,10 @@ import ( "testing" "time" - "github.com/hashicorp/consul/sdk/testutil/retry" "github.com/hashicorp/serf/coordinate" "github.com/stretchr/testify/require" + + "github.com/hashicorp/consul/sdk/testutil/retry" ) func TestAPI_CoordinateDatacenters(t *testing.T) { @@ -85,8 +86,9 @@ func TestAPI_CoordinateUpdate(t *testing.T) { newCoord := coordinate.NewCoordinate(coordinate.DefaultConfig()) newCoord.Height = 0.5 entry := &CoordinateEntry{ - Node: node, - Coord: newCoord, + Node: node, + Partition: defaultPartition, + Coord: newCoord, } _, err = coord.Update(entry, nil) if err != nil { diff --git a/command/rtt/rtt.go b/command/rtt/rtt.go index 21e174157e..0710a4ef83 100644 --- a/command/rtt/rtt.go +++ b/command/rtt/rtt.go @@ -5,12 +5,16 @@ import ( "fmt" "strings" - "github.com/hashicorp/consul/command/flags" - "github.com/hashicorp/consul/lib" "github.com/hashicorp/serf/coordinate" "github.com/mitchellh/cli" + + "github.com/hashicorp/consul/command/flags" + "github.com/hashicorp/consul/lib" ) +// TODO(partitions): how will this command work when asking for RTT between a +// partitioned client and a server in the default partition? + func New(ui cli.Ui) *cmd { c := &cmd{UI: ui} c.init() From ac41e30614a103ca1b45b3ce581a833d5fd040d1 Mon Sep 17 00:00:00 2001 From: "R.B. 
Boyer" <4903+rboyer@users.noreply.github.com> Date: Thu, 19 Aug 2021 16:17:59 -0500 Subject: [PATCH 16/28] state: partition the nodes.uuid and nodes.meta indexes as well (#10882) --- agent/consul/autopilot.go | 8 +- agent/consul/state/acl_schema.go | 13 ++ agent/consul/state/catalog.go | 87 +++++++------ agent/consul/state/catalog_oss_test.go | 58 ++++++++- agent/consul/state/catalog_schema.go | 66 ++++++++-- agent/consul/state/catalog_test.go | 162 ++++++++++++------------- agent/consul/state/query.go | 51 +++++++- agent/consul/state/schema_test.go | 11 +- agent/consul/state/txn.go | 2 +- 9 files changed, 322 insertions(+), 136 deletions(-) diff --git a/agent/consul/autopilot.go b/agent/consul/autopilot.go index cc6cf62302..d926e4a991 100644 --- a/agent/consul/autopilot.go +++ b/agent/consul/autopilot.go @@ -6,11 +6,13 @@ import ( "github.com/armon/go-metrics" "github.com/armon/go-metrics/prometheus" - "github.com/hashicorp/consul/agent/metadata" - "github.com/hashicorp/consul/types" "github.com/hashicorp/raft" autopilot "github.com/hashicorp/raft-autopilot" "github.com/hashicorp/serf/serf" + + "github.com/hashicorp/consul/agent/metadata" + "github.com/hashicorp/consul/agent/structs" + "github.com/hashicorp/consul/types" ) var AutopilotGauges = []prometheus.GaugeDefinition{ @@ -127,7 +129,7 @@ func (s *Server) autopilotServerFromMetadata(srv *metadata.Server) (*autopilot.S // populate the node meta if there is any. When a node first joins or if // there are ACL issues then this could be empty if the server has not // yet been able to register itself in the catalog - _, node, err := s.fsm.State().GetNodeID(types.NodeID(srv.ID)) + _, node, err := s.fsm.State().GetNodeID(types.NodeID(srv.ID), structs.NodeEnterpriseMetaInDefaultPartition()) if err != nil { return nil, fmt.Errorf("error retrieving node from state store: %w", err) } diff --git a/agent/consul/state/acl_schema.go b/agent/consul/state/acl_schema.go index 22ccdbd95d..bbab26a1e9 100644 --- a/agent/consul/state/acl_schema.go +++ b/agent/consul/state/acl_schema.go @@ -212,6 +212,19 @@ func indexFromUUIDQuery(raw interface{}) ([]byte, error) { return uuidStringToBytes(q.Value) } +func prefixIndexFromUUIDQuery(arg interface{}) ([]byte, error) { + switch v := arg.(type) { + case *structs.EnterpriseMeta: + return nil, nil + case structs.EnterpriseMeta: + return nil, nil + case Query: + return variableLengthUUIDStringToBytes(v.Value) + } + + return nil, fmt.Errorf("unexpected type %T for Query prefix index", arg) +} + func multiIndexPolicyFromACLRole(raw interface{}) ([][]byte, error) { role, ok := raw.(*structs.ACLRole) if !ok { diff --git a/agent/consul/state/catalog.go b/agent/consul/state/catalog.go index eefdb82e31..24319da1b4 100644 --- a/agent/consul/state/catalog.go +++ b/agent/consul/state/catalog.go @@ -7,7 +7,6 @@ import ( "strings" memdb "github.com/hashicorp/go-memdb" - "github.com/hashicorp/go-uuid" "github.com/mitchellh/copystructure" "github.com/hashicorp/consul/acl" @@ -202,8 +201,6 @@ func (s *Store) EnsureNode(idx uint64, node *structs.Node) error { func ensureNoNodeWithSimilarNameTxn(tx ReadTxn, node *structs.Node, allowClashWithoutID bool) error { // Retrieve all of the nodes - // TODO(partitions): since the node UUID field is not partitioned, do we have to do something additional here? 
- enodes, err := tx.Get(tableNodes, indexID+"_prefix", node.GetEnterpriseMeta()) if err != nil { return fmt.Errorf("Cannot lookup all nodes: %s", err) @@ -277,8 +274,7 @@ func (s *Store) ensureNodeTxn(tx WriteTxn, idx uint64, preserveIndexes bool, nod // name is the same. var n *structs.Node if node.ID != "" { - // TODO(partitions): should this take a node ent-meta? - existing, err := getNodeIDTxn(tx, node.ID) + existing, err := getNodeIDTxn(tx, node.ID, node.GetEnterpriseMeta()) if err != nil { return fmt.Errorf("node lookup failed: %s", err) } @@ -384,14 +380,11 @@ func getNodeTxn(tx ReadTxn, nodeNameOrID string, entMeta *structs.EnterpriseMeta return nil, nil } -func getNodeIDTxn(tx ReadTxn, id types.NodeID) (*structs.Node, error) { - strnode := string(id) - uuidValue, err := uuid.ParseUUID(strnode) - if err != nil { - return nil, fmt.Errorf("node lookup by ID failed, wrong UUID: %v for '%s'", err, strnode) - } - - node, err := tx.First(tableNodes, "uuid", uuidValue) +func getNodeIDTxn(tx ReadTxn, id types.NodeID, entMeta *structs.EnterpriseMeta) (*structs.Node, error) { + node, err := tx.First(tableNodes, indexUUID+"_prefix", Query{ + Value: string(id), + EnterpriseMeta: *entMeta, + }) if err != nil { return nil, fmt.Errorf("node lookup by ID failed: %s", err) } @@ -402,17 +395,20 @@ func getNodeIDTxn(tx ReadTxn, id types.NodeID) (*structs.Node, error) { } // GetNodeID is used to retrieve a node registration by node ID. -func (s *Store) GetNodeID(id types.NodeID) (uint64, *structs.Node, error) { +func (s *Store) GetNodeID(id types.NodeID, entMeta *structs.EnterpriseMeta) (uint64, *structs.Node, error) { tx := s.db.Txn(false) defer tx.Abort() + // TODO: accept non-pointer value + if entMeta == nil { + entMeta = structs.NodeEnterpriseMetaInDefaultPartition() + } + // Get the table index. - /// - // NOTE: nodeIDs aren't partitioned so don't use the convenience function. - idx := maxIndexTxn(tx, tableNodes) + idx := catalogNodesMaxIndex(tx, entMeta) // Retrieve the node from the state store - node, err := getNodeIDTxn(tx, id) + node, err := getNodeIDTxn(tx, id, entMeta) return idx, node, err } @@ -455,19 +451,25 @@ func (s *Store) NodesByMeta(ws memdb.WatchSet, filters map[string]string, entMet } // Get the table index. - idx := maxIndexTxn(tx, tableNodes) - // TODO:(partitions) use the partitioned meta index - // idx := catalogNodesMaxIndex(tx, entMeta) - _ = entMeta + idx := catalogNodesMaxIndex(tx, entMeta) - // Retrieve all of the nodes - var args []interface{} - for key, value := range filters { - args = append(args, key, value) + if len(filters) == 0 { + return idx, nil, nil // NodesByMeta is never called with an empty map, but just in case make it return no results. + } + + // Retrieve all of the nodes. We'll do a lookup of just ONE KV pair, which + // over-matches if multiple pairs are requested, but then in the loop below + // we'll finish filtering. + var firstKey, firstValue string + for firstKey, firstValue = range filters { break } - nodes, err := tx.Get(tableNodes, "meta", args...) + nodes, err := tx.Get(tableNodes, indexMeta, KeyValueQuery{ + Key: firstKey, + Value: firstValue, + EnterpriseMeta: *entMeta, + }) if err != nil { return 0, nil, fmt.Errorf("failed nodes lookup: %s", err) } @@ -831,20 +833,34 @@ func (s *Store) ServicesByNodeMeta(ws memdb.WatchSet, filters map[string]string, tx := s.db.Txn(false) defer tx.Abort() + // TODO: accept non-pointer value + if entMeta == nil { + entMeta = structs.NodeEnterpriseMetaInDefaultPartition() + } + // Get the table index. 
idx := catalogServicesMaxIndex(tx, entMeta) if nodeIdx := catalogNodesMaxIndex(tx, entMeta); nodeIdx > idx { idx = nodeIdx } - // Retrieve all of the nodes with the meta k/v pair - var args []interface{} - for key, value := range filters { - args = append(args, key, value) + if len(filters) == 0 { + return idx, nil, nil // ServicesByNodeMeta is never called with an empty map, but just in case make it return no results. + } + + // Retrieve all of the nodes. We'll do a lookup of just ONE KV pair, which + // over-matches if multiple pairs are requested, but then in the loop below + // we'll finish filtering. + var firstKey, firstValue string + for firstKey, firstValue = range filters { break } - // TODO(partitions): scope the meta index to a partition - nodes, err := tx.Get(tableNodes, "meta", args...) + + nodes, err := tx.Get(tableNodes, indexMeta, KeyValueQuery{ + Key: firstKey, + Value: firstValue, + EnterpriseMeta: *entMeta, + }) if err != nil { return 0, nil, fmt.Errorf("failed nodes lookup: %s", err) } @@ -1276,7 +1292,10 @@ func (s *Store) nodeServices(ws memdb.WatchSet, nodeNameOrID string, entMeta *st } // Attempt to lookup the node by its node ID - iter, err := tx.Get(tableNodes, "uuid_prefix", resizeNodeLookupKey(nodeNameOrID)) + iter, err := tx.Get(tableNodes, indexUUID+"_prefix", Query{ + Value: resizeNodeLookupKey(nodeNameOrID), + EnterpriseMeta: *entMeta, + }) if err != nil { ws.Add(watchCh) // TODO(sean@): We could/should log an error re: the uuid_prefix lookup diff --git a/agent/consul/state/catalog_oss_test.go b/agent/consul/state/catalog_oss_test.go index c259d42aed..5c81e671ef 100644 --- a/agent/consul/state/catalog_oss_test.go +++ b/agent/consul/state/catalog_oss_test.go @@ -4,6 +4,7 @@ package state import ( "github.com/hashicorp/consul/agent/structs" + "github.com/hashicorp/consul/types" ) func testIndexerTableChecks() map[string]indexerTestCase { @@ -175,6 +176,8 @@ func testIndexerTableGatewayServices() map[string]indexerTestCase { } func testIndexerTableNodes() map[string]indexerTestCase { + uuidBuf, uuid := generateUUID() + return map[string]indexerTestCase{ indexID: { read: indexValue{ @@ -200,8 +203,59 @@ func testIndexerTableNodes() map[string]indexerTestCase { }, }, }, - // TODO: uuid - // TODO: meta + indexUUID: { + read: indexValue{ + source: Query{Value: uuid}, + expected: uuidBuf, + }, + write: indexValue{ + source: &structs.Node{ + ID: types.NodeID(uuid), + Node: "NoDeId", + }, + expected: uuidBuf, + }, + prefix: []indexValue{ + { + source: (*structs.EnterpriseMeta)(nil), + expected: nil, + }, + { + source: structs.EnterpriseMeta{}, + expected: nil, + }, + { // partial length + source: Query{Value: uuid[:6]}, + expected: uuidBuf[:3], + }, + { // full length + source: Query{Value: uuid}, + expected: uuidBuf, + }, + }, + }, + indexMeta: { + read: indexValue{ + source: KeyValueQuery{ + Key: "KeY", + Value: "VaLuE", + }, + expected: []byte("KeY\x00VaLuE\x00"), + }, + writeMulti: indexValueMulti{ + source: &structs.Node{ + Node: "NoDeId", + Meta: map[string]string{ + "MaP-kEy-1": "mAp-VaL-1", + "mAp-KeY-2": "MaP-vAl-2", + }, + }, + expected: [][]byte{ + []byte("MaP-kEy-1\x00mAp-VaL-1\x00"), + []byte("mAp-KeY-2\x00MaP-vAl-2\x00"), + }, + }, + }, // TODO(partitions): fix schema tests for tables that reference nodes too } diff --git a/agent/consul/state/catalog_schema.go b/agent/consul/state/catalog_schema.go index a161689668..7726fd6a17 100644 --- a/agent/consul/state/catalog_schema.go +++ b/agent/consul/state/catalog_schema.go @@ -27,6 +27,8 @@ const ( 
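The partial-length indexer test above (uuid[:6] expecting uuidBuf[:3]) spells out the property the uuid prefix index depends on: two hex characters of the UUID string decode to one stored byte, so a string prefix becomes a byte prefix of the index key. A small illustration using plain hex decoding follows; hyphen stripping and the even-length trimming that resizeNodeLookupKey applies on the lookup path are omitted here.

package main

import (
	"bytes"
	"encoding/hex"
	"fmt"
	"strings"
)

func main() {
	const uuid = "00a916bc-a357-4a19-b886-59419fcee50a"

	// The full UUID decodes to the 16 bytes the index actually stores.
	full, err := hex.DecodeString(strings.ReplaceAll(uuid, "-", ""))
	if err != nil {
		panic(err)
	}
	fmt.Println(len(full)) // 16

	// Six hex characters decode to three bytes, which is exactly the leading
	// slice of the stored key, so a prefix scan can use it directly.
	prefix, err := hex.DecodeString(uuid[:6])
	if err != nil {
		panic(err)
	}
	fmt.Printf("%s -> %x\n", uuid[:6], prefix) // 00a916 -> 00a916
	fmt.Println(bytes.Equal(prefix, full[:3])) // true

	// An odd-length prefix does not decode into whole bytes, which is why
	// lookups trim the key to an even length before querying.
	_, err = hex.DecodeString(uuid[:7])
	fmt.Println(err != nil) // true
}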
indexUpstream = "upstream" indexDownstream = "downstream" indexGateway = "gateway" + indexUUID = "uuid" + indexMeta = "meta" ) // nodesTableSchema returns a new table schema used for storing struct.Node. @@ -44,19 +46,23 @@ func nodesTableSchema() *memdb.TableSchema { prefixIndex: prefixIndexFromQueryNoNamespace, }, }, - "uuid": { - Name: "uuid", + indexUUID: { + Name: indexUUID, AllowMissing: true, Unique: true, - Indexer: &memdb.UUIDFieldIndex{Field: "ID"}, + Indexer: indexerSingleWithPrefix{ + readIndex: indexFromUUIDQuery, + writeIndex: indexIDFromNode, + prefixIndex: prefixIndexFromUUIDQuery, + }, }, - "meta": { - Name: "meta", + indexMeta: { + Name: indexMeta, AllowMissing: true, Unique: false, - Indexer: &memdb.StringMapFieldIndex{ - Field: "Meta", - Lowercase: false, + Indexer: indexerMulti{ + readIndex: indexFromKeyValueQuery, + writeIndexMulti: indexMetaFromNode, }, }, }, @@ -78,6 +84,50 @@ func indexFromNode(raw interface{}) ([]byte, error) { return b.Bytes(), nil } +func indexIDFromNode(raw interface{}) ([]byte, error) { + n, ok := raw.(*structs.Node) + if !ok { + return nil, fmt.Errorf("unexpected type %T for structs.Node index", raw) + } + + if n.ID == "" { + return nil, errMissingValueForIndex + } + + v, err := uuidStringToBytes(string(n.ID)) + if err != nil { + return nil, err + } + + return v, nil +} + +func indexMetaFromNode(raw interface{}) ([][]byte, error) { + n, ok := raw.(*structs.Node) + if !ok { + return nil, fmt.Errorf("unexpected type %T for structs.Node index", raw) + } + + // NOTE: this is case-sensitive! + + vals := make([][]byte, 0, len(n.Meta)) + for key, val := range n.Meta { + if key == "" { + continue + } + + var b indexBuilder + b.String(key) + b.String(val) + vals = append(vals, b.Bytes()) + } + if len(vals) == 0 { + return nil, errMissingValueForIndex + } + + return vals, nil +} + // servicesTableSchema returns a new table schema used to store information // about services. 
func servicesTableSchema() *memdb.TableSchema { diff --git a/agent/consul/state/catalog_test.go b/agent/consul/state/catalog_test.go index c4b09aeae3..75d37c5f75 100644 --- a/agent/consul/state/catalog_test.go +++ b/agent/consul/state/catalog_test.go @@ -1,6 +1,7 @@ package state import ( + crand "crypto/rand" "fmt" "reflect" "sort" @@ -29,23 +30,23 @@ func makeRandomNodeID(t *testing.T) types.NodeID { func TestStateStore_GetNodeID(t *testing.T) { s := testStateStore(t) - _, out, err := s.GetNodeID(types.NodeID("wrongId")) - if err == nil || out != nil || !strings.Contains(err.Error(), "node lookup by ID failed, wrong UUID") { - t.Fatalf("want an error, nil value, err:=%q ; out:=%q", err.Error(), out) + _, out, err := s.GetNodeID(types.NodeID("wrongId"), nil) + if err == nil || out != nil || !strings.Contains(err.Error(), "node lookup by ID failed: index error: UUID (without hyphens) must be") { + t.Errorf("want an error, nil value, err:=%q ; out:=%q", err.Error(), out) } - _, out, err = s.GetNodeID(types.NodeID("0123456789abcdefghijklmnopqrstuvwxyz")) - if err == nil || out != nil || !strings.Contains(err.Error(), "node lookup by ID failed, wrong UUID") { - t.Fatalf("want an error, nil value, err:=%q ; out:=%q", err, out) + _, out, err = s.GetNodeID(types.NodeID("0123456789abcdefghijklmnopqrstuvwxyz"), nil) + if err == nil || out != nil || !strings.Contains(err.Error(), "node lookup by ID failed: index error: invalid UUID") { + t.Errorf("want an error, nil value, err:=%q ; out:=%q", err, out) } - _, out, err = s.GetNodeID(types.NodeID("00a916bc-a357-4a19-b886-59419fcee50Z")) - if err == nil || out != nil || !strings.Contains(err.Error(), "node lookup by ID failed, wrong UUID") { - t.Fatalf("want an error, nil value, err:=%q ; out:=%q", err, out) + _, out, err = s.GetNodeID(types.NodeID("00a916bc-a357-4a19-b886-59419fcee50Z"), nil) + if err == nil || out != nil || !strings.Contains(err.Error(), "node lookup by ID failed: index error: invalid UUID") { + t.Errorf("want an error, nil value, err:=%q ; out:=%q", err, out) } - _, out, err = s.GetNodeID(types.NodeID("00a916bc-a357-4a19-b886-59419fcee506")) + _, out, err = s.GetNodeID(types.NodeID("00a916bc-a357-4a19-b886-59419fcee506"), nil) if err != nil || out != nil { - t.Fatalf("do not want any error nor returned value, err:=%q ; out:=%q", err, out) + t.Errorf("do not want any error nor returned value, err:=%q ; out:=%q", err, out) } nodeID := types.NodeID("00a916bc-a357-4a19-b886-59419fceeaaa") @@ -56,14 +57,14 @@ func TestStateStore_GetNodeID(t *testing.T) { } require.NoError(t, s.EnsureRegistration(1, req)) - _, out, err = s.GetNodeID(nodeID) + _, out, err = s.GetNodeID(nodeID, nil) require.NoError(t, err) if out == nil || out.ID != nodeID { t.Fatalf("out should not be nil and contain nodeId, but was:=%#v", out) } // Case insensitive lookup should work as well - _, out, err = s.GetNodeID(types.NodeID("00a916bC-a357-4a19-b886-59419fceeAAA")) + _, out, err = s.GetNodeID(types.NodeID("00a916bC-a357-4a19-b886-59419fceeAAA"), nil) require.NoError(t, err) if out == nil || out.ID != nodeID { t.Fatalf("out should not be nil and contain nodeId, but was:=%#v", out) @@ -201,7 +202,7 @@ func TestStateStore_EnsureRegistration(t *testing.T) { } require.Equal(t, node, out) - _, out2, err := s.GetNodeID(nodeID) + _, out2, err := s.GetNodeID(nodeID, nil) if err != nil { t.Fatalf("got err %s want nil", err) } @@ -416,7 +417,7 @@ func TestStateStore_EnsureRegistration_Restore(t *testing.T) { t.Fatalf("err: %s", err) } if out == nil { - _, out, err = 
s.GetNodeID(types.NodeID(nodeLookup)) + _, out, err = s.GetNodeID(types.NodeID(nodeLookup), nil) if err != nil { t.Fatalf("err: %s", err) } @@ -695,11 +696,11 @@ func TestNodeRenamingNodes(t *testing.T) { t.Fatalf("err: %s", err) } - if _, node, err := s.GetNodeID(nodeID1); err != nil || node == nil || node.ID != nodeID1 { + if _, node, err := s.GetNodeID(nodeID1, nil); err != nil || node == nil || node.ID != nodeID1 { t.Fatalf("err: %s, node:= %q", err, node) } - if _, node, err := s.GetNodeID(nodeID2); err != nil && node == nil || node.ID != nodeID2 { + if _, node, err := s.GetNodeID(nodeID2, nil); err != nil && node == nil || node.ID != nodeID2 { t.Fatalf("err: %s", err) } @@ -750,7 +751,7 @@ func TestNodeRenamingNodes(t *testing.T) { } // Retrieve the node again - idx2, out2, err := s.GetNodeID(nodeID2) + idx2, out2, err := s.GetNodeID(nodeID2, nil) if err != nil { t.Fatalf("err: %s", err) } @@ -1137,6 +1138,11 @@ func TestStateStore_GetNodesByMeta(t *testing.T) { filters map[string]string nodes []string }{ + // Empty meta filter + { + filters: map[string]string{}, + nodes: []string{}, + }, // Simple meta filter { filters: map[string]string{"role": "server"}, @@ -1206,9 +1212,7 @@ func TestStateStore_NodeServices(t *testing.T) { Node: "node1", Address: "1.2.3.4", } - if err := s.EnsureRegistration(1, req); err != nil { - t.Fatalf("err: %s", err) - } + require.NoError(t, s.EnsureRegistration(1, req)) } { req := &structs.RegisterRequest{ @@ -1216,83 +1220,59 @@ func TestStateStore_NodeServices(t *testing.T) { Node: "node2", Address: "5.6.7.8", } - if err := s.EnsureRegistration(2, req); err != nil { - t.Fatalf("err: %s", err) - } + require.NoError(t, s.EnsureRegistration(2, req)) } // Look up by name. - { - _, ns, err := s.NodeServices(nil, "node1", nil) - if err != nil { - t.Fatalf("err: %v", err) + t.Run("Look up by name", func(t *testing.T) { + { + _, ns, err := s.NodeServices(nil, "node1", nil) + require.NoError(t, err) + require.NotNil(t, ns) + require.Equal(t, "node1", ns.Node.Node) } - if ns == nil || ns.Node.Node != "node1" { - t.Fatalf("bad: %#v", *ns) + { + _, ns, err := s.NodeServices(nil, "node2", nil) + require.NoError(t, err) + require.NotNil(t, ns) + require.Equal(t, "node2", ns.Node.Node) } - } - { - _, ns, err := s.NodeServices(nil, "node2", nil) - if err != nil { - t.Fatalf("err: %v", err) - } - if ns == nil || ns.Node.Node != "node2" { - t.Fatalf("bad: %#v", *ns) - } - } + }) - // Look up by UUID. - { - _, ns, err := s.NodeServices(nil, "40e4a748-2192-161a-0510-aaaaaaaaaaaa", nil) - if err != nil { - t.Fatalf("err: %v", err) + t.Run("Look up by UUID", func(t *testing.T) { + { + _, ns, err := s.NodeServices(nil, "40e4a748-2192-161a-0510-aaaaaaaaaaaa", nil) + require.NoError(t, err) + require.NotNil(t, ns) + require.Equal(t, "node1", ns.Node.Node) } - if ns == nil || ns.Node.Node != "node1" { - t.Fatalf("bad: %#v", ns) + { + _, ns, err := s.NodeServices(nil, "40e4a748-2192-161a-0510-bbbbbbbbbbbb", nil) + require.NoError(t, err) + require.NotNil(t, ns) + require.Equal(t, "node2", ns.Node.Node) } - } - { - _, ns, err := s.NodeServices(nil, "40e4a748-2192-161a-0510-bbbbbbbbbbbb", nil) - if err != nil { - t.Fatalf("err: %v", err) - } - if ns == nil || ns.Node.Node != "node2" { - t.Fatalf("bad: %#v", ns) - } - } + }) - // Ambiguous prefix. 
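The GetNodeID tests above include a mixed-case lookup (00a916bC-a357-4a19-b886-59419fceeAAA) that still finds the node registered with the all-lower-case ID. That keeps working with the new indexer because both the write side (indexIDFromNode) and the read side decode the UUID string to raw bytes, and hex decoding treats upper- and lower-case digits identically. A minimal check of that property:

package main

import (
	"bytes"
	"encoding/hex"
	"fmt"
	"strings"
)

// uuidBytes decodes a UUID string to the raw bytes the index stores.
func uuidBytes(s string) []byte {
	b, err := hex.DecodeString(strings.ReplaceAll(s, "-", ""))
	if err != nil {
		panic(err)
	}
	return b
}

func main() {
	lower := uuidBytes("00a916bc-a357-4a19-b886-59419fceeaaa")
	mixed := uuidBytes("00a916bC-a357-4a19-b886-59419fceeAAA")

	// Identical keys, so the mixed-case lookup hits the same index entry.
	fmt.Println(bytes.Equal(lower, mixed)) // true
}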
- { + t.Run("Ambiguous prefix", func(t *testing.T) { _, ns, err := s.NodeServices(nil, "40e4a748-2192-161a-0510", nil) - if err != nil { - t.Fatalf("err: %v", err) - } - if ns != nil { - t.Fatalf("bad: %#v", ns) - } - } + require.NoError(t, err) + require.Nil(t, ns) + }) - // Bad node, and not a UUID (should not get a UUID error). - { + t.Run("Bad node", func(t *testing.T) { + // Bad node, and not a UUID (should not get a UUID error). _, ns, err := s.NodeServices(nil, "nope", nil) - if err != nil { - t.Fatalf("err: %v", err) - } - if ns != nil { - t.Fatalf("bad: %#v", ns) - } - } + require.NoError(t, err) + require.Nil(t, ns) + }) - // Specific prefix. - { + t.Run("Specific prefix", func(t *testing.T) { _, ns, err := s.NodeServices(nil, "40e4a748-2192-161a-0510-bb", nil) - if err != nil { - t.Fatalf("err: %v", err) - } - if ns == nil || ns.Node.Node != "node2" { - t.Fatalf("bad: %#v", ns) - } - } + require.NoError(t, err) + require.NotNil(t, ns) + require.Equal(t, "node2", ns.Node.Node) + }) } func TestStateStore_DeleteNode(t *testing.T) { @@ -7582,3 +7562,17 @@ func dumpMaxIndexes(t *testing.T, tx ReadTxn) map[string]uint64 { } return out } + +func generateUUID() ([]byte, string) { + buf := make([]byte, 16) + if _, err := crand.Read(buf); err != nil { + panic(fmt.Errorf("failed to read random bytes: %v", err)) + } + uuid := fmt.Sprintf("%08x-%04x-%04x-%04x-%12x", + buf[0:4], + buf[4:6], + buf[6:8], + buf[8:10], + buf[10:16]) + return buf, uuid +} diff --git a/agent/consul/state/query.go b/agent/consul/state/query.go index 341f995808..3dba99a4eb 100644 --- a/agent/consul/state/query.go +++ b/agent/consul/state/query.go @@ -51,13 +51,25 @@ func indexFromServiceNameAsString(arg interface{}) ([]byte, error) { return b.Bytes(), nil } -// uuidStringToBytes is a modified version of memdb.UUIDFieldIndex.parseString func uuidStringToBytes(uuid string) ([]byte, error) { - l := len(uuid) - if l != 36 { + // Verify the length + if l := len(uuid); l != 36 { return nil, fmt.Errorf("UUID must be 36 characters") } + return parseUUIDString(uuid) +} +func variableLengthUUIDStringToBytes(uuid string) ([]byte, error) { + // Verify the length + if l := len(uuid); l > 36 { + return nil, fmt.Errorf("Invalid UUID length. UUID have 36 characters; got %d", l) + } + return parseUUIDString(uuid) +} + +// parseUUIDString is a modified version of memdb.UUIDFieldIndex.parseString. +// Callers should verify the length. +func parseUUIDString(uuid string) ([]byte, error) { hyphens := strings.Count(uuid, "-") if hyphens > 4 { return nil, fmt.Errorf(`UUID should have maximum of 4 "-"; got %d`, hyphens) @@ -83,3 +95,36 @@ type BoolQuery struct { Value bool structs.EnterpriseMeta } + +// KeyValueQuery is a type used to query for both a key and a value that may +// include an enterprise identifier. +type KeyValueQuery struct { + Key string + Value string + structs.EnterpriseMeta +} + +// NamespaceOrDefault exists because structs.EnterpriseMeta uses a pointer +// receiver for this method. Remove once that is fixed. +func (q KeyValueQuery) NamespaceOrDefault() string { + return q.EnterpriseMeta.NamespaceOrDefault() +} + +// PartitionOrDefault exists because structs.EnterpriseMeta uses a pointer +// receiver for this method. Remove once that is fixed. +func (q KeyValueQuery) PartitionOrDefault() string { + return q.EnterpriseMeta.PartitionOrDefault() +} + +func indexFromKeyValueQuery(arg interface{}) ([]byte, error) { + // NOTE: this is case-sensitive! 
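The NamespaceOrDefault and PartitionOrDefault wrappers on KeyValueQuery above look redundant, but the comment about pointer receivers is the whole story: the query is handed to memdb as an interface{} value, and a method promoted from a by-value embedded field keeps its pointer receiver, so it is not in the value's method set and a type assertion against an interface that needs it would fail without the explicit value-receiver wrapper. Below is a standalone sketch of that Go method-set rule; the scoped interface and the assertion site are hypothetical, only the receiver shapes mirror the real types.

package main

import "fmt"

// scoped is a hypothetical interface of the kind enterprise code could assert
// against to read scoping information off a query argument.
type scoped interface {
	PartitionOrDefault() string
}

type EnterpriseMeta struct{ Partition string }

// Pointer receiver, like structs.EnterpriseMeta today.
func (m *EnterpriseMeta) PartitionOrDefault() string {
	if m == nil || m.Partition == "" {
		return "default"
	}
	return m.Partition
}

// bareQuery embeds the meta by value and adds no wrapper.
type bareQuery struct {
	Key, Value string
	EnterpriseMeta
}

// wrappedQuery adds the explicit value-receiver wrapper, like KeyValueQuery.
type wrappedQuery struct {
	Key, Value string
	EnterpriseMeta
}

func (q wrappedQuery) PartitionOrDefault() string {
	return q.EnterpriseMeta.PartitionOrDefault()
}

func main() {
	var arg interface{} = bareQuery{Key: "role", Value: "server"}
	_, ok := arg.(scoped)
	// false: the promoted method has a pointer receiver, so it is not in the
	// method set of a bareQuery value stored in an interface.
	fmt.Println(ok)

	arg = wrappedQuery{Key: "role", Value: "server"}
	_, ok = arg.(scoped)
	// true: the value-receiver wrapper puts the method in the value's set.
	fmt.Println(ok)
}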
+ q, ok := arg.(KeyValueQuery) + if !ok { + return nil, fmt.Errorf("unexpected type %T for Query index", arg) + } + + var b indexBuilder + b.String(q.Key) + b.String(q.Value) + return b.Bytes(), nil +} diff --git a/agent/consul/state/schema_test.go b/agent/consul/state/schema_test.go index 8b1c6507e9..b84f3d7f94 100644 --- a/agent/consul/state/schema_test.go +++ b/agent/consul/state/schema_test.go @@ -1,6 +1,7 @@ package state import ( + "sort" "testing" "github.com/hashicorp/go-memdb" @@ -110,12 +111,20 @@ func (tc indexerTestCase) run(t *testing.T, indexer memdb.Indexer) { } } + sortMultiByteSlice := func(v [][]byte) { + sort.Slice(v, func(i, j int) bool { + return string(v[i]) < string(v[j]) + }) + } + if i, ok := indexer.(memdb.MultiIndexer); ok { t.Run("writeIndexMulti", func(t *testing.T) { valid, actual, err := i.FromObject(tc.writeMulti.source) require.NoError(t, err) require.True(t, valid) - require.Equal(t, tc.writeMulti.expected, actual) + sortMultiByteSlice(actual) + sortMultiByteSlice(tc.writeMulti.expected) + require.ElementsMatch(t, tc.writeMulti.expected, actual) }) } diff --git a/agent/consul/state/txn.go b/agent/consul/state/txn.go index bb682efd37..8caa4a9d75 100644 --- a/agent/consul/state/txn.go +++ b/agent/consul/state/txn.go @@ -153,7 +153,7 @@ func (s *Store) txnNode(tx WriteTxn, idx uint64, op *structs.TxnNodeOp) (structs getNode := func() (*structs.Node, error) { if op.Node.ID != "" { - return getNodeIDTxn(tx, op.Node.ID) + return getNodeIDTxn(tx, op.Node.ID, op.Node.GetEnterpriseMeta()) } else { return getNodeTxn(tx, op.Node.Node, op.Node.GetEnterpriseMeta()) } From c42ea82883ffe78bb06f7fdd137ad7fcb7adef59 Mon Sep 17 00:00:00 2001 From: Zachary Shilton <4624598+zchsh@users.noreply.github.com> Date: Fri, 20 Aug 2021 12:20:01 -0400 Subject: [PATCH 17/28] Upgrade global styles (#10692) * website: upgrade global-styles packages * website: move community page to CSS modules * website: replace g-container with g-grid-container * website: hide alert-banner on mobile * website: backfill missing global type styles * website: fix code font-size in download custom content * website: bump to latest patched dependencies --- website/components/callout-blade/index.jsx | 6 +- .../components/cloud-offerings-list/index.jsx | 2 +- website/components/cta-hero/index.jsx | 2 +- website/components/footer/index.jsx | 2 +- website/components/footer/style.css | 2 +- .../components/hcp-callout-section/index.jsx | 4 +- .../static-dynamic-diagram/style.module.css | 1 + website/package-lock.json | 1225 +++++++++-------- website/package.json | 14 +- website/pages/404.jsx | 2 +- website/pages/_app.js | 2 +- website/pages/community/index.jsx | 3 +- website/pages/community/style.css | 8 - website/pages/community/style.module.css | 9 + website/pages/downloads/style.module.css | 4 + website/pages/print.css | 2 +- website/pages/style.css | 5 - 17 files changed, 653 insertions(+), 640 deletions(-) delete mode 100644 website/pages/community/style.css create mode 100644 website/pages/community/style.module.css diff --git a/website/components/callout-blade/index.jsx b/website/components/callout-blade/index.jsx index 104bea2370..e86ad03b9e 100644 --- a/website/components/callout-blade/index.jsx +++ b/website/components/callout-blade/index.jsx @@ -7,7 +7,7 @@ export default function CalloutBlade({ title, callouts }) { return (
-

{title}

+

{title}

    - {callout.title &&
    {callout.title}
    } + {callout.title && ( +
    {callout.title}
    + )}

    {callout.description}

    diff --git a/website/components/cloud-offerings-list/index.jsx b/website/components/cloud-offerings-list/index.jsx index 891ad61237..7a32e1e9ff 100644 --- a/website/components/cloud-offerings-list/index.jsx +++ b/website/components/cloud-offerings-list/index.jsx @@ -12,7 +12,7 @@ export default function CloudOfferingsList({ offerings }) { > {offering.title} {offering.eyebrow} -

    {offering.title}

    +

    {offering.title}

    {offering.description}