Use agent token for service/check deregistration during anti-entropy (#16097)

Use only the agent token for deregistration during anti-entropy The previous behavior had the agent attempt to use the "service" token (i.e. from the `token` field in a service definition file), and if that was not set then it would use the agent token. The previous behavior was problematic because, if the service token had been deleted, the deregistration request would fail. The agent would retry the deregistration during each anti-entropy sync, and the situation would never resolve. The new behavior is to only/always use the agent token for service and check deregistration during anti-entropy. This approach is: * Simpler: No fallback logic to try different tokens * Faster (slightly): No time spent attempting the service token * Correct: The agent token is able to deregister services on that agent's node, because: * node:write permissions allow deregistration of services/checks on that node. * The agent token must have node:write permission, or else the agent is not be able to (de)register itself into the catalog Co-authored-by: Vesa Hagström <weeezes@gmail.com>
2 years ago · a884d0d7c7
3 changed files with 46 additions and 9 deletions
--- a/.changelog/16097.txt
+++ b/.changelog/16097.txt
@ -0,0 +1,3 @@
 ```release-note:bug-fix
 agent: Only use the `agent` token for internal deregistration of checks and services during anti-entropy syncing. The service token specified in the `token` field of the check or service definition is no longer used for deregistration. This fixes an issue where the agent failed to ever deregister a service or check if the service token was deleted.
 ```
--- a/agent/local/state.go
+++ b/agent/local/state.go
@ -1293,7 +1293,12 @@ func (l *State) deleteService(key structs.ServiceID) error {
 		return fmt.Errorf("ServiceID missing")
 	}
-	st := l.aclTokenForServiceSync(key, l.tokens.AgentToken)
+	// Always use the agent token to delete without trying the service token.
 	// This works because the agent token really must have node:write
 	// permission and node:write allows deregistration of services/checks on
 	// that node. Because the service token may have been deleted, using the
 	// agent token without fallback logic is a bit faster, simpler, and safer.
 	st := l.tokens.AgentToken()
 	req := structs.DeregisterRequest{
 		Datacenter:     l.config.Datacenter,
 		Node:           l.config.NodeName,
@ -1344,7 +1349,9 @@ func (l *State) deleteCheck(key structs.CheckID) error {
 		return fmt.Errorf("CheckID missing")
 	}
-	ct := l.aclTokenForCheckSync(key, l.tokens.AgentToken)
+	// Always use the agent token for deletion. Refer to deleteService() for
 	// an explanation.
 	ct := l.tokens.AgentToken()
 	req := structs.DeregisterRequest{
 		Datacenter:     l.config.Datacenter,
 		Node:           l.config.NodeName,
--- a/agent/local/state_test.go
+++ b/agent/local/state_test.go
@ -829,10 +829,6 @@ func TestAgentAntiEntropy_Services_WithChecks(t *testing.T) {
 }
 var testRegisterRules = `
 node "" {
 	policy = "write"
 }
 service "api" {
 	policy = "write"
 }
@ -863,6 +859,9 @@ func TestAgentAntiEntropy_Services_ACLDeny(t *testing.T) {
 	defer a.Shutdown()
 	testrpc.WaitForLeader(t, a.RPC, "dc1")
 	// The agent token is the only token used for deleteService.
 	setAgentToken(t, a)
 	token := createToken(t, a, testRegisterRules)
 	// Create service (disallowed)
@ -1153,15 +1152,19 @@ type RPC interface {
 func createToken(t *testing.T, rpc RPC, policyRules string) string {
 	t.Helper()
 	uniqueId, err := uuid.GenerateUUID()
 	require.NoError(t, err)
 	policyName := "the-policy-" + uniqueId
 	reqPolicy := structs.ACLPolicySetRequest{
 		Datacenter: "dc1",
 		Policy: structs.ACLPolicy{
-			Name:  "the-policy",
+			Name:  policyName,
 			Rules: policyRules,
 		},
 		WriteRequest: structs.WriteRequest{Token: "root"},
 	}
-	err := rpc.RPC(context.Background(), "ACL.PolicySet", &reqPolicy, &structs.ACLPolicy{})
+	err = rpc.RPC(context.Background(), "ACL.PolicySet", &reqPolicy, &structs.ACLPolicy{})
 	require.NoError(t, err)
 	token, err := uuid.GenerateUUID()
@ -1171,7 +1174,7 @@ func createToken(t *testing.T, rpc RPC, policyRules string) string {
 		Datacenter: "dc1",
 		ACLToken: structs.ACLToken{
 			SecretID: token,
-			Policies: []structs.ACLTokenPolicyLink{{Name: "the-policy"}},
+			Policies: []structs.ACLTokenPolicyLink{{Name: policyName}},
 		},
 		WriteRequest: structs.WriteRequest{Token: "root"},
 	}
@ -1180,6 +1183,27 @@ func createToken(t *testing.T, rpc RPC, policyRules string) string {
 	return token
 }
 // setAgentToken sets the 'agent' token for this agent. It creates a new token
 // with node:write for the agent's node name, and service:write for any
 // service.
 func setAgentToken(t *testing.T, a *agent.TestAgent) {
 	var policy = fmt.Sprintf(`
 	  node "%s" {
 		policy = "write"
 	  }
 	  service_prefix "" {
 		policy = "read"
 	  }
 	`, a.Config.NodeName)
 	token := createToken(t, a, policy)
 	_, err := a.Client().Agent().UpdateAgentACLToken(token, &api.WriteOptions{Token: "root"})
 	if err != nil {
 		t.Fatalf("setting agent token: %v", err)
 	}
 }
 func TestAgentAntiEntropy_Checks(t *testing.T) {
 	if testing.Short() {
 		t.Skip("too slow for testing.Short")
@ -1481,6 +1505,9 @@ func TestAgentAntiEntropy_Checks_ACLDeny(t *testing.T) {
 	testrpc.WaitForLeader(t, a.RPC, dc)
 	// The agent token is the only token used for deleteCheck.
 	setAgentToken(t, a)
 	token := createToken(t, a, testRegisterRules)
 	// Create services using the root token