Migrate the node health controller to use the cache (#20248)

Some edge case error testing had to be removed because it was no longer possible to force errors when going through the cache layer as opposed to the resource service itself.
pull/20292/head
Matt Keeler 2024-01-19 12:22:45 -05:00 committed by GitHub
parent f942f2dc18
commit 0a261682cd
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 42 additions and 87 deletions

View File

@ -7,19 +7,22 @@ import (
"context" "context"
"fmt" "fmt"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
"github.com/hashicorp/consul/internal/controller" "github.com/hashicorp/consul/internal/controller"
"github.com/hashicorp/consul/internal/controller/cache"
"github.com/hashicorp/consul/internal/controller/cache/indexers"
"github.com/hashicorp/consul/internal/controller/dependency" "github.com/hashicorp/consul/internal/controller/dependency"
"github.com/hashicorp/consul/internal/resource" "github.com/hashicorp/consul/internal/resource"
pbcatalog "github.com/hashicorp/consul/proto-public/pbcatalog/v2beta1" pbcatalog "github.com/hashicorp/consul/proto-public/pbcatalog/v2beta1"
"github.com/hashicorp/consul/proto-public/pbresource" "github.com/hashicorp/consul/proto-public/pbresource"
) )
const (
nodeOwnerIndexName = "owner"
)
func NodeHealthController() *controller.Controller { func NodeHealthController() *controller.Controller {
return controller.NewController(StatusKey, pbcatalog.NodeType). return controller.NewController(StatusKey, pbcatalog.NodeType).
WithWatch(pbcatalog.NodeHealthStatusType, dependency.MapOwnerFiltered(pbcatalog.NodeType)). WithWatch(pbcatalog.NodeHealthStatusType, dependency.MapOwnerFiltered(pbcatalog.NodeType), indexers.OwnerIndex(nodeOwnerIndexName)).
WithReconciler(&nodeHealthReconciler{}) WithReconciler(&nodeHealthReconciler{})
} }
@ -33,38 +36,36 @@ func (r *nodeHealthReconciler) Reconcile(ctx context.Context, rt controller.Runt
rt.Logger.Trace("reconciling node health") rt.Logger.Trace("reconciling node health")
// read the node // read the node
rsp, err := rt.Client.Read(ctx, &pbresource.ReadRequest{Id: req.ID}) node, err := rt.Cache.Get(pbcatalog.NodeType, "id", req.ID)
switch { if err != nil {
case status.Code(err) == codes.NotFound: rt.Logger.Error("the cache has returned an unexpected error", "error", err)
rt.Logger.Trace("node has been deleted")
return nil
case err != nil:
rt.Logger.Error("the resource service has returned an unexpected error", "error", err)
return err return err
} }
if node == nil {
rt.Logger.Trace("node has been deleted")
return nil
}
res := rsp.Resource health, err := getNodeHealth(rt, req.ID)
health, err := getNodeHealth(ctx, rt, req.ID)
if err != nil { if err != nil {
rt.Logger.Error("failed to calculate the nodes health", "error", err) rt.Logger.Error("failed to calculate the nodes health", "error", err)
return err return err
} }
newStatus := &pbresource.Status{ newStatus := &pbresource.Status{
ObservedGeneration: res.Generation, ObservedGeneration: node.Generation,
Conditions: []*pbresource.Condition{ Conditions: []*pbresource.Condition{
Conditions[health], Conditions[health],
}, },
} }
if resource.EqualStatus(res.Status[StatusKey], newStatus, false) { if resource.EqualStatus(node.Status[StatusKey], newStatus, false) {
rt.Logger.Trace("resources node health status is unchanged", "health", health.String()) rt.Logger.Trace("resources node health status is unchanged", "health", health.String())
return nil return nil
} }
_, err = rt.Client.WriteStatus(ctx, &pbresource.WriteStatusRequest{ _, err = rt.Client.WriteStatus(ctx, &pbresource.WriteStatusRequest{
Id: res.Id, Id: node.Id,
Key: StatusKey, Key: StatusKey,
Status: newStatus, Status: newStatus,
}) })
@ -78,30 +79,24 @@ func (r *nodeHealthReconciler) Reconcile(ctx context.Context, rt controller.Runt
return nil return nil
} }
func getNodeHealth(ctx context.Context, rt controller.Runtime, nodeRef *pbresource.ID) (pbcatalog.Health, error) { func getNodeHealth(rt controller.Runtime, nodeRef *pbresource.ID) (pbcatalog.Health, error) {
rsp, err := rt.Client.ListByOwner(ctx, &pbresource.ListByOwnerRequest{ iter, err := cache.ListIteratorDecoded[*pbcatalog.NodeHealthStatus](rt.Cache, pbcatalog.NodeHealthStatusType, nodeOwnerIndexName, nodeRef)
Owner: nodeRef,
})
if err != nil { if err != nil {
return pbcatalog.Health_HEALTH_CRITICAL, err return pbcatalog.Health_HEALTH_CRITICAL, err
} }
health := pbcatalog.Health_HEALTH_PASSING health := pbcatalog.Health_HEALTH_PASSING
for _, res := range rsp.Resources { for hs, err := iter.Next(); hs != nil || err != nil; hs, err = iter.Next() {
if resource.EqualType(res.Id.Type, pbcatalog.NodeHealthStatusType) { if err != nil {
var hs pbcatalog.NodeHealthStatus // This should be impossible as the resource service + type validations the
if err := res.Data.UnmarshalTo(&hs); err != nil { // catalog is performing will ensure that no data gets written where unmarshalling
// This should be impossible as the resource service + type validations the // to this type will error.
// catalog is performing will ensure that no data gets written where unmarshalling return pbcatalog.Health_HEALTH_CRITICAL, fmt.Errorf("error getting decoded health status data: %w", err)
// to this type will error. }
return pbcatalog.Health_HEALTH_CRITICAL, fmt.Errorf("error unmarshalling health status data: %w", err)
}
if hs.Status > health { if hs.Data.Status > health {
health = hs.Status health = hs.Data.Status
}
} }
} }

View File

@ -11,8 +11,6 @@ import (
"github.com/oklog/ulid/v2" "github.com/oklog/ulid/v2"
"github.com/stretchr/testify/require" "github.com/stretchr/testify/require"
"github.com/stretchr/testify/suite" "github.com/stretchr/testify/suite"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
svctest "github.com/hashicorp/consul/agent/grpc-external/services/resource/testing" svctest "github.com/hashicorp/consul/agent/grpc-external/services/resource/testing"
"github.com/hashicorp/consul/internal/catalog/internal/types" "github.com/hashicorp/consul/internal/catalog/internal/types"
@ -70,7 +68,7 @@ type nodeHealthControllerTestSuite struct {
resourceClient *resourcetest.Client resourceClient *resourcetest.Client
runtime controller.Runtime runtime controller.Runtime
ctl nodeHealthReconciler ctl *controller.TestController
nodeNoHealth *pbresource.ID nodeNoHealth *pbresource.ID
nodePassing *pbresource.ID nodePassing *pbresource.ID
@ -97,30 +95,13 @@ func (suite *nodeHealthControllerTestSuite) SetupTest() {
WithTenancies(suite.tenancies...). WithTenancies(suite.tenancies...).
Run(suite.T()) Run(suite.T())
suite.resourceClient = resourcetest.NewClient(client) suite.ctl = controller.NewTestController(NodeHealthController(), client).
suite.runtime = controller.Runtime{Client: suite.resourceClient, Logger: testutil.Logger(suite.T())} WithLogger(testutil.Logger(suite.T()))
suite.runtime = suite.ctl.Runtime()
suite.resourceClient = resourcetest.NewClient(suite.runtime.Client)
suite.isEnterprise = versiontest.IsEnterprise() suite.isEnterprise = versiontest.IsEnterprise()
} }
func (suite *nodeHealthControllerTestSuite) TestGetNodeHealthListError() {
suite.runTestCaseWithTenancies(func(tenancy *pbresource.Tenancy) {
// This resource id references a resource type that will not be
// registered with the resource service. The ListByOwner call
// should produce an InvalidArgument error. This test is meant
// to validate how that error is handled (its propagated back
// to the caller)
ref := resourceID(
&pbresource.Type{Group: "not", GroupVersion: "v1", Kind: "found"},
"irrelevant",
tenancy,
)
health, err := getNodeHealth(context.Background(), suite.runtime, ref)
require.Equal(suite.T(), pbcatalog.Health_HEALTH_CRITICAL, health)
require.Error(suite.T(), err)
require.Equal(suite.T(), codes.InvalidArgument, status.Code(err))
})
}
func (suite *nodeHealthControllerTestSuite) TestGetNodeHealthNoNode() { func (suite *nodeHealthControllerTestSuite) TestGetNodeHealthNoNode() {
suite.runTestCaseWithTenancies(func(tenancy *pbresource.Tenancy) { suite.runTestCaseWithTenancies(func(tenancy *pbresource.Tenancy) {
// This test is meant to ensure that when the node doesn't exist // This test is meant to ensure that when the node doesn't exist
@ -131,7 +112,7 @@ func (suite *nodeHealthControllerTestSuite) TestGetNodeHealthNoNode() {
Partition: tenancy.Partition, Partition: tenancy.Partition,
}) })
ref.Uid = ulid.Make().String() ref.Uid = ulid.Make().String()
health, err := getNodeHealth(context.Background(), suite.runtime, ref) health, err := getNodeHealth(suite.runtime, ref)
require.NoError(suite.T(), err) require.NoError(suite.T(), err)
require.Equal(suite.T(), pbcatalog.Health_HEALTH_PASSING, health) require.Equal(suite.T(), pbcatalog.Health_HEALTH_PASSING, health)
@ -141,7 +122,7 @@ func (suite *nodeHealthControllerTestSuite) TestGetNodeHealthNoNode() {
func (suite *nodeHealthControllerTestSuite) TestGetNodeHealthNoStatus() { func (suite *nodeHealthControllerTestSuite) TestGetNodeHealthNoStatus() {
suite.runTestCaseWithTenancies(func(tenancy *pbresource.Tenancy) { suite.runTestCaseWithTenancies(func(tenancy *pbresource.Tenancy) {
health, err := getNodeHealth(context.Background(), suite.runtime, suite.nodeNoHealth) health, err := getNodeHealth(suite.runtime, suite.nodeNoHealth)
require.NoError(suite.T(), err) require.NoError(suite.T(), err)
require.Equal(suite.T(), pbcatalog.Health_HEALTH_PASSING, health) require.Equal(suite.T(), pbcatalog.Health_HEALTH_PASSING, health)
}) })
@ -150,7 +131,7 @@ func (suite *nodeHealthControllerTestSuite) TestGetNodeHealthNoStatus() {
func (suite *nodeHealthControllerTestSuite) TestGetNodeHealthPassingStatus() { func (suite *nodeHealthControllerTestSuite) TestGetNodeHealthPassingStatus() {
suite.runTestCaseWithTenancies(func(tenancy *pbresource.Tenancy) { suite.runTestCaseWithTenancies(func(tenancy *pbresource.Tenancy) {
health, err := getNodeHealth(context.Background(), suite.runtime, suite.nodePassing) health, err := getNodeHealth(suite.runtime, suite.nodePassing)
require.NoError(suite.T(), err) require.NoError(suite.T(), err)
require.Equal(suite.T(), pbcatalog.Health_HEALTH_PASSING, health) require.Equal(suite.T(), pbcatalog.Health_HEALTH_PASSING, health)
}) })
@ -159,7 +140,7 @@ func (suite *nodeHealthControllerTestSuite) TestGetNodeHealthPassingStatus() {
func (suite *nodeHealthControllerTestSuite) TestGetNodeHealthCriticalStatus() { func (suite *nodeHealthControllerTestSuite) TestGetNodeHealthCriticalStatus() {
suite.runTestCaseWithTenancies(func(tenancy *pbresource.Tenancy) { suite.runTestCaseWithTenancies(func(tenancy *pbresource.Tenancy) {
health, err := getNodeHealth(context.Background(), suite.runtime, suite.nodeCritical) health, err := getNodeHealth(suite.runtime, suite.nodeCritical)
require.NoError(suite.T(), err) require.NoError(suite.T(), err)
require.Equal(suite.T(), pbcatalog.Health_HEALTH_CRITICAL, health) require.Equal(suite.T(), pbcatalog.Health_HEALTH_CRITICAL, health)
}) })
@ -168,7 +149,7 @@ func (suite *nodeHealthControllerTestSuite) TestGetNodeHealthCriticalStatus() {
func (suite *nodeHealthControllerTestSuite) TestGetNodeHealthWarningStatus() { func (suite *nodeHealthControllerTestSuite) TestGetNodeHealthWarningStatus() {
suite.runTestCaseWithTenancies(func(tenancy *pbresource.Tenancy) { suite.runTestCaseWithTenancies(func(tenancy *pbresource.Tenancy) {
health, err := getNodeHealth(context.Background(), suite.runtime, suite.nodeWarning) health, err := getNodeHealth(suite.runtime, suite.nodeWarning)
require.NoError(suite.T(), err) require.NoError(suite.T(), err)
require.Equal(suite.T(), pbcatalog.Health_HEALTH_WARNING, health) require.Equal(suite.T(), pbcatalog.Health_HEALTH_WARNING, health)
}) })
@ -177,7 +158,7 @@ func (suite *nodeHealthControllerTestSuite) TestGetNodeHealthWarningStatus() {
func (suite *nodeHealthControllerTestSuite) TestGetNodeHealthMaintenanceStatus() { func (suite *nodeHealthControllerTestSuite) TestGetNodeHealthMaintenanceStatus() {
suite.runTestCaseWithTenancies(func(tenancy *pbresource.Tenancy) { suite.runTestCaseWithTenancies(func(tenancy *pbresource.Tenancy) {
health, err := getNodeHealth(context.Background(), suite.runtime, suite.nodeMaintenance) health, err := getNodeHealth(suite.runtime, suite.nodeMaintenance)
require.NoError(suite.T(), err) require.NoError(suite.T(), err)
require.Equal(suite.T(), pbcatalog.Health_HEALTH_MAINTENANCE, health) require.Equal(suite.T(), pbcatalog.Health_HEALTH_MAINTENANCE, health)
}) })
@ -187,7 +168,7 @@ func (suite *nodeHealthControllerTestSuite) TestReconcileNodeNotFound() {
suite.runTestCaseWithTenancies(func(tenancy *pbresource.Tenancy) { suite.runTestCaseWithTenancies(func(tenancy *pbresource.Tenancy) {
// This test ensures that removed nodes are ignored. In particular we don't // This test ensures that removed nodes are ignored. In particular we don't
// want to propagate the error and indefinitely keep re-reconciling in this case. // want to propagate the error and indefinitely keep re-reconciling in this case.
err := suite.ctl.Reconcile(context.Background(), suite.runtime, controller.Request{ err := suite.ctl.Reconcile(context.Background(), controller.Request{
ID: resourceID(pbcatalog.NodeType, "not-found", &pbresource.Tenancy{ ID: resourceID(pbcatalog.NodeType, "not-found", &pbresource.Tenancy{
Partition: tenancy.Partition, Partition: tenancy.Partition,
}), }),
@ -196,31 +177,10 @@ func (suite *nodeHealthControllerTestSuite) TestReconcileNodeNotFound() {
}) })
} }
func (suite *nodeHealthControllerTestSuite) TestReconcilePropagateReadError() {
suite.runTestCaseWithTenancies(func(tenancy *pbresource.Tenancy) {
// This test aims to ensure that errors other than NotFound errors coming
// from the initial resource read get propagated. This case is very unrealistic
// as the controller should not have given us a request ID for a resource type
// that doesn't exist but this was the easiest way I could think of to synthesize
// a Read error.
ref := resourceID(
&pbresource.Type{Group: "not", GroupVersion: "v1", Kind: "found"},
"irrelevant",
tenancy,
)
err := suite.ctl.Reconcile(context.Background(), suite.runtime, controller.Request{
ID: ref,
})
require.Error(suite.T(), err)
require.Equal(suite.T(), codes.InvalidArgument, status.Code(err))
})
}
func (suite *nodeHealthControllerTestSuite) testReconcileStatus(id *pbresource.ID, expectedStatus *pbresource.Condition) *pbresource.Resource { func (suite *nodeHealthControllerTestSuite) testReconcileStatus(id *pbresource.ID, expectedStatus *pbresource.Condition) *pbresource.Resource {
suite.T().Helper() suite.T().Helper()
err := suite.ctl.Reconcile(context.Background(), suite.runtime, controller.Request{ err := suite.ctl.Reconcile(context.Background(), controller.Request{
ID: id, ID: id,
}) })
require.NoError(suite.T(), err) require.NoError(suite.T(), err)