Browse Source

Fix netpol crash when node remains tainted uninitialized

It is conceivable that users might take more than 60 seconds to deploy their own cloud-provider. Instead of exiting, we should wait forever, but with more logging to indicate what's being waited on.

Signed-off-by: Brad Davidson <brad.davidson@rancher.com>
pull/10241/head
Brad Davidson 7 months ago committed by Brad Davidson
parent
commit
ed23a2bb48
  1. 15
      pkg/agent/netpol/netpol.go
  2. 4
      pkg/etcd/metadata_controller.go

15
pkg/agent/netpol/netpol.go

@ -67,27 +67,26 @@ func Run(ctx context.Context, nodeConfig *config.Node) error {
return err
}
// As kube-router netpol requires addresses to be available in the node object
// Wait until the node has ready addresses to avoid race conditions (max 1 minute).
// kube-router netpol requires addresses to be available in the node object.
// Wait until the uninitialized taint has been removed, at which point the addresses should be set.
// TODO: Replace with non-deprecated PollUntilContextTimeout when our and Kubernetes code migrate to it
if err := wait.PollImmediateWithContext(ctx, 2*time.Second, 60*time.Second, func(ctx context.Context) (bool, error) {
if err := wait.PollImmediateInfiniteWithContext(ctx, 2*time.Second, func(ctx context.Context) (bool, error) {
// Get the node object
node, err := client.CoreV1().Nodes().Get(ctx, nodeConfig.AgentConfig.NodeName, metav1.GetOptions{})
if err != nil {
logrus.Debugf("Network policy controller waiting to get Node %s: %v", nodeConfig.AgentConfig.NodeName, err)
logrus.Infof("Network policy controller waiting to get Node %s: %v", nodeConfig.AgentConfig.NodeName, err)
return false, nil
}
// Check for the uninitialized taint that should be removed by cloud-provider
// If there is no cloud-provider, the taint will not be there
// Check for the taint that should be removed by cloud-provider when the node has been initialized.
for _, taint := range node.Spec.Taints {
if taint.Key == cloudproviderapi.TaintExternalCloudProvider {
logrus.Debugf("Network policy controller waiting for removal of %s taint", cloudproviderapi.TaintExternalCloudProvider)
logrus.Infof("Network policy controller waiting for removal of %s taint", cloudproviderapi.TaintExternalCloudProvider)
return false, nil
}
}
return true, nil
}); err != nil {
return errors.Wrapf(err, "network policy controller timed out waiting for %s taint to be removed from Node %s", cloudproviderapi.TaintExternalCloudProvider, nodeConfig.AgentConfig.NodeName)
return errors.Wrapf(err, "network policy controller failed to wait for %s taint to be removed from Node %s", cloudproviderapi.TaintExternalCloudProvider, nodeConfig.AgentConfig.NodeName)
}
krConfig := options.NewKubeRouterConfig()

4
pkg/etcd/metadata_controller.go

@ -13,7 +13,7 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/client-go/util/retry"
nodeUtil "k8s.io/kubernetes/pkg/controller/util/node"
nodeutil "k8s.io/kubernetes/pkg/controller/util/node"
)
func registerMetadataHandlers(ctx context.Context, etcd *ETCD) {
@ -109,7 +109,7 @@ func (m *metadataHandler) handleSelf(node *v1.Node) (*v1.Node, error) {
node.Labels = map[string]string{}
}
if find, _ := nodeUtil.GetNodeCondition(&node.Status, etcdStatusType); find >= 0 {
if find, _ := nodeutil.GetNodeCondition(&node.Status, etcdStatusType); find >= 0 {
node.Status.Conditions = append(node.Status.Conditions[:find], node.Status.Conditions[find+1:]...)
}

Loading…
Cancel
Save