mirror of https://github.com/k3s-io/k3s
Also apply the shutdown taint in the node controller

Fix function return value, gofmt formatting, and tests; skip NotImplemented errors; remove an unused factory in the OpenStack provider. When resolving the instance ID we should look for the instance in all states instead of only ACTIVE, as all other cloud providers already do. Also fix the node lifecycle tests and lint errors.

pull/6/head
parent 3cf5b172fa
commit 6665fa7144
@@ -160,7 +160,8 @@ func (os *OpenStack) InstanceID() (string, error) {
 // InstanceID returns the cloud provider ID of the specified instance.
 func (i *Instances) InstanceID(ctx context.Context, name types.NodeName) (string, error) {
-	srv, err := getServerByName(i.compute, name, true)
+	// fetch the instance ID from servers in all states, not only ACTIVE ones
+	srv, err := getServerByName(i.compute, name, false)
 	if err != nil {
 		if err == ErrNotFound {
 			return "", cloudprovider.InstanceNotFound
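For context, the third argument of getServerByName appears to toggle whether the lookup is limited to ACTIVE servers; the helper's body is not part of this diff, so the following is only a hypothetical sketch of such a lookup built on gophercloud. The function name findServerByName and the exact meaning of the boolean are assumptions, not code from the repository.

package sketch

import (
	"fmt"

	"github.com/gophercloud/gophercloud"
	"github.com/gophercloud/gophercloud/openstack/compute/v2/servers"
)

// findServerByName is a hypothetical lookup: with activeOnly set the Nova listing is
// filtered to ACTIVE servers, without it servers in any state are considered (which is
// what passing false in the patch above is meant to achieve).
func findServerByName(compute *gophercloud.ServiceClient, name string, activeOnly bool) (*servers.Server, error) {
	opts := servers.ListOpts{Name: name}
	if activeOnly {
		opts.Status = "ACTIVE" // restrict the listing to running instances
	}
	pages, err := servers.List(compute, opts).AllPages()
	if err != nil {
		return nil, err
	}
	all, err := servers.ExtractServers(pages)
	if err != nil {
		return nil, err
	}
	if len(all) == 0 {
		return nil, fmt.Errorf("server %q not found", name)
	}
	return &all[0], nil
}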
@@ -79,6 +79,11 @@ var (
 		Effect: v1.TaintEffectNoExecute,
 	}

+	shutDownTaint = &v1.Taint{
+		Key:    algorithm.TaintNodeShutdown,
+		Effect: v1.TaintEffectNoSchedule,
+	}
+
 	nodeConditionToTaintKeyMap = map[v1.NodeConditionType]string{
 		v1.NodeMemoryPressure: algorithm.TaintNodeMemoryPressure,
 		v1.NodeOutOfDisk:      algorithm.TaintNodeOutOfDisk,
@@ -151,9 +156,10 @@ type Controller struct {
 	daemonSetStore          extensionslisters.DaemonSetLister
 	daemonSetInformerSynced cache.InformerSynced

-	nodeLister                corelisters.NodeLister
-	nodeInformerSynced        cache.InformerSynced
-	nodeExistsInCloudProvider func(types.NodeName) (bool, error)
+	nodeLister                  corelisters.NodeLister
+	nodeInformerSynced          cache.InformerSynced
+	nodeExistsInCloudProvider   func(types.NodeName) (bool, error)
+	nodeShutdownInCloudProvider func(types.NodeName) (bool, error)

 	recorder record.EventRecorder
@@ -239,6 +245,9 @@ func NewNodeLifecycleController(podInformer coreinformers.PodInformer,
 		nodeExistsInCloudProvider: func(nodeName types.NodeName) (bool, error) {
 			return nodeutil.ExistsInCloudProvider(cloud, nodeName)
 		},
+		nodeShutdownInCloudProvider: func(nodeName types.NodeName) (bool, error) {
+			return nodeutil.ShutdownInCloudProvider(cloud, nodeName)
+		},
 		recorder:               recorder,
 		nodeMonitorPeriod:      nodeMonitorPeriod,
 		nodeStartupGracePeriod: nodeStartupGracePeriod,
@@ -653,6 +662,11 @@ func (nc *Controller) monitorNodeStatus() error {
 					glog.V(2).Infof("Node %s is ready again, cancelled pod eviction", node.Name)
 				}
 			}
+			// Always remove the shutdown taint here, whether or not taint-based eviction is in use.
+			err := nc.markNodeAsNotShutdown(node)
+			if err != nil {
+				glog.Errorf("Failed to remove taints from node %v. Will retry in next iteration.", node.Name)
+			}
 		}

 		// Report node event.
@@ -666,7 +680,21 @@ func (nc *Controller) monitorNodeStatus() error {
 		// Check with the cloud provider to see if the node still exists. If it
 		// doesn't, delete the node immediately.
 		if currentReadyCondition.Status != v1.ConditionTrue && nc.cloud != nil {
-			exists, err := nc.nodeExistsInCloudProvider(types.NodeName(node.Name))
+			// Check whether the node has been shut down; if so, do not delete it and add the shutdown taint instead.
+			exists, err := nc.nodeShutdownInCloudProvider(types.NodeName(node.Name))
+			if err != nil && err != cloudprovider.NotImplemented {
+				glog.Errorf("Error determining if node %v shutdown in cloud: %v", node.Name, err)
+				continue
+			}
+			// the node is shut down
+			if exists {
+				err = controller.AddOrUpdateTaintOnNode(nc.kubeClient, node.Name, shutDownTaint)
+				if err != nil {
+					glog.Errorf("Error patching node taints: %v", err)
+				}
+				continue
+			}
+			exists, err = nc.nodeExistsInCloudProvider(types.NodeName(node.Name))
 			if err != nil {
 				glog.Errorf("Error determining if node %v exists in cloud: %v", node.Name, err)
 				continue
@@ -1102,6 +1130,17 @@ func (nc *Controller) markNodeAsReachable(node *v1.Node) (bool, error) {
 	return nc.zoneNoExecuteTainter[utilnode.GetZoneKey(node)].Remove(node.Name), nil
 }

+func (nc *Controller) markNodeAsNotShutdown(node *v1.Node) error {
+	nc.evictorLock.Lock()
+	defer nc.evictorLock.Unlock()
+	err := controller.RemoveTaintOffNode(nc.kubeClient, node.Name, node, shutDownTaint)
+	if err != nil {
+		glog.Errorf("Failed to remove taint from node %v: %v", node.Name, err)
+		return err
+	}
+	return nil
+}
+
 // ComputeZoneState returns a slice of NodeReadyConditions for all Nodes in a given zone.
 // The zone is considered:
 // - fullyDisrupted if there're no Ready Nodes,
@@ -1360,6 +1360,118 @@ func TestMonitorNodeStatusEvictPodsWithDisruption(t *testing.T) {
 	}
 }

+func TestCloudProviderNodeShutdown(t *testing.T) {
+
+	testCases := []struct {
+		testName string
+		node     *v1.Node
+		shutdown bool
+	}{
+		{
+			testName: "node shutdowned add taint",
+			shutdown: true,
+			node: &v1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:              "node0",
+					CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
+				},
+				Spec: v1.NodeSpec{
+					ProviderID: "node0",
+				},
+				Status: v1.NodeStatus{
+					Conditions: []v1.NodeCondition{
+						{
+							Type:               v1.NodeReady,
+							Status:             v1.ConditionUnknown,
+							LastHeartbeatTime:  metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
+							LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
+						},
+					},
+				},
+			},
+		},
+		{
+			testName: "node started after shutdown remove taint",
+			shutdown: false,
+			node: &v1.Node{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:              "node0",
+					CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
+				},
+				Spec: v1.NodeSpec{
+					ProviderID: "node0",
+					Taints: []v1.Taint{
+						{
+							Key:    algorithm.TaintNodeShutdown,
+							Effect: v1.TaintEffectNoSchedule,
+						},
+					},
+				},
+				Status: v1.NodeStatus{
+					Conditions: []v1.NodeCondition{
+						{
+							Type:               v1.NodeReady,
+							Status:             v1.ConditionTrue,
+							LastHeartbeatTime:  metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
+							LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
+						},
+					},
+				},
+			},
+		},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.testName, func(t *testing.T) {
+			fnh := &testutil.FakeNodeHandler{
+				Existing:  []*v1.Node{tc.node},
+				Clientset: fake.NewSimpleClientset(),
+			}
+			nodeController, _ := newNodeLifecycleControllerFromClient(
+				nil,
+				fnh,
+				10*time.Minute,
+				testRateLimiterQPS,
+				testRateLimiterQPS,
+				testLargeClusterThreshold,
+				testUnhealthyThreshold,
+				testNodeMonitorGracePeriod,
+				testNodeStartupGracePeriod,
+				testNodeMonitorPeriod,
+				false)
+			nodeController.cloud = &fakecloud.FakeCloud{}
+			nodeController.now = func() metav1.Time { return metav1.Date(2016, 1, 1, 12, 0, 0, 0, time.UTC) }
+			nodeController.recorder = testutil.NewFakeRecorder()
+			nodeController.nodeShutdownInCloudProvider = func(nodeName types.NodeName) (bool, error) {
+				return tc.shutdown, nil
+			}
+
+			if err := nodeController.syncNodeStore(fnh); err != nil {
+				t.Errorf("unexpected error: %v", err)
+			}
+			if err := nodeController.monitorNodeStatus(); err != nil {
+				t.Errorf("unexpected error: %v", err)
+			}
+
+			if len(fnh.UpdatedNodes) != 1 {
+				t.Errorf("Node was not updated")
+			}
+			if tc.shutdown {
+				if len(fnh.UpdatedNodes[0].Spec.Taints) != 1 {
+					t.Errorf("Node Taint was not added")
+				}
+				if fnh.UpdatedNodes[0].Spec.Taints[0].Key != "node.cloudprovider.kubernetes.io/shutdown" {
+					t.Errorf("Node Taint key is not correct")
+				}
+			} else {
+				if len(fnh.UpdatedNodes[0].Spec.Taints) != 0 {
+					t.Errorf("Node Taint was not removed after node is back in ready state")
+				}
+			}
+		})
+	}
+
+}
+
 // TestCloudProviderNoRateLimit tests that monitorNodes() immediately deletes
 // pods and the node when kubelet has not reported, and the cloudprovider says
 // the node is gone.
@@ -1404,6 +1516,9 @@ func TestCloudProviderNoRateLimit(t *testing.T) {
 	nodeController.nodeExistsInCloudProvider = func(nodeName types.NodeName) (bool, error) {
 		return false, nil
 	}
+	nodeController.nodeShutdownInCloudProvider = func(nodeName types.NodeName) (bool, error) {
+		return false, nil
+	}
 	// monitorNodeStatus should allow this node to be immediately deleted
 	if err := nodeController.syncNodeStore(fnh); err != nil {
 		t.Errorf("unexpected error: %v", err)
@@ -2242,6 +2357,9 @@ func TestNodeEventGeneration(t *testing.T) {
 	nodeController.nodeExistsInCloudProvider = func(nodeName types.NodeName) (bool, error) {
 		return false, nil
 	}
+	nodeController.nodeShutdownInCloudProvider = func(nodeName types.NodeName) (bool, error) {
+		return false, nil
+	}
 	nodeController.now = func() metav1.Time { return fakeNow }
 	fakeRecorder := testutil.NewFakeRecorder()
 	nodeController.recorder = fakeRecorder
@@ -187,6 +187,21 @@ func ExistsInCloudProvider(cloud cloudprovider.Interface, nodeName types.NodeNam
 	return true, nil
 }

+// ShutdownInCloudProvider returns true if the node is shut down in
+// the cloud provider.
+func ShutdownInCloudProvider(cloud cloudprovider.Interface, nodeName types.NodeName) (bool, error) {
+	instances, ok := cloud.Instances()
+	if !ok {
+		return false, fmt.Errorf("%v", ErrCloudInstance)
+	}
+	providerID, err := cloudprovider.GetInstanceProviderID(context.TODO(), cloud, nodeName)
+	if err != nil {
+		return false, err
+	}
+	shutdown, err := instances.InstanceShutdownByProviderID(context.TODO(), providerID)
+	return shutdown, err
+}
+
 // RecordNodeEvent records a event related to a node.
 func RecordNodeEvent(recorder record.EventRecorder, nodeName, nodeUID, eventtype, reason, event string) {
 	ref := &v1.ObjectReference{
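The helper above only forwards to the cloud provider's InstanceShutdownByProviderID. The OpenStack side of that call is not shown in this diff; below is a hypothetical, self-contained sketch of how such a check could look with gophercloud. The "openstack:///<uuid>" provider-ID format and the SHUTOFF status comparison are assumptions for illustration, not code from this commit.

package sketch

import (
	"strings"

	"github.com/gophercloud/gophercloud"
	"github.com/gophercloud/gophercloud/openstack/compute/v2/servers"
)

// instanceShutdown is a hypothetical helper showing how an OpenStack-backed
// InstanceShutdownByProviderID could decide whether a node is powered off.
func instanceShutdown(compute *gophercloud.ServiceClient, providerID string) (bool, error) {
	// Assumed provider-ID format: "openstack:///<server-uuid>".
	serverID := strings.TrimPrefix(providerID, "openstack:///")
	srv, err := servers.Get(compute, serverID).Extract()
	if err != nil {
		return false, err
	}
	// Nova reports powered-off instances with status SHUTOFF.
	return srv.Status == "SHUTOFF", nil
}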
@@ -62,8 +62,6 @@ const (
 	// the taint
 	TaintExternalCloudProvider = "node.cloudprovider.kubernetes.io/uninitialized"

-	// When node is shutdown in external cloud provider
-	// shutdown flag is needed for immediately volume detach
-	// after node comes back, the taint is removed
+	// TaintNodeShutdown when node is shutdown in external cloud provider
 	TaintNodeShutdown = "node.cloudprovider.kubernetes.io/shutdown"
 )
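Because the new taint is applied with the NoSchedule effect, ordinary workloads simply stop being scheduled onto a shut-down node until the taint is removed again. A pod that deliberately needed to ignore it would have to carry a matching toleration; the snippet below is purely illustrative and not part of this commit.

package sketch

import v1 "k8s.io/api/core/v1"

// A toleration matching the shutdown taint defined above; a pod spec carrying this
// toleration would ignore the NoSchedule effect of
// node.cloudprovider.kubernetes.io/shutdown.
var shutdownToleration = v1.Toleration{
	Key:      "node.cloudprovider.kubernetes.io/shutdown",
	Operator: v1.TolerationOpExists,
	Effect:   v1.TaintEffectNoSchedule,
}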