taint also node controller

fix function

fix gofmt

fix function return value

fix tests

skip notimplemented error

remove factory unused

in openstack we should try to find instanceid from all states instead of ACTIVE, all other cloudproviders do this already

fix tests and lint

fix gofmt

fix nodelifecycletest

fix lint errors
pull/6/head
Jesse Haka 2018-02-09 23:05:05 +02:00
parent 3cf5b172fa
commit 6665fa7144
5 changed files with 179 additions and 8 deletions

View File

@ -160,7 +160,8 @@ func (os *OpenStack) InstanceID() (string, error) {
// InstanceID returns the cloud provider ID of the specified instance.
func (i *Instances) InstanceID(ctx context.Context, name types.NodeName) (string, error) {
srv, err := getServerByName(i.compute, name, true)
// we should fetch instanceid from all states instead of ACTIVE
srv, err := getServerByName(i.compute, name, false)
if err != nil {
if err == ErrNotFound {
return "", cloudprovider.InstanceNotFound

View File

@ -79,6 +79,11 @@ var (
Effect: v1.TaintEffectNoExecute,
}
shutDownTaint = &v1.Taint{
Key: algorithm.TaintNodeShutdown,
Effect: v1.TaintEffectNoSchedule,
}
nodeConditionToTaintKeyMap = map[v1.NodeConditionType]string{
v1.NodeMemoryPressure: algorithm.TaintNodeMemoryPressure,
v1.NodeOutOfDisk: algorithm.TaintNodeOutOfDisk,
@ -151,9 +156,10 @@ type Controller struct {
daemonSetStore extensionslisters.DaemonSetLister
daemonSetInformerSynced cache.InformerSynced
nodeLister corelisters.NodeLister
nodeInformerSynced cache.InformerSynced
nodeExistsInCloudProvider func(types.NodeName) (bool, error)
nodeLister corelisters.NodeLister
nodeInformerSynced cache.InformerSynced
nodeExistsInCloudProvider func(types.NodeName) (bool, error)
nodeShutdownInCloudProvider func(types.NodeName) (bool, error)
recorder record.EventRecorder
@ -239,6 +245,9 @@ func NewNodeLifecycleController(podInformer coreinformers.PodInformer,
nodeExistsInCloudProvider: func(nodeName types.NodeName) (bool, error) {
return nodeutil.ExistsInCloudProvider(cloud, nodeName)
},
nodeShutdownInCloudProvider: func(nodeName types.NodeName) (bool, error) {
return nodeutil.ShutdownInCloudProvider(cloud, nodeName)
},
recorder: recorder,
nodeMonitorPeriod: nodeMonitorPeriod,
nodeStartupGracePeriod: nodeStartupGracePeriod,
@ -653,6 +662,11 @@ func (nc *Controller) monitorNodeStatus() error {
glog.V(2).Infof("Node %s is ready again, cancelled pod eviction", node.Name)
}
}
// remove shutdown taint this is needed always depending do we use taintbased or not
err := nc.markNodeAsNotShutdown(node)
if err != nil {
glog.Errorf("Failed to remove taints from node %v. Will retry in next iteration.", node.Name)
}
}
// Report node event.
@ -666,7 +680,21 @@ func (nc *Controller) monitorNodeStatus() error {
// Check with the cloud provider to see if the node still exists. If it
// doesn't, delete the node immediately.
if currentReadyCondition.Status != v1.ConditionTrue && nc.cloud != nil {
exists, err := nc.nodeExistsInCloudProvider(types.NodeName(node.Name))
// check is node shutdowned, if yes do not deleted it. Instead add taint
exists, err := nc.nodeShutdownInCloudProvider(types.NodeName(node.Name))
if err != nil && err != cloudprovider.NotImplemented {
glog.Errorf("Error determining if node %v shutdown in cloud: %v", node.Name, err)
continue
}
// node shutdown
if exists {
err = controller.AddOrUpdateTaintOnNode(nc.kubeClient, node.Name, shutDownTaint)
if err != nil {
glog.Errorf("Error patching node taints: %v", err)
}
continue
}
exists, err = nc.nodeExistsInCloudProvider(types.NodeName(node.Name))
if err != nil {
glog.Errorf("Error determining if node %v exists in cloud: %v", node.Name, err)
continue
@ -1102,6 +1130,17 @@ func (nc *Controller) markNodeAsReachable(node *v1.Node) (bool, error) {
return nc.zoneNoExecuteTainter[utilnode.GetZoneKey(node)].Remove(node.Name), nil
}
func (nc *Controller) markNodeAsNotShutdown(node *v1.Node) error {
nc.evictorLock.Lock()
defer nc.evictorLock.Unlock()
err := controller.RemoveTaintOffNode(nc.kubeClient, node.Name, node, shutDownTaint)
if err != nil {
glog.Errorf("Failed to remove taint from node %v: %v", node.Name, err)
return err
}
return nil
}
// ComputeZoneState returns a slice of NodeReadyConditions for all Nodes in a given zone.
// The zone is considered:
// - fullyDisrupted if there're no Ready Nodes,

View File

@ -1360,6 +1360,118 @@ func TestMonitorNodeStatusEvictPodsWithDisruption(t *testing.T) {
}
}
func TestCloudProviderNodeShutdown(t *testing.T) {
testCases := []struct {
testName string
node *v1.Node
shutdown bool
}{
{
testName: "node shutdowned add taint",
shutdown: true,
node: &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "node0",
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
},
Spec: v1.NodeSpec{
ProviderID: "node0",
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
Type: v1.NodeReady,
Status: v1.ConditionUnknown,
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
},
},
},
},
},
{
testName: "node started after shutdown remove taint",
shutdown: false,
node: &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "node0",
CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
},
Spec: v1.NodeSpec{
ProviderID: "node0",
Taints: []v1.Taint{
{
Key: algorithm.TaintNodeShutdown,
Effect: v1.TaintEffectNoSchedule,
},
},
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
Type: v1.NodeReady,
Status: v1.ConditionTrue,
LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
},
},
},
},
},
}
for _, tc := range testCases {
t.Run(tc.testName, func(t *testing.T) {
fnh := &testutil.FakeNodeHandler{
Existing: []*v1.Node{tc.node},
Clientset: fake.NewSimpleClientset(),
}
nodeController, _ := newNodeLifecycleControllerFromClient(
nil,
fnh,
10*time.Minute,
testRateLimiterQPS,
testRateLimiterQPS,
testLargeClusterThreshold,
testUnhealthyThreshold,
testNodeMonitorGracePeriod,
testNodeStartupGracePeriod,
testNodeMonitorPeriod,
false)
nodeController.cloud = &fakecloud.FakeCloud{}
nodeController.now = func() metav1.Time { return metav1.Date(2016, 1, 1, 12, 0, 0, 0, time.UTC) }
nodeController.recorder = testutil.NewFakeRecorder()
nodeController.nodeShutdownInCloudProvider = func(nodeName types.NodeName) (bool, error) {
return tc.shutdown, nil
}
if err := nodeController.syncNodeStore(fnh); err != nil {
t.Errorf("unexpected error: %v", err)
}
if err := nodeController.monitorNodeStatus(); err != nil {
t.Errorf("unexpected error: %v", err)
}
if len(fnh.UpdatedNodes) != 1 {
t.Errorf("Node was not updated")
}
if tc.shutdown {
if len(fnh.UpdatedNodes[0].Spec.Taints) != 1 {
t.Errorf("Node Taint was not added")
}
if fnh.UpdatedNodes[0].Spec.Taints[0].Key != "node.cloudprovider.kubernetes.io/shutdown" {
t.Errorf("Node Taint key is not correct")
}
} else {
if len(fnh.UpdatedNodes[0].Spec.Taints) != 0 {
t.Errorf("Node Taint was not removed after node is back in ready state")
}
}
})
}
}
// TestCloudProviderNoRateLimit tests that monitorNodes() immediately deletes
// pods and the node when kubelet has not reported, and the cloudprovider says
// the node is gone.
@ -1404,6 +1516,9 @@ func TestCloudProviderNoRateLimit(t *testing.T) {
nodeController.nodeExistsInCloudProvider = func(nodeName types.NodeName) (bool, error) {
return false, nil
}
nodeController.nodeShutdownInCloudProvider = func(nodeName types.NodeName) (bool, error) {
return false, nil
}
// monitorNodeStatus should allow this node to be immediately deleted
if err := nodeController.syncNodeStore(fnh); err != nil {
t.Errorf("unexpected error: %v", err)
@ -2242,6 +2357,9 @@ func TestNodeEventGeneration(t *testing.T) {
nodeController.nodeExistsInCloudProvider = func(nodeName types.NodeName) (bool, error) {
return false, nil
}
nodeController.nodeShutdownInCloudProvider = func(nodeName types.NodeName) (bool, error) {
return false, nil
}
nodeController.now = func() metav1.Time { return fakeNow }
fakeRecorder := testutil.NewFakeRecorder()
nodeController.recorder = fakeRecorder

View File

@ -187,6 +187,21 @@ func ExistsInCloudProvider(cloud cloudprovider.Interface, nodeName types.NodeNam
return true, nil
}
// ShutdownInCloudProvider returns true if the node is shutdowned in
// cloud provider.
func ShutdownInCloudProvider(cloud cloudprovider.Interface, nodeName types.NodeName) (bool, error) {
instances, ok := cloud.Instances()
if !ok {
return false, fmt.Errorf("%v", ErrCloudInstance)
}
providerID, err := cloudprovider.GetInstanceProviderID(context.TODO(), cloud, nodeName)
if err != nil {
return false, err
}
shutdown, err := instances.InstanceShutdownByProviderID(context.TODO(), providerID)
return shutdown, err
}
// RecordNodeEvent records a event related to a node.
func RecordNodeEvent(recorder record.EventRecorder, nodeName, nodeUID, eventtype, reason, event string) {
ref := &v1.ObjectReference{

View File

@ -62,8 +62,6 @@ const (
// the taint
TaintExternalCloudProvider = "node.cloudprovider.kubernetes.io/uninitialized"
// When node is shutdown in external cloud provider
// shutdown flag is needed for immediately volume detach
// after node comes back, the taint is removed
// TaintNodeShutdown when node is shutdown in external cloud provider
TaintNodeShutdown = "node.cloudprovider.kubernetes.io/shutdown"
)