diff --git a/pkg/controller/deployment/deployment_controller.go b/pkg/controller/deployment/deployment_controller.go
index 6f73332f06..fe698f3536 100644
--- a/pkg/controller/deployment/deployment_controller.go
+++ b/pkg/controller/deployment/deployment_controller.go
@@ -806,20 +806,11 @@ func (dc *DeploymentController) reconcileNewRC(allRCs []*api.ReplicationControll
 func (dc *DeploymentController) reconcileOldRCs(allRCs []*api.ReplicationController, oldRCs []*api.ReplicationController, newRC *api.ReplicationController, deployment extensions.Deployment, expectationsCheck bool) (bool, error) {
     oldPodsCount := deploymentutil.GetReplicaCountForRCs(oldRCs)
     if oldPodsCount == 0 {
-        // Cant scale down further
+        // Can't scale down further
         return false, nil
     }
-    maxUnavailable, isPercent, err := util.GetIntOrPercentValue(&deployment.Spec.Strategy.RollingUpdate.MaxUnavailable)
-    if err != nil {
-        return false, fmt.Errorf("invalid value for MaxUnavailable: %v", err)
-    }
-    if isPercent {
-        maxUnavailable = util.GetValueFromPercent(maxUnavailable, deployment.Spec.Replicas)
-    }
-    // Check if we can scale down.
-    minAvailable := deployment.Spec.Replicas - maxUnavailable
-    minReadySeconds := deployment.Spec.MinReadySeconds
-    // Check the expectations of deployment before counting available pods
+
+    // Check the expectations of the deployment before reconciling
     dKey, err := controller.KeyFunc(&deployment)
     if err != nil {
         return false, fmt.Errorf("Couldn't get key for deployment %#v: %v", deployment, err)
@@ -828,20 +819,144 @@ func (dc *DeploymentController) reconcileOldRCs(allRCs []*api.ReplicationControl
         glog.V(4).Infof("Pod expectations not met yet before reconciling old RCs\n")
         return false, nil
     }
-    // Find the number of ready pods.
-    readyPodCount, err := deploymentutil.GetAvailablePodsForRCs(dc.client, allRCs, minReadySeconds)
+
+    minReadySeconds := deployment.Spec.MinReadySeconds
+    allPodsCount := deploymentutil.GetReplicaCountForRCs(allRCs)
+    newRCAvailablePodCount, err := deploymentutil.GetAvailablePodsForRCs(dc.client, []*api.ReplicationController{newRC}, minReadySeconds)
     if err != nil {
         return false, fmt.Errorf("could not find available pods: %v", err)
     }
-    if readyPodCount <= minAvailable {
-        // Cannot scale down.
+    maxUnavailable, isPercent, err := util.GetIntOrPercentValue(&deployment.Spec.Strategy.RollingUpdate.MaxUnavailable)
+    if err != nil {
+        return false, fmt.Errorf("invalid value for MaxUnavailable: %v", err)
+    }
+    if isPercent {
+        maxUnavailable = util.GetValueFromPercent(maxUnavailable, deployment.Spec.Replicas)
+    }
+
+    // Check if we can scale down. We can scale down in the following 2 cases:
+    // * Some old rcs have unhealthy replicas; we can safely scale down those unhealthy replicas since that won't further
+    //   increase unavailability.
+    // * The new rc has scaled up and its replicas become ready, then we can scale down old rcs in a further step.
+    //
+    // maxScaledDown := allPodsCount - minAvailable - newRCPodsUnavailable
+    // This takes into account not only maxUnavailable and any surge pods that have been created, but also unavailable pods
+    // from the newRC, so that unavailable pods from the newRC do not make us scale down old RCs in a further step (which
+    // would increase unavailability).
+    //
+    // Concrete example:
+    //
+    // * 10 replicas
+    // * 2 maxUnavailable (absolute number, not percent)
+    // * 3 maxSurge (absolute number, not percent)
+    //
+    // case 1:
+    // * Deployment is updated, newRC is created with 3 replicas, oldRC is scaled down to 8, and newRC is scaled up to 5.
+    // * The new RC pods crashloop and never become available.
+    // * allPodsCount is 13. minAvailable is 8. newRCPodsUnavailable is 5.
+    // * A node fails and causes one of the oldRC pods to become unavailable. However, 13 - 8 - 5 = 0, so the oldRC won't be scaled down.
+    // * The user notices the crashloop and does kubectl rollout undo to roll back.
+    // * newRCPodsUnavailable is 1, since we rolled back to the good RC, so maxScaledDown = 13 - 8 - 1 = 4. 4 of the crashlooping pods will be scaled down.
+    // * The total number of pods will then be 9 and the newRC can be scaled up to 10.
+    //
+    // case 2:
+    // Same example, but pushing a new pod template instead of rolling back (aka "roll over"):
+    // * The new RC created must start with 0 replicas because allPodsCount is already at 13.
+    // * However, newRCPodsUnavailable would also be 0, so the 2 old RCs could be scaled down by 5 (13 - 8 - 0), which would then
+    //   allow the new RC to be scaled up by 5.
+    minAvailable := deployment.Spec.Replicas - maxUnavailable
+    newRCUnavailablePodCount := newRC.Spec.Replicas - newRCAvailablePodCount
+    maxScaledDown := allPodsCount - minAvailable - newRCUnavailablePodCount
+    if maxScaledDown <= 0 {
         return false, nil
     }
-    totalScaleDownCount := readyPodCount - minAvailable
+
+    // Clean up unhealthy replicas first, otherwise unhealthy replicas will block the deployment
+    // and cause a timeout. See https://github.com/kubernetes/kubernetes/issues/16737
+    cleanupCount, err := dc.cleanupUnhealthyReplicas(oldRCs, deployment, maxScaledDown)
+    if err != nil {
+        return false, err
+    }
+
+    // Scale down old rcs; maxUnavailable is checked there to make sure we can scale down safely.
+    scaledDownCount, err := dc.scaleDownOldRCsForRollingUpdate(allRCs, oldRCs, deployment)
+    if err != nil {
+        return false, err
+    }
+
+    totalScaledDown := cleanupCount + scaledDownCount
+    if expectationsCheck {
+        dc.podExpectations.ExpectDeletions(dKey, totalScaledDown)
+    }
+
+    return totalScaledDown > 0, nil
+}
+
+// cleanupUnhealthyReplicas will scale down old rcs with unhealthy replicas, so that all unhealthy replicas will be deleted.
+func (dc *DeploymentController) cleanupUnhealthyReplicas(oldRCs []*api.ReplicationController, deployment extensions.Deployment, maxCleanupCount int) (int, error) {
+    sort.Sort(controller.ControllersByCreationTimestamp(oldRCs))
+    // Safely scale down all old rcs with unhealthy replicas. ReplicationController/ReplicaSet will sort the pods in the order
+    // such that not-ready < ready, unscheduled < scheduled, and pending < running. This ensures that unhealthy replicas will
+    // be deleted first and won't increase unavailability.
     totalScaledDown := 0
     for _, targetRC := range oldRCs {
-        if totalScaleDownCount == 0 {
+        if totalScaledDown >= maxCleanupCount {
+            break
+        }
+        if targetRC.Spec.Replicas == 0 {
+            // cannot scale down this RC.
+            continue
+        }
+        readyPodCount, err := deploymentutil.GetAvailablePodsForRCs(dc.client, []*api.ReplicationController{targetRC}, 0)
+        if err != nil {
+            return totalScaledDown, fmt.Errorf("could not find available pods: %v", err)
+        }
+        if targetRC.Spec.Replicas == readyPodCount {
+            // no unhealthy replicas found, no scaling required.
+            continue
+        }
+
+        scaledDownCount := int(math.Min(float64(maxCleanupCount-totalScaledDown), float64(targetRC.Spec.Replicas-readyPodCount)))
+        newReplicasCount := targetRC.Spec.Replicas - scaledDownCount
+        _, err = dc.scaleRCAndRecordEvent(targetRC, newReplicasCount, deployment)
+        if err != nil {
+            return totalScaledDown, err
+        }
+        totalScaledDown += scaledDownCount
+    }
+    return totalScaledDown, nil
+}
+
+// scaleDownOldRCsForRollingUpdate scales down old rcs when the deployment strategy is "RollingUpdate".
+// It checks maxUnavailable to ensure availability is preserved.
+func (dc *DeploymentController) scaleDownOldRCsForRollingUpdate(allRCs []*api.ReplicationController, oldRCs []*api.ReplicationController, deployment extensions.Deployment) (int, error) {
+    maxUnavailable, isPercent, err := util.GetIntOrPercentValue(&deployment.Spec.Strategy.RollingUpdate.MaxUnavailable)
+    if err != nil {
+        return 0, fmt.Errorf("invalid value for MaxUnavailable: %v", err)
+    }
+    if isPercent {
+        maxUnavailable = util.GetValueFromPercent(maxUnavailable, deployment.Spec.Replicas)
+    }
+    // Check if we can scale down.
+    minAvailable := deployment.Spec.Replicas - maxUnavailable
+    minReadySeconds := deployment.Spec.MinReadySeconds
+    // Find the number of ready pods.
+    readyPodCount, err := deploymentutil.GetAvailablePodsForRCs(dc.client, allRCs, minReadySeconds)
+    if err != nil {
+        return 0, fmt.Errorf("could not find available pods: %v", err)
+    }
+    if readyPodCount <= minAvailable {
+        // Cannot scale down.
+        return 0, nil
+    }
+
+    sort.Sort(controller.ControllersByCreationTimestamp(oldRCs))
+
+    totalScaledDown := 0
+    totalScaleDownCount := readyPodCount - minAvailable
+    for _, targetRC := range oldRCs {
+        if totalScaledDown >= totalScaleDownCount {
             // No further scaling required.
             break
         }
@@ -850,24 +965,17 @@ func (dc *DeploymentController) reconcileOldRCs(allRCs []*api.ReplicationControl
             continue
         }
         // Scale down.
-        scaleDownCount := int(math.Min(float64(targetRC.Spec.Replicas), float64(totalScaleDownCount)))
+        scaleDownCount := int(math.Min(float64(targetRC.Spec.Replicas), float64(totalScaleDownCount-totalScaledDown)))
         newReplicasCount := targetRC.Spec.Replicas - scaleDownCount
         _, err = dc.scaleRCAndRecordEvent(targetRC, newReplicasCount, deployment)
         if err != nil {
-            return false, err
+            return totalScaledDown, err
         }
+        totalScaledDown += scaleDownCount
-        totalScaleDownCount -= scaleDownCount
     }
-    // Expect to see old rcs scaled down by exactly totalScaledDownCount (sum of scaleDownCount) replicas.
-    dKey, err = controller.KeyFunc(&deployment)
-    if err != nil {
-        return false, fmt.Errorf("Couldn't get key for deployment %#v: %v", deployment, err)
-    }
-    if expectationsCheck {
-        dc.podExpectations.ExpectDeletions(dKey, totalScaledDown)
-    }
-    return true, err
+
+    return totalScaledDown, nil
 }
 
 // scaleDownOldRCsForRecreate scales down old rcs when deployment strategy is "Recreate"
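To make the arithmetic in the reconcileOldRCs comment above concrete, here is a minimal standalone Go sketch that evaluates maxScaledDown for the two cases described there. It is not part of the patch, and the maxScaledDown helper below is illustrative only, not the controller's real API.

package main

import "fmt"

// maxScaledDown mirrors the formula used by reconcileOldRCs:
// allPodsCount - (replicas - maxUnavailable) - newRCUnavailable.
func maxScaledDown(allPodsCount, replicas, maxUnavailable, newRCUnavailable int) int {
	minAvailable := replicas - maxUnavailable
	return allPodsCount - minAvailable - newRCUnavailable
}

func main() {
	// Case 1: 13 total pods, 10 desired replicas, maxUnavailable=2, 5 unavailable new-RC pods.
	fmt.Println(maxScaledDown(13, 10, 2, 5)) // 0: old RCs are not scaled down
	// After rolling back, only 1 new-RC pod is unavailable.
	fmt.Println(maxScaledDown(13, 10, 2, 1)) // 4: four crashlooping pods can be removed
	// Case 2 (roll over): the replacement RC starts at 0 replicas, so 0 of its pods are unavailable.
	fmt.Println(maxScaledDown(13, 10, 2, 0)) // 5: old RCs can be scaled down by 5
}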
fmt.Sprintf("%s-oldUnhealthyPod-%d", oldRc.Name, podIndex), + Labels: oldSelector, + }, + Status: api.PodStatus{ + Conditions: []api.PodCondition{ + { + Type: api.PodReady, + Status: api.ConditionFalse, + }, + }, + }, + }) + } + for podIndex := 0; podIndex < test.readyPodsFromNewRC; podIndex++ { + podList.Items = append(podList.Items, api.Pod{ + ObjectMeta: api.ObjectMeta{ + Name: fmt.Sprintf("%s-newReadyPod-%d", oldRc.Name, podIndex), + Labels: newSelector, + }, + Status: api.PodStatus{ + Conditions: []api.PodCondition{ + { + Type: api.PodReady, + Status: api.ConditionTrue, + }, + }, + }, + }) + } + for podIndex := 0; podIndex < test.oldReplicas-test.readyPodsFromOldRC; podIndex++ { + podList.Items = append(podList.Items, api.Pod{ + ObjectMeta: api.ObjectMeta{ + Name: fmt.Sprintf("%s-newUnhealthyPod-%d", oldRc.Name, podIndex), + Labels: newSelector, + }, + Status: api.PodStatus{ + Conditions: []api.PodCondition{ + { + Type: api.PodReady, + Status: api.ConditionFalse, + }, + }, + }, + }) + } + return true, podList, nil + } + return false, nil, nil + }) + controller := &DeploymentController{ + client: &fakeClientset, + eventRecorder: &record.FakeRecorder{}, + } + + scaled, err := controller.reconcileOldRCs(allRCs, oldRCs, newRc, deployment, false) + if err != nil { + t.Errorf("unexpected error: %v", err) + continue + } + if !test.scaleExpected && scaled { + t.Errorf("unexpected scaling: %v", fakeClientset.Actions()) + } + if test.scaleExpected && !scaled { + t.Errorf("expected scaling to occur") + continue + } + continue + } +} + +func TestDeploymentController_cleanupUnhealthyReplicas(t *testing.T) { + tests := []struct { + oldReplicas int + readyPods int + unHealthyPods int + maxCleanupCount int + cleanupCountExpected int + }{ + { + oldReplicas: 10, + readyPods: 8, + unHealthyPods: 2, + maxCleanupCount: 1, + cleanupCountExpected: 1, + }, + { + oldReplicas: 10, + readyPods: 8, + unHealthyPods: 2, + maxCleanupCount: 3, + cleanupCountExpected: 2, + }, + { + oldReplicas: 10, + readyPods: 8, + unHealthyPods: 2, + maxCleanupCount: 0, + cleanupCountExpected: 0, + }, + { + oldReplicas: 10, + readyPods: 10, + unHealthyPods: 0, + maxCleanupCount: 3, + cleanupCountExpected: 0, + }, + } + + for i, test := range tests { + t.Logf("executing scenario %d", i) + oldRc := rc("foo-v2", test.oldReplicas, nil) + oldRCs := []*api.ReplicationController{oldRc} + deployment := deployment("foo", 10, intstr.FromInt(2), intstr.FromInt(2)) + fakeClientset := fake.Clientset{} + fakeClientset.AddReactor("list", "pods", func(action core.Action) (handled bool, ret runtime.Object, err error) { + switch action.(type) { + case core.ListAction: + podList := &api.PodList{} + for podIndex := 0; podIndex < test.readyPods; podIndex++ { + podList.Items = append(podList.Items, api.Pod{ + ObjectMeta: api.ObjectMeta{ + Name: fmt.Sprintf("%s-readyPod-%d", oldRc.Name, podIndex), + }, + Status: api.PodStatus{ + Conditions: []api.PodCondition{ + { + Type: api.PodReady, + Status: api.ConditionTrue, + }, + }, + }, + }) + } + for podIndex := 0; podIndex < test.unHealthyPods; podIndex++ { + podList.Items = append(podList.Items, api.Pod{ + ObjectMeta: api.ObjectMeta{ + Name: fmt.Sprintf("%s-unHealthyPod-%d", oldRc.Name, podIndex), + }, + Status: api.PodStatus{ + Conditions: []api.PodCondition{ + { + Type: api.PodReady, + Status: api.ConditionFalse, + }, + }, + }, + }) + } + return true, podList, nil + } + return false, nil, nil + }) + + controller := &DeploymentController{ + client: &fakeClientset, + eventRecorder: &record.FakeRecorder{}, + } 
+        cleanupCount, err := controller.cleanupUnhealthyReplicas(oldRCs, deployment, test.maxCleanupCount)
+        if err != nil {
+            t.Errorf("unexpected error: %v", err)
+            continue
+        }
+        if cleanupCount != test.cleanupCountExpected {
+            t.Errorf("expected %v unhealthy replicas to be cleaned up, got %v", test.cleanupCountExpected, cleanupCount)
+            continue
+        }
+    }
+}
+
+func TestDeploymentController_scaleDownOldRCsForRollingUpdate(t *testing.T) {
     tests := []struct {
         deploymentReplicas int
         maxUnavailable     intstr.IntOrString
@@ -196,18 +460,18 @@ func TestDeploymentController_reconcileOldRCs(t *testing.T) {
             client:        &fakeClientset,
             eventRecorder: &record.FakeRecorder{},
         }
-        scaled, err := controller.reconcileOldRCs(allRcs, oldRcs, nil, deployment, false)
+        scaled, err := controller.scaleDownOldRCsForRollingUpdate(allRcs, oldRcs, deployment)
         if err != nil {
             t.Errorf("unexpected error: %v", err)
             continue
         }
         if !test.scaleExpected {
-            if scaled {
+            if scaled != 0 {
                 t.Errorf("unexpected scaling: %v", fakeClientset.Actions())
             }
             continue
         }
-        if test.scaleExpected && !scaled {
+        if test.scaleExpected && scaled == 0 {
             t.Errorf("expected scaling to occur; actions: %v", fakeClientset.Actions())
             continue
         }
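The tests above exercise the two scale-down phases separately (cleanupUnhealthyReplicas, then scaleDownOldRCsForRollingUpdate). As a rough mental model of that control flow, here is a simplified, dependency-free sketch; the rc struct and helper names are illustrative and do not correspond to the real controller types.

package main

import "fmt"

// rc is a stand-in for a replication controller: desired replicas and ready replicas.
type rc struct {
	name     string
	replicas int
	ready    int
}

func minInt(a, b int) int {
	if a < b {
		return a
	}
	return b
}

// cleanupUnhealthy mirrors phase 1: scale old rcs down by their not-ready replicas,
// up to maxCleanup, since deleting unhealthy pods cannot reduce availability.
func cleanupUnhealthy(old []*rc, maxCleanup int) int {
	total := 0
	for _, r := range old {
		if total >= maxCleanup {
			break
		}
		if r.replicas == 0 || r.replicas == r.ready {
			continue
		}
		n := minInt(maxCleanup-total, r.replicas-r.ready)
		r.replicas -= n
		total += n
	}
	return total
}

// scaleDownHealthy mirrors phase 2: scale old rcs down while keeping at least
// minAvailable ready pods across the deployment.
func scaleDownHealthy(old []*rc, readyPods, minAvailable int) int {
	if readyPods <= minAvailable {
		return 0 // cannot scale down without violating maxUnavailable
	}
	budget := readyPods - minAvailable
	total := 0
	for _, r := range old {
		if total >= budget {
			break
		}
		if r.replicas == 0 {
			continue
		}
		n := minInt(r.replicas, budget-total)
		r.replicas -= n
		total += n
	}
	return total
}

func main() {
	// One old rc with 10 desired replicas, 8 of them ready; maxScaledDown = 2, minAvailable = 8.
	old := []*rc{{name: "foo-old", replicas: 10, ready: 8}}
	cleaned := cleanupUnhealthy(old, 2)           // removes the 2 unhealthy replicas
	scaled := scaleDownHealthy(old, 8, 8)         // 8 ready pods == minAvailable, so nothing more
	fmt.Println(cleaned, scaled, old[0].replicas) // prints: 2 0 8
}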