Merge pull request #60202 from clamoriniere1A/feature/JobBackoffWithParallelism

Automatic merge from submit-queue (batch tested with PRs 60054, 60202, 60219, 58090, 60275). If you want to cherry-pick this change to another branch, please follow the instructions <a href="https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md">here</a>. Improves backoff policy in JobController **What this PR does / why we need it**: This PR fixes issue #56853. It improves the "Job backoff policy" when a Job is configured to allow parallelism and a few of the Job's pods fail while others succeed. The controller now checks whether the number of succeeded pods has increased since the last check. If so, the backoff delay is cleared. **Which issue(s) this PR fixes**: Fixes #56853 **Special notes for your reviewer**: **Release note**: ```release-note NONE ```
2018-02-23 23:15:37 -08:00 · 2018-02-23 23:15:37 -08:00 · cf6d59ef38
parent 3c2a0c84c5 c6e8bd62ad
commit cf6d59ef38
2 changed files with 17 additions and 4 deletions
--- a/pkg/controller/job/job_controller.go
+++ b/pkg/controller/job/job_controller.go
@ -553,6 +553,14 @@ func (jm *JobController) syncJob(key string) (bool, error) {
 	}

 	forget := false
+	// Check whether the number of succeeded pods has increased since the last check. If so, "forget" should be true.
+	// This logic is linked to issue https://github.com/kubernetes/kubernetes/issues/56853, which aims to
+	// improve the Job backoff policy when parallelism > 1 and a few pods have failed while others succeeded.
+	// In this case, we should clear the backoff delay.
+	if job.Status.Succeeded < succeeded {
+		forget = true
+	}
+
 	// no need to update the job if the status hasn't changed since last time
 	if job.Status.Active != active || job.Status.Succeeded != succeeded || job.Status.Failed != failed || len(job.Status.Conditions) != conditions {
 		job.Status.Active = active
@ -560,12 +568,12 @@ func (jm *JobController) syncJob(key string) (bool, error) {
 		job.Status.Failed = failed

 		if err := jm.updateHandler(&job); err != nil {
-			return false, err
+			return forget, err
 		}

 		if jobHaveNewFailure && !IsJobFinished(&job) {
 			// returning an error will re-enqueue Job after the backoff period
-			return false, fmt.Errorf("failed pod(s) detected for job key %q", key)
+			return forget, fmt.Errorf("failed pod(s) detected for job key %q", key)
 		}

 		forget = true
--- a/pkg/controller/job/job_controller_test.go
+++ b/pkg/controller/job/job_controller_test.go
@ -218,11 +218,16 @@ func TestControllerSyncJob(t *testing.T) {
 			fmt.Errorf("Fake error"), true, 0, 3, 0, 0,
 			0, 1, 3, 0, 0, nil, "",
 		},
-		"failed pod": {
+		"failed + succeed pods: reset backoff delay": {
 			2, 5, 6, false, 0,
-			fmt.Errorf("Fake error"), false, 0, 1, 1, 1,
+			fmt.Errorf("Fake error"), true, 0, 1, 1, 1,
 			1, 0, 1, 1, 1, nil, "",
 		},
+		"only new failed pod": {
+			2, 5, 6, false, 0,
+			fmt.Errorf("Fake error"), false, 0, 1, 0, 1,
+			1, 0, 1, 0, 1, nil, "",
+		},
 		"job finish": {
 			2, 5, 6, false, 0,
 			nil, true, 0, 0, 5, 0,