diff --git a/api/swagger-spec/v1.json b/api/swagger-spec/v1.json index 70c4a124c5..0db5ef03d3 100644 --- a/api/swagger-spec/v1.json +++ b/api/swagger-spec/v1.json @@ -11017,6 +11017,11 @@ "type": "string", "description": "RFC 3339 date and time at which the object will be deleted; populated by the system when a graceful deletion is requested, read-only; if not set, graceful deletion of the object has not been requested; see http://releases.k8s.io/HEAD/docs/devel/api-conventions.md#metadata" }, + "deletionGracePeriodSeconds": { + "type": "integer", + "format": "int64", + "description": "number of seconds allowed for this object to gracefully terminate before it will be removed from the system; only set when deletionTimestamp is also set, read-only; may only be shortened" + }, "labels": { "type": "any", "description": "map of string keys and values that can be used to organize and categorize objects; may match selectors of replication controllers and services; see http://releases.k8s.io/HEAD/docs/user-guide/labels.md" @@ -12327,7 +12332,7 @@ "terminationGracePeriodSeconds": { "type": "integer", "format": "int64", - "description": "optional duration in seconds the pod needs to terminate gracefully; may be decreased in delete request; value must be non-negative integer; the value zero indicates delete immediately; if this value is not set, the default grace period will be used instead; the grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal; set this value longer than the expected cleanup time for your process" + "description": "optional duration in seconds the pod needs to terminate gracefully; may be decreased in delete request; value must be non-negative integer; the value zero indicates delete immediately; if this value is not set, the default grace period will be used instead; the grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal; set this value longer than the expected cleanup time for your process; defaults to 30 seconds" }, "activeDeadlineSeconds": { "type": "integer", diff --git a/cluster/saltbase/salt/fluentd-es/fluentd-es.yaml b/cluster/saltbase/salt/fluentd-es/fluentd-es.yaml index a2075fd927..d5419146f4 100644 --- a/cluster/saltbase/salt/fluentd-es/fluentd-es.yaml +++ b/cluster/saltbase/salt/fluentd-es/fluentd-es.yaml @@ -18,6 +18,7 @@ spec: mountPath: /varlog - name: containers mountPath: /var/lib/docker/containers + terminationGracePeriodSeconds: 30 volumes: - name: varlog hostPath: diff --git a/cluster/saltbase/salt/fluentd-gcp/fluentd-gcp.yaml b/cluster/saltbase/salt/fluentd-gcp/fluentd-gcp.yaml index c6b1547af3..31ba54acb7 100644 --- a/cluster/saltbase/salt/fluentd-gcp/fluentd-gcp.yaml +++ b/cluster/saltbase/salt/fluentd-gcp/fluentd-gcp.yaml @@ -19,6 +19,7 @@ spec: mountPath: /varlog - name: containers mountPath: /var/lib/docker/containers + terminationGracePeriodSeconds: 30 volumes: - name: varlog hostPath: diff --git a/cmd/integration/integration.go b/cmd/integration/integration.go index 6f3246666d..44715dbac7 100644 --- a/cmd/integration/integration.go +++ b/cmd/integration/integration.go @@ -196,7 +196,7 @@ func startComponents(firstManifestURL, secondManifestURL, apiVersion string) (st // TODO: Write an integration test for the replication controllers watch. 
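The swagger and manifest changes above introduce deletionGracePeriodSeconds and a 30-second default for terminationGracePeriodSeconds; the rest of this diff is driven by the DeleteOptions a caller supplies. The sketch below is illustrative only and not part of the patch: api.NewDeleteOptions and Pods(...).Delete are taken from code elsewhere in this diff, while the client construction in main (NewOrDie, Config, the localhost address) is an assumption about the pkg/client API of this era.

package main

import (
	"fmt"

	"k8s.io/kubernetes/pkg/api"
	"k8s.io/kubernetes/pkg/client"
)

// deleteWithGrace shows the three delete modes the rest of this diff exercises.
func deleteWithGrace(c client.Interface) error {
	// nil options: the server applies the pod's terminationGracePeriodSeconds (30s by default).
	if err := c.Pods(api.NamespaceDefault).Delete("graceful-pod", nil); err != nil {
		return err
	}
	// an explicit, shorter grace period: permitted even when a graceful delete is already pending.
	if err := c.Pods(api.NamespaceDefault).Delete("hurried-pod", api.NewDeleteOptions(5)); err != nil {
		return err
	}
	// zero grace period: immediate removal, what the updated tests request via --grace-period=0.
	return c.Pods(api.NamespaceDefault).Delete("doomed-pod", api.NewDeleteOptions(0))
}

func main() {
	// assumes an apiserver reachable locally; NewOrDie/Config usage is a guess, not taken from this diff
	c := client.NewOrDie(&client.Config{Host: "http://127.0.0.1:8080", Version: "v1"})
	if err := deleteWithGrace(c); err != nil {
		fmt.Printf("delete failed: %v\n", err)
	}
}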
go controllerManager.Run(3, util.NeverStop) - nodeController := nodecontroller.NewNodeController(nil, cl, 5*time.Minute, nodecontroller.NewPodEvictor(util.NewFakeRateLimiter()), + nodeController := nodecontroller.NewNodeController(nil, cl, 5*time.Minute, util.NewFakeRateLimiter(), 40*time.Second, 60*time.Second, 5*time.Second, nil, false) nodeController.Run(5 * time.Second) cadvisorInterface := new(cadvisor.Fake) @@ -872,18 +872,25 @@ func runSchedulerNoPhantomPodsTest(client *client.Client) { // Delete a pod to free up room. glog.Infof("Deleting pod %v", bar.Name) - err = client.Pods(api.NamespaceDefault).Delete(bar.Name, nil) + err = client.Pods(api.NamespaceDefault).Delete(bar.Name, api.NewDeleteOptions(0)) if err != nil { glog.Fatalf("FAILED: couldn't delete pod %q: %v", bar.Name, err) } + //TODO: reenable once fake_docker_client handles deletion cleanly + //time.Sleep(2 * time.Second) + pod.ObjectMeta.Name = "phantom.baz" baz, err := client.Pods(api.NamespaceDefault).Create(pod) if err != nil { glog.Fatalf("Failed to create pod: %v, %v", pod, err) } if err := wait.Poll(time.Second, time.Second*60, podRunning(client, baz.Namespace, baz.Name)); err != nil { - glog.Fatalf("FAILED: (Scheduler probably didn't process deletion of 'phantom.bar') Pod never started running: %v", err) + if pod, perr := client.Pods(api.NamespaceDefault).Get("phantom.bar"); perr == nil { + glog.Fatalf("FAILED: 'phantom.bar' was never deleted: %#v", pod) + } else { + glog.Fatalf("FAILED: (Scheduler probably didn't process deletion of 'phantom.bar') Pod never started running: %v", err) + } } glog.Info("Scheduler doesn't make phantom pods: test passed.") diff --git a/cmd/kube-controller-manager/app/controllermanager.go b/cmd/kube-controller-manager/app/controllermanager.go index 935224d9d0..7deea27fb0 100644 --- a/cmd/kube-controller-manager/app/controllermanager.go +++ b/cmd/kube-controller-manager/app/controllermanager.go @@ -196,7 +196,7 @@ func (s *CMServer) Run(_ []string) error { } nodeController := nodecontroller.NewNodeController(cloud, kubeClient, - s.PodEvictionTimeout, nodecontroller.NewPodEvictor(util.NewTokenBucketRateLimiter(s.DeletingPodsQps, s.DeletingPodsBurst)), + s.PodEvictionTimeout, util.NewTokenBucketRateLimiter(s.DeletingPodsQps, s.DeletingPodsBurst), s.NodeMonitorGracePeriod, s.NodeStartupGracePeriod, s.NodeMonitorPeriod, &s.ClusterCIDR, s.AllocateNodeCIDRs) nodeController.Run(s.NodeSyncPeriod) diff --git a/contrib/mesos/pkg/controllermanager/controllermanager.go b/contrib/mesos/pkg/controllermanager/controllermanager.go index f354430640..ad6aa59408 100644 --- a/contrib/mesos/pkg/controllermanager/controllermanager.go +++ b/contrib/mesos/pkg/controllermanager/controllermanager.go @@ -123,7 +123,7 @@ func (s *CMServer) Run(_ []string) error { } nodeController := nodecontroller.NewNodeController(cloud, kubeClient, - s.PodEvictionTimeout, nodecontroller.NewPodEvictor(util.NewTokenBucketRateLimiter(s.DeletingPodsQps, s.DeletingPodsBurst)), + s.PodEvictionTimeout, util.NewTokenBucketRateLimiter(s.DeletingPodsQps, s.DeletingPodsBurst), s.NodeMonitorGracePeriod, s.NodeStartupGracePeriod, s.NodeMonitorPeriod, (*net.IPNet)(&s.ClusterCIDR), s.AllocateNodeCIDRs) nodeController.Run(s.NodeSyncPeriod) diff --git a/docs/getting-started-guides/logging.md b/docs/getting-started-guides/logging.md index 7062a0a09e..2d61fa0cc8 100644 --- a/docs/getting-started-guides/logging.md +++ b/docs/getting-started-guides/logging.md @@ -182,6 +182,7 @@ spec: mountPath: /varlog - name: containers mountPath: 
/var/lib/docker/containers + terminationGracePeriodSeconds: 30 volumes: - name: varlog hostPath: diff --git a/hack/lib/test.sh b/hack/lib/test.sh index 02a4c8ad9e..c48337763a 100644 --- a/hack/lib/test.sh +++ b/hack/lib/test.sh @@ -23,7 +23,7 @@ readonly red=$(tput setaf 1) readonly green=$(tput setaf 2) kube::test::clear_all() { - kubectl delete "${kube_flags[@]}" rc,pods --all + kubectl delete "${kube_flags[@]}" rc,pods --all --grace-period=0 } kube::test::get_object_assert() { diff --git a/hack/test-cmd.sh b/hack/test-cmd.sh index 4e1635ca7a..b872eff37e 100755 --- a/hack/test-cmd.sh +++ b/hack/test-cmd.sh @@ -246,7 +246,7 @@ runTests() { # Pre-condition: valid-pod POD is running kube::test::get_object_assert pods "{{range.items}}{{$id_field}}:{{end}}" 'valid-pod:' # Command - kubectl delete pod valid-pod "${kube_flags[@]}" + kubectl delete pod valid-pod "${kube_flags[@]}" --grace-period=0 # Post-condition: no POD is running kube::test::get_object_assert pods "{{range.items}}{{$id_field}}:{{end}}" '' @@ -262,7 +262,7 @@ runTests() { # Pre-condition: valid-pod POD is running kube::test::get_object_assert pods "{{range.items}}{{$id_field}}:{{end}}" 'valid-pod:' # Command - kubectl delete -f docs/admin/limitrange/valid-pod.yaml "${kube_flags[@]}" + kubectl delete -f docs/admin/limitrange/valid-pod.yaml "${kube_flags[@]}" --grace-period=0 # Post-condition: no POD is running kube::test::get_object_assert pods "{{range.items}}{{$id_field}}:{{end}}" '' @@ -278,7 +278,7 @@ runTests() { # Pre-condition: valid-pod POD is running kube::test::get_object_assert "pods -l'name in (valid-pod)'" '{{range.items}}{{$id_field}}:{{end}}' 'valid-pod:' # Command - kubectl delete pods -l'name in (valid-pod)' "${kube_flags[@]}" + kubectl delete pods -l'name in (valid-pod)' "${kube_flags[@]}" --grace-period=0 # Post-condition: no POD is running kube::test::get_object_assert "pods -l'name in (valid-pod)'" '{{range.items}}{{$id_field}}:{{end}}' '' @@ -310,7 +310,7 @@ runTests() { # Pre-condition: valid-pod POD is running kube::test::get_object_assert pods "{{range.items}}{{$id_field}}:{{end}}" 'valid-pod:' # Command - kubectl delete --all pods "${kube_flags[@]}" # --all remove all the pods + kubectl delete --all pods "${kube_flags[@]}" --grace-period=0 # --all remove all the pods # Post-condition: no POD is running kube::test::get_object_assert "pods -l'name in (valid-pod)'" '{{range.items}}{{$id_field}}:{{end}}' '' @@ -327,7 +327,7 @@ runTests() { # Pre-condition: valid-pod and redis-proxy PODs are running kube::test::get_object_assert pods "{{range.items}}{{$id_field}}:{{end}}" 'redis-proxy:valid-pod:' # Command - kubectl delete pods valid-pod redis-proxy "${kube_flags[@]}" # delete multiple pods at once + kubectl delete pods valid-pod redis-proxy "${kube_flags[@]}" --grace-period=0 # delete multiple pods at once # Post-condition: no POD is running kube::test::get_object_assert pods "{{range.items}}{{$id_field}}:{{end}}" '' @@ -344,7 +344,7 @@ runTests() { # Pre-condition: valid-pod and redis-proxy PODs are running kube::test::get_object_assert pods "{{range.items}}{{$id_field}}:{{end}}" 'redis-proxy:valid-pod:' # Command - kubectl stop pods valid-pod redis-proxy "${kube_flags[@]}" # stop multiple pods at once + kubectl stop pods valid-pod redis-proxy "${kube_flags[@]}" --grace-period=0 # stop multiple pods at once # Post-condition: no POD is running kube::test::get_object_assert pods "{{range.items}}{{$id_field}}:{{end}}" '' @@ -368,7 +368,7 @@ runTests() { # Pre-condition: valid-pod POD is running 
kube::test::get_object_assert pods "{{range.items}}{{$id_field}}:{{end}}" 'valid-pod:' # Command - kubectl delete pods -lnew-name=new-valid-pod "${kube_flags[@]}" + kubectl delete pods -lnew-name=new-valid-pod --grace-period=0 "${kube_flags[@]}" # Post-condition: no POD is running kube::test::get_object_assert pods "{{range.items}}{{$id_field}}:{{end}}" '' @@ -419,7 +419,7 @@ runTests() { # Pre-condition: valid-pod POD is running kube::test::get_object_assert pods "{{range.items}}{{$id_field}}:{{end}}" 'valid-pod:' # Command - kubectl delete pods -l'name in (valid-pod-super-sayan)' "${kube_flags[@]}" + kubectl delete pods -l'name in (valid-pod-super-sayan)' --grace-period=0 "${kube_flags[@]}" # Post-condition: no POD is running kube::test::get_object_assert pods "{{range.items}}{{$id_field}}:{{end}}" '' @@ -455,7 +455,7 @@ runTests() { # Pre-condition: valid-pod POD is running kube::test::get_object_assert 'pods --namespace=other' "{{range.items}}{{$id_field}}:{{end}}" 'valid-pod:' # Command - kubectl delete "${kube_flags[@]}" pod --namespace=other valid-pod + kubectl delete "${kube_flags[@]}" pod --namespace=other valid-pod --grace-period=0 # Post-condition: no POD is running kube::test::get_object_assert 'pods --namespace=other' "{{range.items}}{{$id_field}}:{{end}}" '' diff --git a/hack/verify-flags/known-flags.txt b/hack/verify-flags/known-flags.txt index e843d901ec..c6e4467647 100644 --- a/hack/verify-flags/known-flags.txt +++ b/hack/verify-flags/known-flags.txt @@ -51,6 +51,7 @@ current-release-pr current-replicas default-container-cpu-limit default-container-mem-limit +delay-shutdown deleting-pods-burst deleting-pods-qps deployment-label-key diff --git a/pkg/api/deep_copy_generated.go b/pkg/api/deep_copy_generated.go index 2a88c4cda3..f4acae42ea 100644 --- a/pkg/api/deep_copy_generated.go +++ b/pkg/api/deep_copy_generated.go @@ -1011,6 +1011,12 @@ func deepCopy_api_ObjectMeta(in ObjectMeta, out *ObjectMeta, c *conversion.Clone } else { out.DeletionTimestamp = nil } + if in.DeletionGracePeriodSeconds != nil { + out.DeletionGracePeriodSeconds = new(int64) + *out.DeletionGracePeriodSeconds = *in.DeletionGracePeriodSeconds + } else { + out.DeletionGracePeriodSeconds = nil + } if in.Labels != nil { out.Labels = make(map[string]string) for key, val := range in.Labels { diff --git a/pkg/api/rest/create.go b/pkg/api/rest/create.go index 3d51ded7e7..e653a6b74a 100644 --- a/pkg/api/rest/create.go +++ b/pkg/api/rest/create.go @@ -59,6 +59,8 @@ func BeforeCreate(strategy RESTCreateStrategy, ctx api.Context, obj runtime.Obje } else { objectMeta.Namespace = api.NamespaceNone } + objectMeta.DeletionTimestamp = nil + objectMeta.DeletionGracePeriodSeconds = nil strategy.PrepareForCreate(obj) api.FillObjectMetaSystemFields(ctx, objectMeta) api.GenerateName(strategy, objectMeta) diff --git a/pkg/api/rest/delete.go b/pkg/api/rest/delete.go index 9d433f80fe..c9df5970ef 100644 --- a/pkg/api/rest/delete.go +++ b/pkg/api/rest/delete.go @@ -17,8 +17,11 @@ limitations under the License. 
package rest import ( + "time" + "k8s.io/kubernetes/pkg/api" "k8s.io/kubernetes/pkg/runtime" + "k8s.io/kubernetes/pkg/util" ) // RESTDeleteStrategy defines deletion behavior on an object that follows Kubernetes @@ -40,12 +43,41 @@ func BeforeDelete(strategy RESTDeleteStrategy, ctx api.Context, obj runtime.Obje if strategy == nil { return false, false, nil } - _, _, kerr := objectMetaAndKind(strategy, obj) + objectMeta, _, kerr := objectMetaAndKind(strategy, obj) if kerr != nil { return false, false, kerr } + + // if the object is already being deleted + if objectMeta.DeletionTimestamp != nil { + // if we are already being deleted, we may only shorten the deletion grace period + // this means the object was gracefully deleted previously but deletionGracePeriodSeconds was not set, + // so we force deletion immediately + if objectMeta.DeletionGracePeriodSeconds == nil { + return false, false, nil + } + // only a shorter grace period may be provided by a user + if options.GracePeriodSeconds != nil { + period := int64(*options.GracePeriodSeconds) + if period > *objectMeta.DeletionGracePeriodSeconds { + return false, true, nil + } + now := util.NewTime(util.Now().Add(time.Second * time.Duration(*options.GracePeriodSeconds))) + objectMeta.DeletionTimestamp = &now + objectMeta.DeletionGracePeriodSeconds = &period + options.GracePeriodSeconds = &period + return true, false, nil + } + // graceful deletion is pending, do nothing + options.GracePeriodSeconds = objectMeta.DeletionGracePeriodSeconds + return false, true, nil + } + if !strategy.CheckGracefulDelete(obj, options) { return false, false, nil } + now := util.NewTime(util.Now().Add(time.Second * time.Duration(*options.GracePeriodSeconds))) + objectMeta.DeletionTimestamp = &now + objectMeta.DeletionGracePeriodSeconds = options.GracePeriodSeconds return true, false, nil } diff --git a/pkg/api/rest/resttest/resttest.go b/pkg/api/rest/resttest/resttest.go index f8c1716ef6..694782c073 100644 --- a/pkg/api/rest/resttest/resttest.go +++ b/pkg/api/rest/resttest/resttest.go @@ -114,17 +114,20 @@ func (t *Tester) TestUpdate(valid runtime.Object, existing, older runtime.Object // Test deleting an object. func (t *Tester) TestDelete(createFn func() runtime.Object, wasGracefulFn func() bool, invalid ...runtime.Object) { - t.testDeleteNonExist(createFn) - t.testDeleteNoGraceful(createFn, wasGracefulFn) - t.testDeleteInvokesValidation(invalid...) + t.TestDeleteNonExist(createFn) + t.TestDeleteNoGraceful(createFn, wasGracefulFn) + t.TestDeleteInvokesValidation(invalid...) // TODO: Test delete namespace mismatch rejection // once #5684 is fixed. } // Test graceful deletion. func (t *Tester) TestDeleteGraceful(createFn func() runtime.Object, expectedGrace int64, wasGracefulFn func() bool) { - t.testDeleteGracefulHasDefault(createFn(), expectedGrace, wasGracefulFn) - t.testDeleteGracefulUsesZeroOnNil(createFn(), 0) + t.TestDeleteGracefulHasDefault(createFn(), expectedGrace, wasGracefulFn) + t.TestDeleteGracefulWithValue(createFn(), expectedGrace, wasGracefulFn) + t.TestDeleteGracefulUsesZeroOnNil(createFn(), 0) + t.TestDeleteGracefulExtend(createFn(), expectedGrace, wasGracefulFn) + t.TestDeleteGracefulImmediate(createFn(), expectedGrace, wasGracefulFn) } // Test getting object. @@ -298,7 +301,7 @@ func (t *Tester) testUpdateFailsOnVersion(older runtime.Object) { // ============================================================================= // Deletion tests. 
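The BeforeDelete changes above encode one rule: once deletionTimestamp and deletionGracePeriodSeconds are recorded, a later delete request may only shorten the grace period; a longer or absent value leaves the pending deletion untouched. A minimal self-contained sketch of that rule, not part of the patch and using hypothetical names:

package main

import "fmt"

// effectiveGracePeriod mirrors the decision above: existing is the grace period already
// recorded on the object (nil if none), requested is the period carried by the incoming
// DeleteOptions (nil means no preference). It returns the period that should remain on
// the object and whether the request changed anything.
func effectiveGracePeriod(existing, requested *int64) (*int64, bool) {
	if existing == nil {
		// first graceful delete: the requested (or defaulted) period is accepted as-is
		return requested, true
	}
	if requested == nil || *requested > *existing {
		// a pending graceful deletion cannot be extended or restarted
		return existing, false
	}
	// only an equal or shorter period is accepted
	return requested, true
}

func main() {
	thirty, five, sixty := int64(30), int64(5), int64(60)
	if p, changed := effectiveGracePeriod(&thirty, &five); changed {
		fmt.Printf("shortened to %ds\n", *p) // 5s wins over the pending 30s
	}
	if p, changed := effectiveGracePeriod(&thirty, &sixty); !changed {
		fmt.Printf("extension ignored, still %ds\n", *p) // 60s cannot extend the pending 30s
	}
}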
-func (t *Tester) testDeleteInvokesValidation(invalid ...runtime.Object) { +func (t *Tester) TestDeleteInvokesValidation(invalid ...runtime.Object) { for i, obj := range invalid { objectMeta := t.getObjectMetaOrFail(obj) ctx := t.TestContext() @@ -309,7 +312,7 @@ func (t *Tester) testDeleteInvokesValidation(invalid ...runtime.Object) { } } -func (t *Tester) testDeleteNonExist(createFn func() runtime.Object) { +func (t *Tester) TestDeleteNonExist(createFn func() runtime.Object) { existing := createFn() objectMeta := t.getObjectMetaOrFail(existing) context := t.TestContext() @@ -322,7 +325,10 @@ func (t *Tester) testDeleteNonExist(createFn func() runtime.Object) { }) } -func (t *Tester) testDeleteNoGraceful(createFn func() runtime.Object, wasGracefulFn func() bool) { +// ============================================================================= +// Graceful Deletion tests. + +func (t *Tester) TestDeleteNoGraceful(createFn func() runtime.Object, wasGracefulFn func() bool) { existing := createFn() objectMeta := t.getObjectMetaOrFail(existing) ctx := api.WithNamespace(t.TestContext(), objectMeta.Namespace) @@ -338,25 +344,142 @@ func (t *Tester) testDeleteNoGraceful(createFn func() runtime.Object, wasGracefu } } -// ============================================================================= -// Graceful Deletion tests. - -func (t *Tester) testDeleteGracefulHasDefault(existing runtime.Object, expectedGrace int64, wasGracefulFn func() bool) { +func (t *Tester) TestDeleteGracefulHasDefault(existing runtime.Object, expectedGrace int64, wasGracefulFn func() bool) { objectMeta := t.getObjectMetaOrFail(existing) ctx := api.WithNamespace(t.TestContext(), objectMeta.Namespace) _, err := t.storage.(rest.GracefulDeleter).Delete(ctx, objectMeta.Name, &api.DeleteOptions{}) if err != nil { t.Errorf("unexpected error: %v", err) } - if _, err := t.storage.(rest.Getter).Get(ctx, objectMeta.Name); err != nil { + if !wasGracefulFn() { + t.Errorf("did not gracefully delete resource") + return + } + object, err := t.storage.(rest.Getter).Get(ctx, objectMeta.Name) + if err != nil { t.Errorf("unexpected error, object should exist: %v", err) + return + } + objectMeta, err = api.ObjectMetaFor(object) + if err != nil { + t.Fatalf("object does not have ObjectMeta: %v\n%#v", err, object) + } + if objectMeta.DeletionTimestamp == nil { + t.Errorf("did not set deletion timestamp") + } + if objectMeta.DeletionGracePeriodSeconds == nil { + t.Fatalf("did not set deletion grace period seconds") + } + if *objectMeta.DeletionGracePeriodSeconds != expectedGrace { + t.Errorf("actual grace period does not match expected: %d", *objectMeta.DeletionGracePeriodSeconds) + } +} + +func (t *Tester) TestDeleteGracefulWithValue(existing runtime.Object, expectedGrace int64, wasGracefulFn func() bool) { + objectMeta, err := api.ObjectMetaFor(existing) + if err != nil { + t.Fatalf("object does not have ObjectMeta: %v\n%#v", err, existing) + } + + ctx := api.WithNamespace(t.TestContext(), objectMeta.Namespace) + _, err = t.storage.(rest.GracefulDeleter).Delete(ctx, objectMeta.Name, api.NewDeleteOptions(expectedGrace+2)) + if err != nil { + t.Errorf("unexpected error: %v", err) } if !wasGracefulFn() { t.Errorf("did not gracefully delete resource") } + object, err := t.storage.(rest.Getter).Get(ctx, objectMeta.Name) + if err != nil { + t.Errorf("unexpected error, object should exist: %v", err) + } + objectMeta, err = api.ObjectMetaFor(object) + if err != nil { + t.Fatalf("object does not have ObjectMeta: %v\n%#v", err, object) + } + if 
objectMeta.DeletionTimestamp == nil { + t.Errorf("did not set deletion timestamp") + } + if objectMeta.DeletionGracePeriodSeconds == nil { + t.Fatalf("did not set deletion grace period seconds") + } + if *objectMeta.DeletionGracePeriodSeconds != expectedGrace+2 { + t.Errorf("actual grace period does not match expected: %d", *objectMeta.DeletionGracePeriodSeconds) + } } -func (t *Tester) testDeleteGracefulUsesZeroOnNil(existing runtime.Object, expectedGrace int64) { +func (t *Tester) TestDeleteGracefulExtend(existing runtime.Object, expectedGrace int64, wasGracefulFn func() bool) { + objectMeta, err := api.ObjectMetaFor(existing) + if err != nil { + t.Fatalf("object does not have ObjectMeta: %v\n%#v", err, existing) + } + + ctx := api.WithNamespace(t.TestContext(), objectMeta.Namespace) + _, err = t.storage.(rest.GracefulDeleter).Delete(ctx, objectMeta.Name, api.NewDeleteOptions(expectedGrace)) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + if !wasGracefulFn() { + t.Errorf("did not gracefully delete resource") + } + // second delete duration is ignored + _, err = t.storage.(rest.GracefulDeleter).Delete(ctx, objectMeta.Name, api.NewDeleteOptions(expectedGrace+2)) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + object, err := t.storage.(rest.Getter).Get(ctx, objectMeta.Name) + if err != nil { + t.Errorf("unexpected error, object should exist: %v", err) + } + objectMeta, err = api.ObjectMetaFor(object) + if err != nil { + t.Fatalf("object does not have ObjectMeta: %v\n%#v", err, object) + } + if objectMeta.DeletionTimestamp == nil { + t.Errorf("did not set deletion timestamp") + } + if objectMeta.DeletionGracePeriodSeconds == nil { + t.Fatalf("did not set deletion grace period seconds") + } + if *objectMeta.DeletionGracePeriodSeconds != expectedGrace { + t.Errorf("actual grace period does not match expected: %d", *objectMeta.DeletionGracePeriodSeconds) + } +} + +func (t *Tester) TestDeleteGracefulImmediate(existing runtime.Object, expectedGrace int64, wasGracefulFn func() bool) { + objectMeta, err := api.ObjectMetaFor(existing) + if err != nil { + t.Fatalf("object does not have ObjectMeta: %v\n%#v", err, existing) + } + + ctx := api.WithNamespace(t.TestContext(), objectMeta.Namespace) + _, err = t.storage.(rest.GracefulDeleter).Delete(ctx, objectMeta.Name, api.NewDeleteOptions(expectedGrace)) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + if !wasGracefulFn() { + t.Errorf("did not gracefully delete resource") + } + // second delete is immediate, resource is deleted + out, err := t.storage.(rest.GracefulDeleter).Delete(ctx, objectMeta.Name, api.NewDeleteOptions(0)) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + _, err = t.storage.(rest.Getter).Get(ctx, objectMeta.Name) + if !errors.IsNotFound(err) { + t.Errorf("unexpected error, object should be deleted immediately: %v", err) + } + objectMeta, err = api.ObjectMetaFor(out) + if err != nil { + t.Errorf("unexpected error: %v", err) + return + } + if objectMeta.DeletionTimestamp == nil || objectMeta.DeletionGracePeriodSeconds == nil || *objectMeta.DeletionGracePeriodSeconds != 0 { + t.Errorf("unexpected deleted meta: %#v", objectMeta) + } +} + +func (t *Tester) TestDeleteGracefulUsesZeroOnNil(existing runtime.Object, expectedGrace int64) { objectMeta := t.getObjectMetaOrFail(existing) ctx := api.WithNamespace(t.TestContext(), objectMeta.Namespace) _, err := t.storage.(rest.GracefulDeleter).Delete(ctx, objectMeta.Name, nil) @@ -364,7 +487,7 @@ func (t *Tester) 
testDeleteGracefulUsesZeroOnNil(existing runtime.Object, expect t.Errorf("unexpected error: %v", err) } if _, err := t.storage.(rest.Getter).Get(ctx, objectMeta.Name); !errors.IsNotFound(err) { - t.Errorf("unexpected error, object should exist: %v", err) + t.Errorf("unexpected error, object should not exist: %v", err) } } diff --git a/pkg/api/serialization_test.go b/pkg/api/serialization_test.go index ba3b0b86bc..cc19b4d1d1 100644 --- a/pkg/api/serialization_test.go +++ b/pkg/api/serialization_test.go @@ -151,6 +151,7 @@ func TestRoundTripTypes(t *testing.T) { } func TestEncode_Ptr(t *testing.T) { + grace := int64(30) pod := &api.Pod{ ObjectMeta: api.ObjectMeta{ Labels: map[string]string{"name": "foo"}, @@ -158,6 +159,8 @@ func TestEncode_Ptr(t *testing.T) { Spec: api.PodSpec{ RestartPolicy: api.RestartPolicyAlways, DNSPolicy: api.DNSClusterFirst, + + TerminationGracePeriodSeconds: &grace, }, } obj := runtime.Object(pod) diff --git a/pkg/api/testing/fuzzer.go b/pkg/api/testing/fuzzer.go index a88202f8e4..2ea5c8fc46 100644 --- a/pkg/api/testing/fuzzer.go +++ b/pkg/api/testing/fuzzer.go @@ -89,6 +89,15 @@ func FuzzerFor(t *testing.T, version string, src rand.Source) *fuzz.Fuzzer { j.LabelSelector, _ = labels.Parse("a=b") j.FieldSelector, _ = fields.ParseSelector("a=b") }, + func(j *api.PodSpec, c fuzz.Continue) { + c.FuzzNoCustom(j) + // has a default value + ttl := int64(30) + if c.RandBool() { + ttl = int64(c.Uint32()) + } + j.TerminationGracePeriodSeconds = &ttl + }, func(j *api.PodPhase, c fuzz.Continue) { statuses := []api.PodPhase{api.PodPending, api.PodRunning, api.PodFailed, api.PodUnknown} *j = statuses[c.Rand.Intn(len(statuses))] diff --git a/pkg/api/types.go b/pkg/api/types.go index 1698cda387..028d61a467 100644 --- a/pkg/api/types.go +++ b/pkg/api/types.go @@ -143,6 +143,10 @@ type ObjectMeta struct { // will send a hard termination signal to the container. DeletionTimestamp *util.Time `json:"deletionTimestamp,omitempty"` + // DeletionGracePeriodSeconds records the graceful deletion value set when graceful deletion + // was requested. Represents the most recent grace period, and may only be shortened once set. + DeletionGracePeriodSeconds *int64 `json:"deletionGracePeriodSeconds,omitempty"` + // Labels are key value pairs that may be used to scope and select individual resources. 
// Label keys are of the form: // label-key ::= prefixed-name | name diff --git a/pkg/api/v1/conversion_generated.go b/pkg/api/v1/conversion_generated.go index c0736192e2..a1bdb14cd0 100644 --- a/pkg/api/v1/conversion_generated.go +++ b/pkg/api/v1/conversion_generated.go @@ -1176,6 +1176,12 @@ func convert_api_ObjectMeta_To_v1_ObjectMeta(in *api.ObjectMeta, out *ObjectMeta } else { out.DeletionTimestamp = nil } + if in.DeletionGracePeriodSeconds != nil { + out.DeletionGracePeriodSeconds = new(int64) + *out.DeletionGracePeriodSeconds = *in.DeletionGracePeriodSeconds + } else { + out.DeletionGracePeriodSeconds = nil + } if in.Labels != nil { out.Labels = make(map[string]string) for key, val := range in.Labels { @@ -3591,6 +3597,12 @@ func convert_v1_ObjectMeta_To_api_ObjectMeta(in *ObjectMeta, out *api.ObjectMeta } else { out.DeletionTimestamp = nil } + if in.DeletionGracePeriodSeconds != nil { + out.DeletionGracePeriodSeconds = new(int64) + *out.DeletionGracePeriodSeconds = *in.DeletionGracePeriodSeconds + } else { + out.DeletionGracePeriodSeconds = nil + } if in.Labels != nil { out.Labels = make(map[string]string) for key, val := range in.Labels { diff --git a/pkg/api/v1/deep_copy_generated.go b/pkg/api/v1/deep_copy_generated.go index 24a3acdd9c..cc285d8392 100644 --- a/pkg/api/v1/deep_copy_generated.go +++ b/pkg/api/v1/deep_copy_generated.go @@ -1010,6 +1010,12 @@ func deepCopy_v1_ObjectMeta(in ObjectMeta, out *ObjectMeta, c *conversion.Cloner } else { out.DeletionTimestamp = nil } + if in.DeletionGracePeriodSeconds != nil { + out.DeletionGracePeriodSeconds = new(int64) + *out.DeletionGracePeriodSeconds = *in.DeletionGracePeriodSeconds + } else { + out.DeletionGracePeriodSeconds = nil + } if in.Labels != nil { out.Labels = make(map[string]string) for key, val := range in.Labels { diff --git a/pkg/api/v1/defaults.go b/pkg/api/v1/defaults.go index 903d6f42b6..eee2a647c0 100644 --- a/pkg/api/v1/defaults.go +++ b/pkg/api/v1/defaults.go @@ -113,6 +113,10 @@ func addDefaultingFuncs() { if obj.HostNetwork { defaultHostNetworkPorts(&obj.Containers) } + if obj.TerminationGracePeriodSeconds == nil { + period := int64(DefaultTerminationGracePeriodSeconds) + obj.TerminationGracePeriodSeconds = &period + } }, func(obj *Probe) { if obj.TimeoutSeconds == 0 { diff --git a/pkg/api/v1/types.go b/pkg/api/v1/types.go index 097b7b8125..ea0749f584 100644 --- a/pkg/api/v1/types.go +++ b/pkg/api/v1/types.go @@ -141,6 +141,10 @@ type ObjectMeta struct { // will send a hard termination signal to the container. DeletionTimestamp *util.Time `json:"deletionTimestamp,omitempty" description:"RFC 3339 date and time at which the object will be deleted; populated by the system when a graceful deletion is requested, read-only; if not set, graceful deletion of the object has not been requested; see http://releases.k8s.io/HEAD/docs/devel/api-conventions.md#metadata"` + // DeletionGracePeriodSeconds records the graceful deletion value set when graceful deletion + // was requested. Represents the most recent grace period, and may only be shortened once set. + DeletionGracePeriodSeconds *int64 `json:"deletionGracePeriodSeconds,omitempty" description:"number of seconds allowed for this object to gracefully terminate before it will be removed from the system; only set when deletionTimestamp is also set, read-only; may only be shortened"` + // Labels are key value pairs that may be used to scope and select individual resources. 
// TODO: replace map[string]string with labels.LabelSet type Labels map[string]string `json:"labels,omitempty" description:"map of string keys and values that can be used to organize and categorize objects; may match selectors of replication controllers and services; see http://releases.k8s.io/HEAD/docs/user-guide/labels.md"` @@ -858,6 +862,8 @@ const ( // DNSDefault indicates that the pod should use the default (as // determined by kubelet) DNS settings. DNSDefault DNSPolicy = "Default" + + DefaultTerminationGracePeriodSeconds = 30 ) // PodSpec is a description of a pod @@ -872,7 +878,7 @@ type PodSpec struct { // The grace period is the duration in seconds after the processes running in the pod are sent // a termination signal and the time when the processes are forcibly halted with a kill signal. // Set this value longer than the expected cleanup time for your process. - TerminationGracePeriodSeconds *int64 `json:"terminationGracePeriodSeconds,omitempty" description:"optional duration in seconds the pod needs to terminate gracefully; may be decreased in delete request; value must be non-negative integer; the value zero indicates delete immediately; if this value is not set, the default grace period will be used instead; the grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal; set this value longer than the expected cleanup time for your process"` + TerminationGracePeriodSeconds *int64 `json:"terminationGracePeriodSeconds,omitempty" description:"optional duration in seconds the pod needs to terminate gracefully; may be decreased in delete request; value must be non-negative integer; the value zero indicates delete immediately; if this value is not set, the default grace period will be used instead; the grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal; set this value longer than the expected cleanup time for your process; defaults to 30 seconds"` ActiveDeadlineSeconds *int64 `json:"activeDeadlineSeconds,omitempty" description:"optional duration in seconds the pod may be active on the node relative to StartTime before the system will actively try to mark it failed and kill associated containers; value must be a positive integer"` // Optional: Set DNS policy. 
Defaults to "ClusterFirst" DNSPolicy DNSPolicy `json:"dnsPolicy,omitempty" description:"DNS policy for containers within the pod; one of 'ClusterFirst' or 'Default'"` diff --git a/pkg/api/validation/validation.go b/pkg/api/validation/validation.go index 9d42e5b142..d8422a55c5 100644 --- a/pkg/api/validation/validation.go +++ b/pkg/api/validation/validation.go @@ -265,6 +265,16 @@ func ValidateObjectMetaUpdate(new, old *api.ObjectMeta) errs.ValidationErrorList } else { new.CreationTimestamp = old.CreationTimestamp } + // an object can never remove a deletion timestamp or clear/change grace period seconds + if !old.DeletionTimestamp.IsZero() { + new.DeletionTimestamp = old.DeletionTimestamp + } + if old.DeletionGracePeriodSeconds != nil && new.DeletionGracePeriodSeconds == nil { + new.DeletionGracePeriodSeconds = old.DeletionGracePeriodSeconds + } + if new.DeletionGracePeriodSeconds != nil && old.DeletionGracePeriodSeconds != nil && *new.DeletionGracePeriodSeconds != *old.DeletionGracePeriodSeconds { + allErrs = append(allErrs, errs.NewFieldInvalid("deletionGracePeriodSeconds", new.DeletionGracePeriodSeconds, "field is immutable; may only be changed via deletion")) + } // Reject updates that don't specify a resource version if new.ResourceVersion == "" { diff --git a/pkg/api/validation/validation_test.go b/pkg/api/validation/validation_test.go index b20de4cd9d..1c8c1e737c 100644 --- a/pkg/api/validation/validation_test.go +++ b/pkg/api/validation/validation_test.go @@ -1317,6 +1317,9 @@ func TestValidatePod(t *testing.T) { } func TestValidatePodUpdate(t *testing.T) { + now := util.Now() + grace := int64(30) + grace2 := int64(31) tests := []struct { a api.Pod b api.Pod @@ -1403,6 +1406,30 @@ func TestValidatePodUpdate(t *testing.T) { false, "more containers", }, + { + api.Pod{ + ObjectMeta: api.ObjectMeta{Name: "foo", DeletionTimestamp: &now}, + Spec: api.PodSpec{Containers: []api.Container{{Image: "foo:V1"}}}, + }, + api.Pod{ + ObjectMeta: api.ObjectMeta{Name: "foo"}, + Spec: api.PodSpec{Containers: []api.Container{{Image: "foo:V1"}}}, + }, + true, + "deletion timestamp filled out", + }, + { + api.Pod{ + ObjectMeta: api.ObjectMeta{Name: "foo", DeletionTimestamp: &now, DeletionGracePeriodSeconds: &grace}, + Spec: api.PodSpec{Containers: []api.Container{{Image: "foo:V1"}}}, + }, + api.Pod{ + ObjectMeta: api.ObjectMeta{Name: "foo", DeletionTimestamp: &now, DeletionGracePeriodSeconds: &grace2}, + Spec: api.PodSpec{Containers: []api.Container{{Image: "foo:V1"}}}, + }, + false, + "deletion grace period seconds cleared", + }, { api.Pod{ ObjectMeta: api.ObjectMeta{Name: "foo"}, diff --git a/pkg/controller/controller_utils.go b/pkg/controller/controller_utils.go index 2438c8148a..77419e155e 100644 --- a/pkg/controller/controller_utils.go +++ b/pkg/controller/controller_utils.go @@ -322,7 +322,8 @@ func FilterActivePods(pods []api.Pod) []*api.Pod { var result []*api.Pod for i := range pods { if api.PodSucceeded != pods[i].Status.Phase && - api.PodFailed != pods[i].Status.Phase { + api.PodFailed != pods[i].Status.Phase && + pods[i].DeletionTimestamp == nil { result = append(result, &pods[i]) } } diff --git a/pkg/controller/endpoint/endpoints_controller.go b/pkg/controller/endpoint/endpoints_controller.go index d4962cf1df..88345f9dea 100644 --- a/pkg/controller/endpoint/endpoints_controller.go +++ b/pkg/controller/endpoint/endpoints_controller.go @@ -310,7 +310,11 @@ func (e *EndpointController) syncService(key string) { continue } if len(pod.Status.PodIP) == 0 { - glog.V(4).Infof("Failed to find an IP 
for pod %s/%s", pod.Namespace, pod.Name) + glog.V(5).Infof("Failed to find an IP for pod %s/%s", pod.Namespace, pod.Name) + continue + } + if pod.DeletionTimestamp != nil { + glog.V(5).Infof("Pod is being deleted %s/%s", pod.Namespace, pod.Name) continue } diff --git a/pkg/controller/node/nodecontroller.go b/pkg/controller/node/nodecontroller.go index 22fa7f89c4..aa0f3e62ce 100644 --- a/pkg/controller/node/nodecontroller.go +++ b/pkg/controller/node/nodecontroller.go @@ -89,8 +89,11 @@ type NodeController struct { nodeStatusMap map[string]nodeStatusData now func() util.Time // worker that evicts pods from unresponsive nodes. - podEvictor *PodEvictor + podEvictor *RateLimitedTimedQueue + terminationEvictor *RateLimitedTimedQueue podEvictionTimeout time.Duration + // The maximum duration before a pod evicted from a node can be forcefully terminated. + maximumGracePeriod time.Duration recorder record.EventRecorder } @@ -99,7 +102,7 @@ func NewNodeController( cloud cloudprovider.Interface, kubeClient client.Interface, podEvictionTimeout time.Duration, - podEvictor *PodEvictor, + podEvictionLimiter util.RateLimiter, nodeMonitorGracePeriod time.Duration, nodeStartupGracePeriod time.Duration, nodeMonitorPeriod time.Duration, @@ -123,7 +126,9 @@ func NewNodeController( kubeClient: kubeClient, recorder: recorder, podEvictionTimeout: podEvictionTimeout, - podEvictor: podEvictor, + maximumGracePeriod: 5 * time.Minute, + podEvictor: NewRateLimitedTimedQueue(podEvictionLimiter, false), + terminationEvictor: NewRateLimitedTimedQueue(podEvictionLimiter, false), nodeStatusMap: make(map[string]nodeStatusData), nodeMonitorGracePeriod: nodeMonitorGracePeriod, nodeMonitorPeriod: nodeMonitorPeriod, @@ -145,38 +150,36 @@ func (nc *NodeController) Run(period time.Duration) { }, nc.nodeMonitorPeriod) go util.Forever(func() { - nc.podEvictor.TryEvict(func(nodeName string) { nc.deletePods(nodeName) }) + nc.podEvictor.Try(func(value TimedValue) (bool, time.Duration) { + remaining, err := nc.deletePods(value.Value) + if err != nil { + util.HandleError(fmt.Errorf("unable to evict node %q: %v", value.Value, err)) + return false, 0 + } + if remaining { + glog.V(2).Infof("Pods terminating on %q", value.Value) + nc.terminationEvictor.Add(value.Value) + } + return true, 0 + }) }, nodeEvictionPeriod) -} -// We observed a Node deletion in etcd. Currently we only need to remove Pods that -// were assigned to it. -func (nc *NodeController) deleteNode(nodeID string) error { - return nc.deletePods(nodeID) -} - -// deletePods will delete all pods from master running on given node. -func (nc *NodeController) deletePods(nodeID string) error { - glog.V(2).Infof("Delete all pods from %v", nodeID) - pods, err := nc.kubeClient.Pods(api.NamespaceAll).List(labels.Everything(), - fields.OneTermEqualSelector(client.PodHost, nodeID)) - if err != nil { - return err - } - nc.recordNodeEvent(nodeID, "DeletingAllPods", fmt.Sprintf("Deleting all Pods from Node %v.", nodeID)) - for _, pod := range pods.Items { - // Defensive check, also needed for tests. 
- if pod.Spec.NodeName != nodeID { - continue - } - glog.V(2).Infof("Delete pod %v", pod.Name) - nc.recorder.Eventf(&pod, "NodeControllerEviction", "Deleting Pod %s from Node %s", pod.Name, nodeID) - if err := nc.kubeClient.Pods(pod.Namespace).Delete(pod.Name, nil); err != nil { - glog.Errorf("Error deleting pod %v: %v", pod.Name, err) - } - } - - return nil + // TODO: replace with a controller that ensures pods that are terminating complete + // in a particular time period + go util.Forever(func() { + nc.terminationEvictor.Try(func(value TimedValue) (bool, time.Duration) { + remaining, err := nc.terminatePods(value.Value, value.Added) + if err != nil { + util.HandleError(fmt.Errorf("unable to terminate pods on node %q: %v", value.Value, err)) + return false, 0 + } + if remaining != 0 { + glog.V(2).Infof("Pods still terminating on %q, estimated completion %s", value.Value, remaining) + return false, remaining + } + return true, 0 + }) + }, nodeEvictionPeriod) } // Generates num pod CIDRs that could be assigned to nodes. @@ -271,18 +274,18 @@ func (nc *NodeController) monitorNodeStatus() error { // Check eviction timeout against decisionTimestamp if lastReadyCondition.Status == api.ConditionFalse && decisionTimestamp.After(nc.nodeStatusMap[node.Name].readyTransitionTimestamp.Add(nc.podEvictionTimeout)) { - if nc.podEvictor.AddNodeToEvict(node.Name) { + if nc.podEvictor.Add(node.Name) { glog.Infof("Adding pods to evict: %v is later than %v + %v", decisionTimestamp, nc.nodeStatusMap[node.Name].readyTransitionTimestamp, nc.podEvictionTimeout) } } if lastReadyCondition.Status == api.ConditionUnknown && decisionTimestamp.After(nc.nodeStatusMap[node.Name].probeTimestamp.Add(nc.podEvictionTimeout-gracePeriod)) { - if nc.podEvictor.AddNodeToEvict(node.Name) { + if nc.podEvictor.Add(node.Name) { glog.Infof("Adding pods to evict2: %v is later than %v + %v", decisionTimestamp, nc.nodeStatusMap[node.Name].readyTransitionTimestamp, nc.podEvictionTimeout-gracePeriod) } } if lastReadyCondition.Status == api.ConditionTrue { - if nc.podEvictor.RemoveNodeToEvict(node.Name) { + if nc.podEvictor.Remove(node.Name) { glog.Infof("Pods on %v won't be evicted", node.Name) } } @@ -302,8 +305,8 @@ func (nc *NodeController) monitorNodeStatus() error { } if _, err := instances.ExternalID(node.Name); err != nil && err == cloudprovider.InstanceNotFound { glog.Infof("Deleting node (no longer present in cloud provider): %s", node.Name) - nc.recordNodeEvent(node.Name, "DeleteingNode", fmt.Sprintf("Deleting Node %v because it's not present according to cloud provider", node.Name)) - if err := nc.deletePods(node.Name); err != nil { + nc.recordNodeEvent(node.Name, "DeletingNode", fmt.Sprintf("Deleting Node %v because it's not present according to cloud provider", node.Name)) + if _, err := nc.deletePods(node.Name); err != nil { glog.Errorf("Unable to delete pods from node %s: %v", node.Name, err) continue } @@ -311,6 +314,7 @@ func (nc *NodeController) monitorNodeStatus() error { glog.Errorf("Unable to delete node %s: %v", node.Name, err) continue } + nc.podEvictor.Add(node.Name) } } } @@ -502,3 +506,88 @@ func (nc *NodeController) tryUpdateNodeStatus(node *api.Node) (time.Duration, ap return gracePeriod, lastReadyCondition, readyCondition, err } + +// We observed a Node deletion in etcd. Currently we only need to remove Pods that +// were assigned to it. 
+func (nc *NodeController) deleteNode(nodeID string) error { + nc.podEvictor.Add(nodeID) + return nil +} + +// deletePods will delete all pods from master running on given node, and return true +// if any pods were deleted. +func (nc *NodeController) deletePods(nodeID string) (bool, error) { + remaining := false + glog.V(2).Infof("Delete all pods from %s", nodeID) + pods, err := nc.kubeClient.Pods(api.NamespaceAll).List(labels.Everything(), + fields.OneTermEqualSelector(client.PodHost, nodeID)) + if err != nil { + return remaining, err + } + + nc.recordNodeEvent(nodeID, "DeletingAllPods", fmt.Sprintf("Deleting all Pods from Node %v.", nodeID)) + + for _, pod := range pods.Items { + // Defensive check, also needed for tests. + if pod.Spec.NodeName != nodeID { + continue + } + // if the pod has already been deleted, ignore it + if pod.DeletionGracePeriodSeconds != nil { + continue + } + + glog.V(2).Infof("Delete pod %v", pod.Name) + nc.recorder.Eventf(&pod, "NodeControllerEviction", "Deleting Pod %s from Node %s", pod.Name, nodeID) + if err := nc.kubeClient.Pods(pod.Namespace).Delete(pod.Name, nil); err != nil { + return false, err + } + remaining = true + } + return remaining, nil +} + +// terminatePods will ensure all pods on the given node that are in terminating state are eventually +// cleaned up +func (nc *NodeController) terminatePods(nodeID string, since time.Time) (time.Duration, error) { + remaining := time.Duration(0) + glog.V(2).Infof("Terminating all pods on %s", nodeID) + pods, err := nc.kubeClient.Pods(api.NamespaceAll).List(labels.Everything(), + fields.OneTermEqualSelector(client.PodHost, nodeID)) + if err != nil { + return remaining, err + } + + nc.recordNodeEvent(nodeID, "TerminatingAllPods", fmt.Sprintf("Terminating all Pods on Node %s.", nodeID)) + + now := time.Now() + elapsed := now.Sub(since) + for _, pod := range pods.Items { + // Defensive check, also needed for tests. 
+ if pod.Spec.NodeName != nodeID { + continue + } + // only clean terminated pods + if pod.DeletionGracePeriodSeconds == nil { + continue + } + + grace := time.Duration(*pod.DeletionGracePeriodSeconds) * time.Second + if grace > nc.maximumGracePeriod { + grace = nc.maximumGracePeriod + } + next := grace - elapsed + if next < 0 { + glog.V(2).Infof("Removing pod %v after %s grace period", pod.Name, grace) + nc.recordNodeEvent(nodeID, "TerminatingEvictedPod", fmt.Sprintf("Pod %s has exceeded the grace period for deletion after being evicted from Node %q and is being force killed", pod.Name, nodeID)) + if err := nc.kubeClient.Pods(pod.Namespace).Delete(pod.Name, api.NewDeleteOptions(0)); err != nil { + glog.Errorf("Error completing deletion of pod %s: %v", pod.Name, err) + next = 1 + } + } + if remaining < next { + remaining = next + } + } + return remaining, nil +} diff --git a/pkg/controller/node/nodecontroller_test.go b/pkg/controller/node/nodecontroller_test.go index af73e2fc03..6239673967 100644 --- a/pkg/controller/node/nodecontroller_test.go +++ b/pkg/controller/node/nodecontroller_test.go @@ -324,9 +324,8 @@ func TestMonitorNodeStatusEvictPods(t *testing.T) { } for _, item := range table { - podEvictor := NewPodEvictor(util.NewFakeRateLimiter()) nodeController := NewNodeController(nil, item.fakeNodeHandler, - evictionTimeout, podEvictor, testNodeMonitorGracePeriod, + evictionTimeout, util.NewFakeRateLimiter(), testNodeMonitorGracePeriod, testNodeStartupGracePeriod, testNodeMonitorPeriod, nil, false) nodeController.now = func() util.Time { return fakeNow } if err := nodeController.monitorNodeStatus(); err != nil { @@ -340,7 +339,17 @@ func TestMonitorNodeStatusEvictPods(t *testing.T) { t.Errorf("unexpected error: %v", err) } - podEvictor.TryEvict(func(nodeName string) { nodeController.deletePods(nodeName) }) + nodeController.podEvictor.Try(func(value TimedValue) (bool, time.Duration) { + remaining, _ := nodeController.deletePods(value.Value) + if remaining { + nodeController.terminationEvictor.Add(value.Value) + } + return true, 0 + }) + nodeController.podEvictor.Try(func(value TimedValue) (bool, time.Duration) { + nodeController.terminatePods(value.Value, value.Added) + return true, 0 + }) podEvicted := false for _, action := range item.fakeNodeHandler.Actions() { if action.GetVerb() == "delete" && action.GetResource() == "pods" { @@ -531,7 +540,7 @@ func TestMonitorNodeStatusUpdateStatus(t *testing.T) { } for _, item := range table { - nodeController := NewNodeController(nil, item.fakeNodeHandler, 5*time.Minute, NewPodEvictor(util.NewFakeRateLimiter()), + nodeController := NewNodeController(nil, item.fakeNodeHandler, 5*time.Minute, util.NewFakeRateLimiter(), testNodeMonitorGracePeriod, testNodeStartupGracePeriod, testNodeMonitorPeriod, nil, false) nodeController.now = func() util.Time { return fakeNow } if err := nodeController.monitorNodeStatus(); err != nil { @@ -610,7 +619,7 @@ func TestNodeDeletion(t *testing.T) { Fake: testclient.NewSimpleFake(&api.PodList{Items: []api.Pod{*newPod("pod0", "node0"), *newPod("pod1", "node1")}}), } - nodeController := NewNodeController(nil, fakeNodeHandler, 5*time.Minute, NewPodEvictor(util.NewFakeRateLimiter()), + nodeController := NewNodeController(nil, fakeNodeHandler, 5*time.Minute, util.NewFakeRateLimiter(), testNodeMonitorGracePeriod, testNodeStartupGracePeriod, testNodeMonitorPeriod, nil, false) nodeController.now = func() util.Time { return fakeNow } if err := nodeController.monitorNodeStatus(); err != nil { @@ -620,6 +629,10 @@ func 
TestNodeDeletion(t *testing.T) { if err := nodeController.monitorNodeStatus(); err != nil { t.Errorf("unexpected error: %v", err) } + nodeController.podEvictor.Try(func(value TimedValue) (bool, time.Duration) { + nodeController.deletePods(value.Value) + return true, 0 + }) podEvicted := false for _, action := range fakeNodeHandler.Actions() { if action.GetVerb() == "delete" && action.GetResource() == "pods" { diff --git a/pkg/controller/node/podevictor.go b/pkg/controller/node/podevictor.go deleted file mode 100644 index 1c66d0a211..0000000000 --- a/pkg/controller/node/podevictor.go +++ /dev/null @@ -1,129 +0,0 @@ -/* -Copyright 2015 The Kubernetes Authors All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package nodecontroller - -import ( - "sync" - - "k8s.io/kubernetes/pkg/util" - - "github.com/golang/glog" -) - -// A FIFO queue which additionally guarantees that any element can be added only once until -// it is removed. -type UniqueQueue struct { - lock sync.Mutex - queue []string - set util.StringSet -} - -// Entity responsible for evicting Pods from inserted Nodes. It uses RateLimiter to avoid -// evicting everything at once. Note that we rate limit eviction of Nodes not individual Pods. -type PodEvictor struct { - queue UniqueQueue - deletingPodsRateLimiter util.RateLimiter -} - -// Adds a new value to the queue if it wasn't added before, or was explicitly removed by the -// Remove call. Returns true if new value was added. -func (q *UniqueQueue) Add(value string) bool { - q.lock.Lock() - defer q.lock.Unlock() - - if !q.set.Has(value) { - q.queue = append(q.queue, value) - q.set.Insert(value) - return true - } else { - return false - } -} - -// Removes the value from the queue, so Get() call won't return it, and allow subsequent addition -// of the given value. If the value is not present does nothing and returns false. -func (q *UniqueQueue) Remove(value string) bool { - q.lock.Lock() - defer q.lock.Unlock() - - q.set.Delete(value) - for i, val := range q.queue { - if val == value { - if i > 0 && i < len(q.queue)-1 { - q.queue = append(q.queue[0:i], q.queue[i+1:len(q.queue)]...) - } else if i > 0 { - q.queue = q.queue[0 : len(q.queue)-1] - } else { - q.queue = q.queue[1:len(q.queue)] - } - return true - } - } - return false -} - -// Returns the oldest added value that wasn't returned yet. -func (q *UniqueQueue) Get() (string, bool) { - q.lock.Lock() - defer q.lock.Unlock() - if len(q.queue) == 0 { - return "", false - } - - result := q.queue[0] - q.queue = q.queue[1:len(q.queue)] - return result, true -} - -// Creates new PodEvictor which will use given RateLimiter to oversee eviction. -func NewPodEvictor(deletingPodsRateLimiter util.RateLimiter) *PodEvictor { - return &PodEvictor{ - queue: UniqueQueue{ - queue: make([]string, 0), - set: util.NewStringSet(), - }, - deletingPodsRateLimiter: deletingPodsRateLimiter, - } -} - -// Tries to evict all Pods from previously inserted Nodes. Ends prematurely if RateLimiter forbids any eviction. 
-// Each Node is processed only once, as long as it's not Removed, i.e. calling multiple AddNodeToEvict does not result -// with multiple evictions as long as RemoveNodeToEvict is not called. -func (pe *PodEvictor) TryEvict(delFunc func(string)) { - val, ok := pe.queue.Get() - for ok { - if pe.deletingPodsRateLimiter.CanAccept() { - glog.Infof("PodEvictor is evicting Pods on Node: %v", val) - delFunc(val) - } else { - glog.V(1).Info("PodEvictor is rate limitted.") - break - } - val, ok = pe.queue.Get() - } -} - -// Adds Node to the Evictor to be processed later. Won't add the same Node second time if it was already -// added and not removed. -func (pe *PodEvictor) AddNodeToEvict(nodeName string) bool { - return pe.queue.Add(nodeName) -} - -// Removes Node from the Evictor. The Node won't be processed until added again. -func (pe *PodEvictor) RemoveNodeToEvict(nodeName string) bool { - return pe.queue.Remove(nodeName) -} diff --git a/pkg/controller/node/rate_limited_queue.go b/pkg/controller/node/rate_limited_queue.go new file mode 100644 index 0000000000..a30d62a1a8 --- /dev/null +++ b/pkg/controller/node/rate_limited_queue.go @@ -0,0 +1,183 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package nodecontroller + +import ( + "container/heap" + "sync" + "time" + + "k8s.io/kubernetes/pkg/util" +) + +// TimedValue is a value that should be processed at a designated time. +type TimedValue struct { + Value string + Added time.Time + Next time.Time +} + +// now is used to test time +var now func() time.Time = time.Now + +// TimedQueue is a priority heap where the lowest Next is at the front of the queue +type TimedQueue []*TimedValue + +func (h TimedQueue) Len() int { return len(h) } +func (h TimedQueue) Less(i, j int) bool { return h[i].Next.Before(h[j].Next) } +func (h TimedQueue) Swap(i, j int) { h[i], h[j] = h[j], h[i] } + +func (h *TimedQueue) Push(x interface{}) { + *h = append(*h, x.(*TimedValue)) +} + +func (h *TimedQueue) Pop() interface{} { + old := *h + n := len(old) + x := old[n-1] + *h = old[0 : n-1] + return x +} + +// A FIFO queue which additionally guarantees that any element can be added only once until +// it is removed. +type UniqueQueue struct { + lock sync.Mutex + queue TimedQueue + set util.StringSet +} + +// Adds a new value to the queue if it wasn't added before, or was explicitly removed by the +// Remove call. Returns true if new value was added. +func (q *UniqueQueue) Add(value TimedValue) bool { + q.lock.Lock() + defer q.lock.Unlock() + + if q.set.Has(value.Value) { + return false + } + heap.Push(&q.queue, &value) + q.set.Insert(value.Value) + return true +} + +// Removes the value from the queue, so Get() call won't return it, and allow subsequent addition +// of the given value. If the value is not present does nothing and returns false. 
+func (q *UniqueQueue) Remove(value string) bool { + q.lock.Lock() + defer q.lock.Unlock() + + q.set.Delete(value) + for i, val := range q.queue { + if val.Value == value { + if i > 0 && i < len(q.queue)-1 { + q.queue = append(q.queue[0:i], q.queue[i+1:len(q.queue)]...) + } else if i > 0 { + q.queue = q.queue[0 : len(q.queue)-1] + } else { + q.queue = q.queue[1:len(q.queue)] + } + return true + } + } + return false +} + +// Returns the oldest added value that wasn't returned yet. +func (q *UniqueQueue) Get() (TimedValue, bool) { + q.lock.Lock() + defer q.lock.Unlock() + if len(q.queue) == 0 { + return TimedValue{}, false + } + result := q.queue.Pop().(*TimedValue) + q.set.Delete(result.Value) + return *result, true +} + +// RateLimitedTimedQueue is a unique item priority queue ordered by the expected next time +// of execution. It is also rate limited. +type RateLimitedTimedQueue struct { + queue UniqueQueue + limiter util.RateLimiter + leak bool +} + +// Creates new queue which will use given RateLimiter to oversee execution. If leak is true, +// items which are rate limited will be leaked. Otherwise, rate limited items will be requeued. +func NewRateLimitedTimedQueue(limiter util.RateLimiter, leak bool) *RateLimitedTimedQueue { + return &RateLimitedTimedQueue{ + queue: UniqueQueue{ + queue: TimedQueue{}, + set: util.NewStringSet(), + }, + limiter: limiter, + leak: leak, + } +} + +// ActionFunc takes a timed value and returns false if the item must be retried, with an optional +// time.Duration if some minimum wait interval should be used. +type ActionFunc func(TimedValue) (bool, time.Duration) + +// Try processes the queue. Ends prematurely if RateLimiter forbids an action and leak is true. +// Otherwise, requeues the item to be processed. Each value is processed once if fn returns true, +// otherwise it is added back to the queue. The returned remaining is used to identify the minimum +// time to execute the next item in the queue. +func (q *RateLimitedTimedQueue) Try(fn ActionFunc) { + val, ok := q.queue.Get() + for ok { + // rate limit the queue checking + if q.leak { + if !q.limiter.CanAccept() { + break + } + } else { + q.limiter.Accept() + } + + now := now() + if now.Before(val.Next) { + q.queue.Add(val) + val, ok = q.queue.Get() + // we do not sleep here because other values may be added at the front of the queue + continue + } + + if ok, wait := fn(val); !ok { + val.Next = now.Add(wait + 1) + q.queue.Add(val) + } + val, ok = q.queue.Get() + } +} + +// Adds value to the queue to be processed. Won't add the same value a second time if it was already +// added and not removed. +func (q *RateLimitedTimedQueue) Add(value string) bool { + now := now() + return q.queue.Add(TimedValue{ + Value: value, + Added: now, + Next: now, + }) +} + +// Removes the value from the queue. The value won't be processed until it is added again. +func (q *RateLimitedTimedQueue) Remove(value string) bool { + return q.queue.Remove(value) +} diff --git a/pkg/controller/node/podevictor_test.go b/pkg/controller/node/rate_limited_queue_test.go similarity index 67% rename from pkg/controller/node/podevictor_test.go rename to pkg/controller/node/rate_limited_queue_test.go index dd9532efff..d57baf9d24 100644 --- a/pkg/controller/node/podevictor_test.go +++ b/pkg/controller/node/rate_limited_queue_test.go @@ -17,14 +17,16 @@ limitations under the License.
package nodecontroller import ( + "reflect" "testing" + "time" "k8s.io/kubernetes/pkg/util" ) -func CheckQueueEq(lhs, rhs []string) bool { +func CheckQueueEq(lhs []string, rhs TimedQueue) bool { for i := 0; i < len(lhs); i++ { - if rhs[i] != lhs[i] { + if rhs[i].Value != lhs[i] { return false } } @@ -36,10 +38,10 @@ func CheckSetEq(lhs, rhs util.StringSet) bool { } func TestAddNode(t *testing.T) { - evictor := NewPodEvictor(util.NewFakeRateLimiter()) - evictor.AddNodeToEvict("first") - evictor.AddNodeToEvict("second") - evictor.AddNodeToEvict("third") + evictor := NewRateLimitedTimedQueue(util.NewFakeRateLimiter(), true) + evictor.Add("first") + evictor.Add("second") + evictor.Add("third") queuePattern := []string{"first", "second", "third"} if len(evictor.queue.queue) != len(queuePattern) { @@ -59,11 +61,11 @@ func TestAddNode(t *testing.T) { } func TestDelNode(t *testing.T) { - evictor := NewPodEvictor(util.NewFakeRateLimiter()) - evictor.AddNodeToEvict("first") - evictor.AddNodeToEvict("second") - evictor.AddNodeToEvict("third") - evictor.RemoveNodeToEvict("first") + evictor := NewRateLimitedTimedQueue(util.NewFakeRateLimiter(), true) + evictor.Add("first") + evictor.Add("second") + evictor.Add("third") + evictor.Remove("first") queuePattern := []string{"second", "third"} if len(evictor.queue.queue) != len(queuePattern) { @@ -81,11 +83,11 @@ func TestDelNode(t *testing.T) { t.Errorf("Invalid map. Got %v, expected %v", evictor.queue.set, setPattern) } - evictor = NewPodEvictor(util.NewFakeRateLimiter()) - evictor.AddNodeToEvict("first") - evictor.AddNodeToEvict("second") - evictor.AddNodeToEvict("third") - evictor.RemoveNodeToEvict("second") + evictor = NewRateLimitedTimedQueue(util.NewFakeRateLimiter(), true) + evictor.Add("first") + evictor.Add("second") + evictor.Add("third") + evictor.Remove("second") queuePattern = []string{"first", "third"} if len(evictor.queue.queue) != len(queuePattern) { @@ -103,11 +105,11 @@ func TestDelNode(t *testing.T) { t.Errorf("Invalid map. Got %v, expected %v", evictor.queue.set, setPattern) } - evictor = NewPodEvictor(util.NewFakeRateLimiter()) - evictor.AddNodeToEvict("first") - evictor.AddNodeToEvict("second") - evictor.AddNodeToEvict("third") - evictor.RemoveNodeToEvict("third") + evictor = NewRateLimitedTimedQueue(util.NewFakeRateLimiter(), true) + evictor.Add("first") + evictor.Add("second") + evictor.Add("third") + evictor.Remove("third") queuePattern = []string{"first", "second"} if len(evictor.queue.queue) != len(queuePattern) { @@ -126,15 +128,18 @@ func TestDelNode(t *testing.T) { } } -func TestEvictNode(t *testing.T) { - evictor := NewPodEvictor(util.NewFakeRateLimiter()) - evictor.AddNodeToEvict("first") - evictor.AddNodeToEvict("second") - evictor.AddNodeToEvict("third") - evictor.RemoveNodeToEvict("second") +func TestTry(t *testing.T) { + evictor := NewRateLimitedTimedQueue(util.NewFakeRateLimiter(), true) + evictor.Add("first") + evictor.Add("second") + evictor.Add("third") + evictor.Remove("second") deletedMap := util.NewStringSet() - evictor.TryEvict(func(nodeName string) { deletedMap.Insert(nodeName) }) + evictor.Try(func(value TimedValue) (bool, time.Duration) { + deletedMap.Insert(value.Value) + return true, 0 + }) setPattern := util.NewStringSet("first", "third") if len(deletedMap) != len(setPattern) { @@ -144,3 +149,35 @@ func TestEvictNode(t *testing.T) { t.Errorf("Invalid map. 
Got %v, expected %v", deletedMap, setPattern) } } + +func TestTryOrdering(t *testing.T) { + evictor := NewRateLimitedTimedQueue(util.NewFakeRateLimiter(), false) + evictor.Add("first") + evictor.Add("second") + evictor.Add("third") + + order := []string{} + count := 0 + queued := false + evictor.Try(func(value TimedValue) (bool, time.Duration) { + count++ + if value.Added.IsZero() { + t.Fatalf("added should not be zero") + } + if value.Next.IsZero() { + t.Fatalf("next should not be zero") + } + if !queued && value.Value == "second" { + queued = true + return false, time.Millisecond + } + order = append(order, value.Value) + return true, 0 + }) + if !reflect.DeepEqual(order, []string{"first", "third", "second"}) { + t.Fatalf("order was wrong: %v", order) + } + if count != 4 { + t.Fatalf("unexpected iterations: %d", count) + } +} diff --git a/pkg/controller/replication/replication_controller.go b/pkg/controller/replication/replication_controller.go index b212647c45..726a8f735a 100644 --- a/pkg/controller/replication/replication_controller.go +++ b/pkg/controller/replication/replication_controller.go @@ -213,6 +213,12 @@ func (rm *ReplicationManager) getPodController(pod *api.Pod) *api.ReplicationCon // When a pod is created, enqueue the controller that manages it and update it's expectations. func (rm *ReplicationManager) addPod(obj interface{}) { pod := obj.(*api.Pod) + if pod.DeletionTimestamp != nil { + // on a restart of the controller manager, it's possible a new pod shows up in a state that + // is already pending deletion. Prevent the pod from being a creation observation. + rm.deletePod(pod) + return + } if rc := rm.getPodController(pod); rc != nil { rcKey, err := controller.KeyFunc(rc) if err != nil { @@ -234,6 +240,15 @@ func (rm *ReplicationManager) updatePod(old, cur interface{}) { } // TODO: Write a unittest for this case curPod := cur.(*api.Pod) + if curPod.DeletionTimestamp != nil { + // when a pod is deleted gracefully its deletion timestamp is first modified to reflect a grace period, + // and after such time has passed, the kubelet actually deletes it from the store. We receive an update + // for modification of the deletion timestamp and expect an rc to create more replicas asap, not wait + // until the kubelet actually deletes the pod. This is different from the Phase of a pod changing, because + // an rc never initiates a phase change, and so is never asleep waiting for the same.
+ rm.deletePod(curPod) + return + } if rc := rm.getPodController(curPod); rc != nil { rm.enqueueController(rc) } diff --git a/pkg/expapi/deep_copy_generated.go b/pkg/expapi/deep_copy_generated.go index 8ef48ff341..a2406f4006 100644 --- a/pkg/expapi/deep_copy_generated.go +++ b/pkg/expapi/deep_copy_generated.go @@ -53,6 +53,12 @@ func deepCopy_api_ObjectMeta(in api.ObjectMeta, out *api.ObjectMeta, c *conversi } else { out.DeletionTimestamp = nil } + if in.DeletionGracePeriodSeconds != nil { + out.DeletionGracePeriodSeconds = new(int64) + *out.DeletionGracePeriodSeconds = *in.DeletionGracePeriodSeconds + } else { + out.DeletionGracePeriodSeconds = nil + } if in.Labels != nil { out.Labels = make(map[string]string) for key, val := range in.Labels { diff --git a/pkg/expapi/v1/conversion_generated.go b/pkg/expapi/v1/conversion_generated.go index 614247b5f4..5f20d3f4fe 100644 --- a/pkg/expapi/v1/conversion_generated.go +++ b/pkg/expapi/v1/conversion_generated.go @@ -57,6 +57,12 @@ func convert_api_ObjectMeta_To_v1_ObjectMeta(in *api.ObjectMeta, out *v1.ObjectM } else { out.DeletionTimestamp = nil } + if in.DeletionGracePeriodSeconds != nil { + out.DeletionGracePeriodSeconds = new(int64) + *out.DeletionGracePeriodSeconds = *in.DeletionGracePeriodSeconds + } else { + out.DeletionGracePeriodSeconds = nil + } if in.Labels != nil { out.Labels = make(map[string]string) for key, val := range in.Labels { @@ -115,6 +121,12 @@ func convert_v1_ObjectMeta_To_api_ObjectMeta(in *v1.ObjectMeta, out *api.ObjectM } else { out.DeletionTimestamp = nil } + if in.DeletionGracePeriodSeconds != nil { + out.DeletionGracePeriodSeconds = new(int64) + *out.DeletionGracePeriodSeconds = *in.DeletionGracePeriodSeconds + } else { + out.DeletionGracePeriodSeconds = nil + } if in.Labels != nil { out.Labels = make(map[string]string) for key, val := range in.Labels { diff --git a/pkg/expapi/v1/deep_copy_generated.go b/pkg/expapi/v1/deep_copy_generated.go index 654034a541..2e15129234 100644 --- a/pkg/expapi/v1/deep_copy_generated.go +++ b/pkg/expapi/v1/deep_copy_generated.go @@ -70,6 +70,12 @@ func deepCopy_v1_ObjectMeta(in v1.ObjectMeta, out *v1.ObjectMeta, c *conversion. 
} else { out.DeletionTimestamp = nil } + if in.DeletionGracePeriodSeconds != nil { + out.DeletionGracePeriodSeconds = new(int64) + *out.DeletionGracePeriodSeconds = *in.DeletionGracePeriodSeconds + } else { + out.DeletionGracePeriodSeconds = nil + } if in.Labels != nil { out.Labels = make(map[string]string) for key, val := range in.Labels { diff --git a/pkg/kubectl/cmd/get_test.go b/pkg/kubectl/cmd/get_test.go index bb49d4c476..7d7267d5a4 100644 --- a/pkg/kubectl/cmd/get_test.go +++ b/pkg/kubectl/cmd/get_test.go @@ -38,6 +38,7 @@ import ( ) func testData() (*api.PodList, *api.ServiceList, *api.ReplicationControllerList) { + grace := int64(30) pods := &api.PodList{ ListMeta: api.ListMeta{ ResourceVersion: "15", @@ -46,15 +47,17 @@ func testData() (*api.PodList, *api.ServiceList, *api.ReplicationControllerList) { ObjectMeta: api.ObjectMeta{Name: "foo", Namespace: "test", ResourceVersion: "10"}, Spec: api.PodSpec{ - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, }, }, { ObjectMeta: api.ObjectMeta{Name: "bar", Namespace: "test", ResourceVersion: "11"}, Spec: api.PodSpec{ - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, }, }, }, @@ -536,6 +539,7 @@ func TestGetMultipleTypeObjectsWithDirectReference(t *testing.T) { } } func watchTestData() ([]api.Pod, []watch.Event) { + grace := int64(30) pods := []api.Pod{ { ObjectMeta: api.ObjectMeta{ @@ -544,8 +548,9 @@ func watchTestData() ([]api.Pod, []watch.Event) { ResourceVersion: "10", }, Spec: api.PodSpec{ - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, }, }, } @@ -559,8 +564,9 @@ func watchTestData() ([]api.Pod, []watch.Event) { ResourceVersion: "11", }, Spec: api.PodSpec{ - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, }, }, }, @@ -573,8 +579,9 @@ func watchTestData() ([]api.Pod, []watch.Event) { ResourceVersion: "12", }, Spec: api.PodSpec{ - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, }, }, }, diff --git a/pkg/kubectl/cmd/util/helpers_test.go b/pkg/kubectl/cmd/util/helpers_test.go index c870531160..0a9cd7459a 100644 --- a/pkg/kubectl/cmd/util/helpers_test.go +++ b/pkg/kubectl/cmd/util/helpers_test.go @@ -34,6 +34,7 @@ import ( ) func TestMerge(t *testing.T) { + grace := int64(30) tests := []struct { obj runtime.Object fragment string @@ -54,8 +55,9 @@ func TestMerge(t *testing.T) { Name: "foo", }, Spec: api.PodSpec{ - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, }, }, }, @@ -122,8 +124,9 @@ func TestMerge(t *testing.T) { VolumeSource: api.VolumeSource{EmptyDir: &api.EmptyDirVolumeSource{}}, }, }, - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, }, }, }, 
diff --git a/pkg/kubectl/describe.go b/pkg/kubectl/describe.go index bb1ec67d12..c3de3b8506 100644 --- a/pkg/kubectl/describe.go +++ b/pkg/kubectl/describe.go @@ -417,7 +417,12 @@ func describePod(pod *api.Pod, rcs []api.ReplicationController, events *api.Even fmt.Fprintf(out, "Image(s):\t%s\n", makeImageList(&pod.Spec)) fmt.Fprintf(out, "Node:\t%s\n", pod.Spec.NodeName+"/"+pod.Status.HostIP) fmt.Fprintf(out, "Labels:\t%s\n", formatLabels(pod.Labels)) - fmt.Fprintf(out, "Status:\t%s\n", string(pod.Status.Phase)) + if pod.DeletionTimestamp != nil { + fmt.Fprintf(out, "Status:\tTerminating (expires %s)\n", pod.DeletionTimestamp.Time.Format(time.RFC1123Z)) + fmt.Fprintf(out, "Termination Grace Period:\t%ds\n", pod.DeletionGracePeriodSeconds) + } else { + fmt.Fprintf(out, "Status:\t%s\n", string(pod.Status.Phase)) + } fmt.Fprintf(out, "Reason:\t%s\n", pod.Status.Reason) fmt.Fprintf(out, "Message:\t%s\n", pod.Status.Message) fmt.Fprintf(out, "IP:\t%s\n", pod.Status.PodIP) diff --git a/pkg/kubectl/resource/builder_test.go b/pkg/kubectl/resource/builder_test.go index ebe480c3f3..7b96f74656 100644 --- a/pkg/kubectl/resource/builder_test.go +++ b/pkg/kubectl/resource/builder_test.go @@ -83,6 +83,7 @@ func fakeClientWith(testName string, t *testing.T, data map[string]string) Clien } func testData() (*api.PodList, *api.ServiceList) { + grace := int64(30) pods := &api.PodList{ ListMeta: api.ListMeta{ ResourceVersion: "15", @@ -91,15 +92,17 @@ func testData() (*api.PodList, *api.ServiceList) { { ObjectMeta: api.ObjectMeta{Name: "foo", Namespace: "test", ResourceVersion: "10"}, Spec: api.PodSpec{ - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, }, }, { ObjectMeta: api.ObjectMeta{Name: "bar", Namespace: "test", ResourceVersion: "11"}, Spec: api.PodSpec{ - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, }, }, }, diff --git a/pkg/kubectl/resource/helper_test.go b/pkg/kubectl/resource/helper_test.go index d40da3d271..51f81a7925 100644 --- a/pkg/kubectl/resource/helper_test.go +++ b/pkg/kubectl/resource/helper_test.go @@ -128,6 +128,7 @@ func TestHelperCreate(t *testing.T) { return true } + grace := int64(30) tests := []struct { Resp *http.Response RespFunc client.HTTPClientFunc @@ -172,8 +173,9 @@ func TestHelperCreate(t *testing.T) { ExpectObject: &api.Pod{ ObjectMeta: api.ObjectMeta{Name: "foo"}, Spec: api.PodSpec{ - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, }, }, Resp: &http.Response{StatusCode: http.StatusOK, Body: objBody(&api.Status{Status: api.StatusSuccess})}, @@ -381,6 +383,7 @@ func TestHelperReplace(t *testing.T) { return true } + grace := int64(30) tests := []struct { Resp *http.Response RespFunc client.HTTPClientFunc @@ -418,8 +421,9 @@ func TestHelperReplace(t *testing.T) { ExpectObject: &api.Pod{ ObjectMeta: api.ObjectMeta{Name: "foo", ResourceVersion: "10"}, Spec: api.PodSpec{ - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, }, }, Overwrite: true, diff --git a/pkg/kubectl/resource_printer.go 
b/pkg/kubectl/resource_printer.go index 0db84a8c33..09ae64128c 100644 --- a/pkg/kubectl/resource_printer.go +++ b/pkg/kubectl/resource_printer.go @@ -410,6 +410,9 @@ func printPod(pod *api.Pod, w io.Writer, withNamespace bool, wide bool, columnLa readyContainers++ } } + if pod.DeletionTimestamp != nil { + reason = "Terminating" + } if withNamespace { if _, err := fmt.Fprintf(w, "%s\t", namespace); err != nil { diff --git a/pkg/kubectl/rolling_updater_test.go b/pkg/kubectl/rolling_updater_test.go index 827b40777e..67b50955d8 100644 --- a/pkg/kubectl/rolling_updater_test.go +++ b/pkg/kubectl/rolling_updater_test.go @@ -880,6 +880,7 @@ func TestUpdateExistingReplicationController(t *testing.T) { func TestUpdateWithRetries(t *testing.T) { codec := testapi.Codec() + grace := int64(30) rc := &api.ReplicationController{ ObjectMeta: api.ObjectMeta{Name: "rc", Labels: map[string]string{ @@ -897,8 +898,9 @@ func TestUpdateWithRetries(t *testing.T) { }, }, Spec: api.PodSpec{ - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, }, }, }, diff --git a/pkg/kubelet/config/common_test.go b/pkg/kubelet/config/common_test.go index 9ec02de37c..fcf16781ba 100644 --- a/pkg/kubelet/config/common_test.go +++ b/pkg/kubelet/config/common_test.go @@ -31,6 +31,7 @@ import ( func noDefault(*api.Pod) error { return nil } func TestDecodeSinglePod(t *testing.T) { + grace := int64(30) pod := &api.Pod{ TypeMeta: api.TypeMeta{ APIVersion: "", @@ -41,8 +42,9 @@ func TestDecodeSinglePod(t *testing.T) { Namespace: "mynamespace", }, Spec: api.PodSpec{ - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, Containers: []api.Container{{ Name: "image", Image: "test/image", @@ -91,6 +93,7 @@ func TestDecodeSinglePod(t *testing.T) { } func TestDecodePodList(t *testing.T) { + grace := int64(30) pod := &api.Pod{ TypeMeta: api.TypeMeta{ APIVersion: "", @@ -101,8 +104,9 @@ func TestDecodePodList(t *testing.T) { Namespace: "mynamespace", }, Spec: api.PodSpec{ - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, Containers: []api.Container{{ Name: "image", Image: "test/image", diff --git a/pkg/kubelet/config/config.go b/pkg/kubelet/config/config.go index cbb8ef760c..f8f16fa65e 100644 --- a/pkg/kubelet/config/config.go +++ b/pkg/kubelet/config/config.go @@ -217,9 +217,8 @@ func (s *podStorage) merge(source string, change interface{}) (adds, updates, de for _, ref := range filtered { name := kubecontainer.GetPodFullName(ref) if existing, found := pods[name]; found { - if !reflect.DeepEqual(existing.Spec, ref.Spec) { + if checkAndUpdatePod(existing, ref) { // this is an update - existing.Spec = ref.Spec updates.Pods = append(updates.Pods, existing) continue } @@ -261,9 +260,8 @@ func (s *podStorage) merge(source string, change interface{}) (adds, updates, de name := kubecontainer.GetPodFullName(ref) if existing, found := oldPods[name]; found { pods[name] = existing - if !reflect.DeepEqual(existing.Spec, ref.Spec) { + if checkAndUpdatePod(existing, ref) { // this is an update - existing.Spec = ref.Spec updates.Pods = append(updates.Pods, existing) continue } @@ -335,6 +333,23 @@ func filterInvalidPods(pods []*api.Pod, source string, 
recorder record.EventReco return } +// checkAndUpdatePod updates existing if ref makes a meaningful change and returns true, or +// returns false if there was no update. +func checkAndUpdatePod(existing, ref *api.Pod) bool { + // TODO: it would be better to update the whole object and only preserve certain things + // like the source annotation or the UID (to ensure safety) + if reflect.DeepEqual(existing.Spec, ref.Spec) && + reflect.DeepEqual(existing.DeletionTimestamp, ref.DeletionTimestamp) && + reflect.DeepEqual(existing.DeletionGracePeriodSeconds, ref.DeletionGracePeriodSeconds) { + return false + } + // this is an update + existing.Spec = ref.Spec + existing.DeletionTimestamp = ref.DeletionTimestamp + existing.DeletionGracePeriodSeconds = ref.DeletionGracePeriodSeconds + return true +} + // Sync sends a copy of the current state through the update channel. func (s *podStorage) Sync() { s.updateLock.Lock() diff --git a/pkg/kubelet/config/file_test.go b/pkg/kubelet/config/file_test.go index 440761e326..66ca28ba97 100644 --- a/pkg/kubelet/config/file_test.go +++ b/pkg/kubelet/config/file_test.go @@ -69,6 +69,7 @@ func writeTestFile(t *testing.T, dir, name string, contents string) *os.File { func TestReadPodsFromFile(t *testing.T) { hostname := "random-test-hostname" + grace := int64(30) var testCases = []struct { desc string pod runtime.Object @@ -98,9 +99,10 @@ func TestReadPodsFromFile(t *testing.T) { SelfLink: getSelfLink("test-"+hostname, "mynamespace"), }, Spec: api.PodSpec{ - NodeName: hostname, - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + NodeName: hostname, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, Containers: []api.Container{{ Name: "image", Image: "test/image", diff --git a/pkg/kubelet/config/http_test.go b/pkg/kubelet/config/http_test.go index 08fafe9288..4279609414 100644 --- a/pkg/kubelet/config/http_test.go +++ b/pkg/kubelet/config/http_test.go @@ -123,6 +123,7 @@ func TestExtractInvalidPods(t *testing.T) { func TestExtractPodsFromHTTP(t *testing.T) { hostname := "different-value" + grace := int64(30) var testCases = []struct { desc string pods runtime.Object @@ -156,9 +157,11 @@ func TestExtractPodsFromHTTP(t *testing.T) { SelfLink: getSelfLink("foo-"+hostname, "mynamespace"), }, Spec: api.PodSpec{ - NodeName: hostname, - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + NodeName: hostname, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, + Containers: []api.Container{{ Name: "1", Image: "foo", @@ -209,9 +212,11 @@ func TestExtractPodsFromHTTP(t *testing.T) { SelfLink: getSelfLink("foo-"+hostname, kubelet.NamespaceDefault), }, Spec: api.PodSpec{ - NodeName: hostname, - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + NodeName: hostname, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, + Containers: []api.Container{{ Name: "1", Image: "foo", @@ -229,9 +234,11 @@ func TestExtractPodsFromHTTP(t *testing.T) { SelfLink: getSelfLink("bar-"+hostname, kubelet.NamespaceDefault), }, Spec: api.PodSpec{ - NodeName: hostname, - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + NodeName: hostname, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, + Containers: []api.Container{{ Name: "2", Image: "bar", 
diff --git a/pkg/kubelet/container/fake_runtime.go b/pkg/kubelet/container/fake_runtime.go index 03d65e50a3..324d7ae6bf 100644 --- a/pkg/kubelet/container/fake_runtime.go +++ b/pkg/kubelet/container/fake_runtime.go @@ -163,13 +163,13 @@ func (f *FakeRuntime) SyncPod(pod *api.Pod, _ Pod, _ api.PodStatus, _ []api.Secr return f.Err } -func (f *FakeRuntime) KillPod(pod Pod) error { +func (f *FakeRuntime) KillPod(pod *api.Pod, runningPod Pod) error { f.Lock() defer f.Unlock() f.CalledFunctions = append(f.CalledFunctions, "KillPod") - f.KilledPods = append(f.KilledPods, string(pod.ID)) - for _, c := range pod.Containers { + f.KilledPods = append(f.KilledPods, string(runningPod.ID)) + for _, c := range runningPod.Containers { f.KilledContainers = append(f.KilledContainers, c.Name) } return f.Err diff --git a/pkg/kubelet/container/runtime.go b/pkg/kubelet/container/runtime.go index d50f021f6a..8d476f9b16 100644 --- a/pkg/kubelet/container/runtime.go +++ b/pkg/kubelet/container/runtime.go @@ -54,8 +54,8 @@ type Runtime interface { GetPods(all bool) ([]*Pod, error) // Syncs the running pod into the desired pod. SyncPod(pod *api.Pod, runningPod Pod, podStatus api.PodStatus, pullSecrets []api.Secret) error - // KillPod kills all the containers of a pod. - KillPod(pod Pod) error + // KillPod kills all the containers of a pod. Pod may be nil, running pod must not be. + KillPod(pod *api.Pod, runningPod Pod) error // GetPodStatus retrieves the status of the pod, including the information of // all containers in the pod. Clients of this interface assume the containers // statuses in a pod always have a deterministic ordering (eg: sorted by name). diff --git a/pkg/kubelet/dockertools/manager.go b/pkg/kubelet/dockertools/manager.go index b845191a82..8c55d779b3 100644 --- a/pkg/kubelet/dockertools/manager.go +++ b/pkg/kubelet/dockertools/manager.go @@ -56,12 +56,21 @@ import ( const ( maxReasonCacheEntries = 200 - kubernetesPodLabel = "io.kubernetes.pod.data" - kubernetesContainerLabel = "io.kubernetes.container.name" // ndots specifies the minimum number of dots that a domain name must contain for the resolver to consider it as FQDN (fully-qualified) // we want to able to consider SRV lookup names like _dns._udp.kube-dns.default.svc to be considered relative. // hence, setting ndots to be 5. ndotsDNSOption = "options ndots:5\n" + // In order to avoid unnecessary SIGKILLs, give every container a minimum grace + // period after SIGTERM. Docker will guarantee the termination, but SIGTERM is + // potentially dangerous. + // TODO: evaluate whether there are scenarios in which SIGKILL is preferable to + // SIGTERM for certain process types, which may justify setting this to 0. + minimumGracePeriodInSeconds = 2 + + kubernetesNameLabel = "io.kubernetes.pod.name" + kubernetesPodLabel = "io.kubernetes.pod.data" + kubernetesTerminationGracePeriodLabel = "io.kubernetes.pod.terminationGracePeriod" + kubernetesContainerLabel = "io.kubernetes.container.name" ) // DockerManager implements the Runtime interface. @@ -588,12 +597,19 @@ func (dm *DockerManager) runContainer( if len(containerHostname) > hostnameMaxLen { containerHostname = containerHostname[:hostnameMaxLen] } + + // Pod information is recorded on the container as labels to preserve it in the event the pod is deleted + // while the Kubelet is down and there is no information available to recover the pod. This includes + // termination information like the termination grace period and the pre stop hooks. 
+ // TODO: keep these labels up to date if the pod changes namespacedName := types.NamespacedName{Namespace: pod.Namespace, Name: pod.Name} labels := map[string]string{ - "io.kubernetes.pod.name": namespacedName.String(), + kubernetesNameLabel: namespacedName.String(), + } + if pod.Spec.TerminationGracePeriodSeconds != nil { + labels[kubernetesTerminationGracePeriodLabel] = strconv.FormatInt(*pod.Spec.TerminationGracePeriodSeconds, 10) } if container.Lifecycle != nil && container.Lifecycle.PreStop != nil { - glog.V(1).Infof("Setting preStop hook") // TODO: This is kind of hacky, we should really just encode the bits we need. data, err := latest.Codec.Encode(pod) if err != nil { @@ -1104,40 +1120,56 @@ func (dm *DockerManager) PortForward(pod *kubecontainer.Pod, port uint16, stream } // Kills all containers in the specified pod -func (dm *DockerManager) KillPod(pod kubecontainer.Pod) error { +func (dm *DockerManager) KillPod(pod *api.Pod, runningPod kubecontainer.Pod) error { // Send the kills in parallel since they may take a long time. Len + 1 since there // can be Len errors + the networkPlugin teardown error. - errs := make(chan error, len(pod.Containers)+1) + errs := make(chan error, len(runningPod.Containers)+1) wg := sync.WaitGroup{} - var networkID types.UID - for _, container := range pod.Containers { + var ( + networkContainer *kubecontainer.Container + networkSpec *api.Container + ) + for _, container := range runningPod.Containers { wg.Add(1) go func(container *kubecontainer.Container) { defer util.HandleCrash() defer wg.Done() + var containerSpec *api.Container + if pod != nil { + for i, c := range pod.Spec.Containers { + if c.Name == container.Name { + containerSpec = &pod.Spec.Containers[i] + break + } + } + } + // TODO: Handle this without signaling the pod infra container to // adapt to the generic container runtime. if container.Name == PodInfraContainerName { // Store the container runtime for later deletion. // We do this so that PreStop handlers can run in the network namespace. - networkID = container.ID + networkContainer = container + networkSpec = containerSpec return } - if err := dm.killContainer(container.ID); err != nil { - glog.Errorf("Failed to delete container: %v; Skipping pod %q", err, pod.ID) + + err := dm.killContainer(container.ID, containerSpec, pod) + if err != nil { + glog.Errorf("Failed to delete container: %v; Skipping pod %q", err, runningPod.ID) errs <- err } }(container) } wg.Wait() - if len(networkID) > 0 { - if err := dm.networkPlugin.TearDownPod(pod.Namespace, pod.Name, kubeletTypes.DockerID(networkID)); err != nil { + if networkContainer != nil { + if err := dm.networkPlugin.TearDownPod(runningPod.Namespace, runningPod.Name, kubeletTypes.DockerID(networkContainer.ID)); err != nil { glog.Errorf("Failed tearing down the infra container: %v", err) errs <- err } - if err := dm.killContainer(networkID); err != nil { - glog.Errorf("Failed to delete container: %v; Skipping pod %q", err, pod.ID) + if err := dm.killContainer(networkContainer.ID, networkSpec, pod); err != nil { + glog.Errorf("Failed to delete container: %v; Skipping pod %q", err, runningPod.ID) errs <- err } } @@ -1152,75 +1184,142 @@ func (dm *DockerManager) KillPod(pod kubecontainer.Pod) error { return nil } -// KillContainerInPod kills a container in the pod. -func (dm *DockerManager) KillContainerInPod(container api.Container, pod *api.Pod) error { - // Locate the container. 
- pods, err := dm.GetPods(false) - if err != nil { - return err - } - targetPod := kubecontainer.Pods(pods).FindPod(kubecontainer.GetPodFullName(pod), pod.UID) - targetContainer := targetPod.FindContainerByName(container.Name) - if targetContainer == nil { - return fmt.Errorf("unable to find container %q in pod %q", container.Name, targetPod.Name) - } - return dm.killContainer(targetContainer.ID) -} - -// TODO(vmarmol): Unexport this as it is no longer used externally. -// KillContainer kills a container identified by containerID. -// Internally, it invokes docker's StopContainer API with a timeout of 10s. -// TODO: Deprecate this function in favor of KillContainerInPod. -func (dm *DockerManager) KillContainer(containerID types.UID) error { - return dm.killContainer(containerID) -} - -func (dm *DockerManager) killContainer(containerID types.UID) error { - ID := string(containerID) - glog.V(2).Infof("Killing container with id %q", ID) - inspect, err := dm.client.InspectContainer(ID) - if err != nil { - return err - } - var found bool - var preStop string - if inspect != nil && inspect.Config != nil && inspect.Config.Labels != nil { - preStop, found = inspect.Config.Labels[kubernetesPodLabel] - } - if found { - var pod api.Pod - err := latest.Codec.DecodeInto([]byte(preStop), &pod) +// KillContainerInPod kills a container in the pod. It must be passed either a container ID or a container and pod, +// and will attempt to lookup the other information if missing. +func (dm *DockerManager) KillContainerInPod(containerID types.UID, container *api.Container, pod *api.Pod) error { + switch { + case len(containerID) == 0: + // Locate the container. + pods, err := dm.GetPods(false) if err != nil { - glog.Errorf("Failed to decode prestop: %s, %s", preStop, ID) - } else { - name := inspect.Config.Labels[kubernetesContainerLabel] - var container *api.Container + return err + } + targetPod := kubecontainer.Pods(pods).FindPod(kubecontainer.GetPodFullName(pod), pod.UID) + targetContainer := targetPod.FindContainerByName(container.Name) + if targetContainer == nil { + return fmt.Errorf("unable to find container %q in pod %q", container.Name, targetPod.Name) + } + containerID = targetContainer.ID + + case container == nil || pod == nil: + // Read information about the container from labels + inspect, err := dm.client.InspectContainer(string(containerID)) + if err != nil { + return err + } + storedPod, storedContainer, cerr := containerAndPodFromLabels(inspect) + if cerr != nil { + glog.Errorf("unable to access pod data from container: %v", err) + } + if container == nil { + container = storedContainer + } + if pod == nil { + pod = storedPod + } + } + return dm.killContainer(containerID, container, pod) +} + +// killContainer accepts a containerID and an optional container or pod containing shutdown policies. Invoke +// KillContainerInPod if information must be retrieved first. 
+func (dm *DockerManager) killContainer(containerID types.UID, container *api.Container, pod *api.Pod) error { + ID := string(containerID) + name := ID + if container != nil { + name = fmt.Sprintf("%s %s", name, container.Name) + } + if pod != nil { + name = fmt.Sprintf("%s %s/%s", name, pod.Namespace, pod.Name) + } + + gracePeriod := int64(minimumGracePeriodInSeconds) + if pod != nil { + switch { + case pod.DeletionGracePeriodSeconds != nil: + gracePeriod = *pod.DeletionGracePeriodSeconds + case pod.Spec.TerminationGracePeriodSeconds != nil: + gracePeriod = *pod.Spec.TerminationGracePeriodSeconds + } + } + glog.V(2).Infof("Killing container %q with %d second grace period", name, gracePeriod) + start := util.Now() + + if pod != nil && container != nil && container.Lifecycle != nil && container.Lifecycle.PreStop != nil { + glog.V(4).Infof("Running preStop hook for container %q", name) + // TODO: timebox PreStop execution to at most gracePeriod + if err := dm.runner.Run(ID, pod, container, container.Lifecycle.PreStop); err != nil { + glog.Errorf("preStop hook for container %q failed: %v", name, err) + } + gracePeriod -= int64(util.Now().Sub(start.Time).Seconds()) + } + + dm.readinessManager.RemoveReadiness(ID) + + // always give containers a minimal shutdown window to avoid unnecessary SIGKILLs + if gracePeriod < minimumGracePeriodInSeconds { + gracePeriod = minimumGracePeriodInSeconds + } + err := dm.client.StopContainer(ID, uint(gracePeriod)) + if _, ok := err.(*docker.ContainerNotRunning); ok && err != nil { + glog.V(4).Infof("Container %q has already exited", name) + return nil + } + if err == nil { + glog.V(2).Infof("Container %q exited after %s", name, util.Now().Sub(start.Time)) + } else { + glog.V(2).Infof("Container %q termination failed after %s: %v", name, util.Now().Sub(start.Time), err) + } + ref, ok := dm.containerRefManager.GetRef(ID) + if !ok { + glog.Warningf("No ref for pod '%q'", name) + } else { + // TODO: pass reason down here, and state, or move this call up the stack. 
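Editorial note, not part of the patch: the grace period handed to Docker's StopContainer above follows a simple precedence, summarized by this hypothetical helper (the real code additionally subtracts time already spent in a PreStop hook before enforcing the minimum):

    func effectiveGracePeriod(pod *api.Pod) int64 {
        grace := int64(minimumGracePeriodInSeconds)
        if pod != nil {
            switch {
            case pod.DeletionGracePeriodSeconds != nil:
                // a pending graceful delete request wins over the spec default
                grace = *pod.DeletionGracePeriodSeconds
            case pod.Spec.TerminationGracePeriodSeconds != nil:
                grace = *pod.Spec.TerminationGracePeriodSeconds
            }
        }
        // never SIGKILL immediately; every container gets a minimal shutdown window
        if grace < minimumGracePeriodInSeconds {
            grace = minimumGracePeriodInSeconds
        }
        return grace
    }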
+ dm.recorder.Eventf(ref, "Killing", "Killing with docker id %v", util.ShortenString(ID, 12)) + } + return err +} + +var errNoPodOnContainer = fmt.Errorf("no pod information labels on Docker container") + +// containerAndPodFromLabels tries to load the appropriate container info off of a Docker container's labels +func containerAndPodFromLabels(inspect *docker.Container) (pod *api.Pod, container *api.Container, err error) { + if inspect == nil && inspect.Config == nil && inspect.Config.Labels == nil { + return nil, nil, errNoPodOnContainer + } + labels := inspect.Config.Labels + + // the pod data may not be set + if body, found := labels[kubernetesPodLabel]; found { + pod = &api.Pod{} + if err = latest.Codec.DecodeInto([]byte(body), pod); err == nil { + name := labels[kubernetesContainerLabel] for ix := range pod.Spec.Containers { if pod.Spec.Containers[ix].Name == name { container = &pod.Spec.Containers[ix] break } } - if container != nil { - glog.V(1).Infof("Running preStop hook") - if err := dm.runner.Run(ID, &pod, container, container.Lifecycle.PreStop); err != nil { - glog.Errorf("failed to run preStop hook: %v", err) - } - } else { - glog.Errorf("unable to find container %v, %s", pod, name) + if container == nil { + err = fmt.Errorf("unable to find container %s in pod %v", name, pod) + } + } else { + pod = nil + } + } + + // attempt to find the default grace period if we didn't commit a pod, but set the generic metadata + // field (the one used by kill) + if pod == nil { + if period, ok := labels[kubernetesTerminationGracePeriodLabel]; ok { + if seconds, err := strconv.ParseInt(period, 10, 64); err == nil { + pod = &api.Pod{} + pod.DeletionGracePeriodSeconds = &seconds } } } - dm.readinessManager.RemoveReadiness(ID) - err = dm.client.StopContainer(ID, 10) - ref, ok := dm.containerRefManager.GetRef(ID) - if !ok { - glog.Warningf("No ref for pod '%v'", ID) - } else { - // TODO: pass reason down here, and state, or move this call up the stack. - dm.recorder.Eventf(ref, "Killing", "Killing with docker id %v", util.ShortenString(ID, 12)) - } - return err + + return } // Run a single container from a pod. 
Returns the docker container ID @@ -1253,7 +1352,7 @@ func (dm *DockerManager) runContainerInPod(pod *api.Pod, container *api.Containe if container.Lifecycle != nil && container.Lifecycle.PostStart != nil { handlerErr := dm.runner.Run(id, pod, container, container.Lifecycle.PostStart) if handlerErr != nil { - dm.killContainer(types.UID(id)) + dm.killContainer(types.UID(id), container, pod) return kubeletTypes.DockerID(""), fmt.Errorf("failed to call event handler: %v", handlerErr) } } @@ -1534,10 +1633,10 @@ func (dm *DockerManager) SyncPod(pod *api.Pod, runningPod kubecontainer.Pod, pod podFullName := kubecontainer.GetPodFullName(pod) containerChanges, err := dm.computePodContainerChanges(pod, runningPod, podStatus) - glog.V(3).Infof("Got container changes for pod %q: %+v", podFullName, containerChanges) if err != nil { return err } + glog.V(3).Infof("Got container changes for pod %q: %+v", podFullName, containerChanges) if containerChanges.StartInfraContainer || (len(containerChanges.ContainersToKeep) == 0 && len(containerChanges.ContainersToStart) == 0) { if len(containerChanges.ContainersToKeep) == 0 && len(containerChanges.ContainersToStart) == 0 { @@ -1547,7 +1646,7 @@ func (dm *DockerManager) SyncPod(pod *api.Pod, runningPod kubecontainer.Pod, pod } // Killing phase: if we want to start new infra container, or nothing is running kill everything (including infra container) - err = dm.KillPod(runningPod) + err = dm.KillPod(pod, runningPod) if err != nil { return err } @@ -1557,7 +1656,15 @@ func (dm *DockerManager) SyncPod(pod *api.Pod, runningPod kubecontainer.Pod, pod _, keep := containerChanges.ContainersToKeep[kubeletTypes.DockerID(container.ID)] if !keep { glog.V(3).Infof("Killing unwanted container %+v", container) - err = dm.KillContainer(container.ID) + // attempt to find the appropriate container policy + var podContainer *api.Container + for i, c := range pod.Spec.Containers { + if c.Name == container.Name { + podContainer = &pod.Spec.Containers[i] + break + } + } + err = dm.KillContainerInPod(container.ID, podContainer, pod) if err != nil { glog.Errorf("Error killing container: %v", err) } diff --git a/pkg/kubelet/dockertools/manager_test.go b/pkg/kubelet/dockertools/manager_test.go index 137988c0ae..606404fb6a 100644 --- a/pkg/kubelet/dockertools/manager_test.go +++ b/pkg/kubelet/dockertools/manager_test.go @@ -405,7 +405,7 @@ func TestKillContainerInPod(t *testing.T) { manager.readinessManager.SetReadiness(c.ID, true) } - if err := manager.KillContainerInPod(pod.Spec.Containers[0], pod); err != nil { + if err := manager.KillContainerInPod("", &pod.Spec.Containers[0], pod); err != nil { t.Errorf("unexpected error: %v", err) } // Assert the container has been stopped. @@ -478,14 +478,14 @@ func TestKillContainerInPodWithPreStop(t *testing.T) { manager.readinessManager.SetReadiness(c.ID, true) } - if err := manager.KillContainerInPod(pod.Spec.Containers[0], pod); err != nil { + if err := manager.KillContainerInPod("", &pod.Spec.Containers[0], pod); err != nil { t.Errorf("unexpected error: %v", err) } // Assert the container has been stopped. 
if err := fakeDocker.AssertStopped([]string{containerToKill.ID}); err != nil { t.Errorf("container was not stopped correctly: %v", err) } - verifyCalls(t, fakeDocker, []string{"list", "inspect_container", "create_exec", "start_exec", "stop"}) + verifyCalls(t, fakeDocker, []string{"list", "create_exec", "start_exec", "stop"}) if !reflect.DeepEqual(expectedCmd, fakeDocker.execCmd) { t.Errorf("expected: %v, got %v", expectedCmd, fakeDocker.execCmd) } @@ -522,7 +522,7 @@ func TestKillContainerInPodWithError(t *testing.T) { manager.readinessManager.SetReadiness(c.ID, true) } - if err := manager.KillContainerInPod(pod.Spec.Containers[0], pod); err == nil { + if err := manager.KillContainerInPod("", &pod.Spec.Containers[0], pod); err == nil { t.Errorf("expected error, found nil") } @@ -568,7 +568,7 @@ func replaceProber(dm *DockerManager, result probe.Result, err error) { // Unknown or error. // // PLEASE READ THE PROBE DOCS BEFORE CHANGING THIS TEST IF YOU ARE UNSURE HOW PROBES ARE SUPPOSED TO WORK: -// (See https://github.com/GoogleCloudPlatform/kubernetes/blob/master/docs/pod-states.md#pod-conditions) +// (See https://k8s.io/kubernetes/blob/master/docs/pod-states.md#pod-conditions) func TestProbeContainer(t *testing.T) { manager, _ := newTestDockerManager() dc := &docker.APIContainers{ @@ -1021,7 +1021,7 @@ func TestSyncPodDeletesWithNoPodInfraContainer(t *testing.T) { verifyCalls(t, fakeDocker, []string{ // Kill the container since pod infra container is not running. - "inspect_container", "stop", + "stop", // Create pod infra container. "create", "start", "inspect_container", // Create container. @@ -1096,7 +1096,7 @@ func TestSyncPodDeletesDuplicate(t *testing.T) { // Check the pod infra container. "inspect_container", // Kill the duplicated container. - "inspect_container", "stop", + "stop", }) // Expect one of the duplicates to be killed. if len(fakeDocker.Stopped) != 1 || (fakeDocker.Stopped[0] != "1234" && fakeDocker.Stopped[0] != "4567") { @@ -1150,7 +1150,7 @@ func TestSyncPodBadHash(t *testing.T) { // Check the pod infra container. "inspect_container", // Kill and restart the bad hash container. - "inspect_container", "stop", "create", "start", "inspect_container", + "stop", "create", "start", "inspect_container", }) if err := fakeDocker.AssertStopped([]string{"1234"}); err != nil { @@ -1208,7 +1208,7 @@ func TestSyncPodsUnhealthy(t *testing.T) { // Check the pod infra container. "inspect_container", // Kill the unhealthy container. - "inspect_container", "stop", + "stop", // Restart the unhealthy container. "create", "start", "inspect_container", }) @@ -1443,7 +1443,7 @@ func TestSyncPodWithRestartPolicy(t *testing.T) { // Check the pod infra container. "inspect_container", // Stop the last pod infra container. - "inspect_container", "stop", + "stop", }, []string{}, []string{"9876"}, @@ -1910,7 +1910,7 @@ func TestSyncPodEventHandlerFails(t *testing.T) { // Create the container. "create", "start", // Kill the container since event handler fails. - "inspect_container", "stop", + "stop", }) // TODO(yifan): Check the stopped container's name. diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index 3fd7c83f45..a34d756b3a 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -1120,8 +1120,8 @@ func parseResolvConf(reader io.Reader) (nameservers []string, searches []string, } // Kill all running containers in a pod (includes the pod infra container). 
-func (kl *Kubelet) killPod(pod kubecontainer.Pod) error { - return kl.containerRuntime.KillPod(pod) +func (kl *Kubelet) killPod(pod *api.Pod, runningPod kubecontainer.Pod) error { + return kl.containerRuntime.KillPod(pod, runningPod) } type empty struct{} @@ -1177,9 +1177,10 @@ func (kl *Kubelet) syncPod(pod *api.Pod, mirrorPod *api.Pod, runningPod kubecont }() // Kill pods we can't run. - err := canRunPod(pod) - if err != nil { - kl.killPod(runningPod) + if err := canRunPod(pod); err != nil || pod.DeletionTimestamp != nil { + if err := kl.killPod(pod, runningPod); err != nil { + util.HandleError(err) + } return err } @@ -1366,6 +1367,32 @@ func (kl *Kubelet) cleanupOrphanedVolumes(pods []*api.Pod, runningPods []*kubeco return nil } +// Delete any pods that are no longer running and are marked for deletion. +func (kl *Kubelet) cleanupTerminatedPods(pods []*api.Pod, runningPods []*kubecontainer.Pod) error { + var terminating []*api.Pod + for _, pod := range pods { + if pod.DeletionTimestamp != nil { + found := false + for _, runningPod := range runningPods { + if runningPod.ID == pod.UID { + found = true + break + } + } + if found { + podFullName := kubecontainer.GetPodFullName(pod) + glog.V(5).Infof("Keeping terminated pod %q and uid %q, still running", podFullName, pod.UID) + continue + } + terminating = append(terminating, pod) + } + } + if !kl.statusManager.TerminatePods(terminating) { + return errors.New("not all pods were successfully terminated") + } + return nil +} + // pastActiveDeadline returns true if the pod has been active for more than // ActiveDeadlineSeconds. func (kl *Kubelet) pastActiveDeadline(pod *api.Pod) bool { @@ -1528,6 +1555,10 @@ func (kl *Kubelet) cleanupPods(allPods []*api.Pod, admittedPods []*api.Pod) erro // Remove any orphaned mirror pods. kl.podManager.DeleteOrphanedMirrorPods() + if err := kl.cleanupTerminatedPods(allPods, runningPods); err != nil { + glog.Errorf("Failed to cleanup terminated pods: %v", err) + } + return err } @@ -1550,7 +1581,7 @@ func (kl *Kubelet) killUnwantedPods(desiredPods map[types.UID]empty, }() glog.V(1).Infof("Killing unwanted pod %q", pod.Name) // Stop the containers. - err = kl.killPod(*pod) + err = kl.killPod(nil, *pod) if err != nil { glog.Errorf("Failed killing the pod %q: %v", pod.Name, err) return diff --git a/pkg/kubelet/mirror_client.go b/pkg/kubelet/mirror_client.go index ae54c33017..8ef630d1b7 100644 --- a/pkg/kubelet/mirror_client.go +++ b/pkg/kubelet/mirror_client.go @@ -64,7 +64,7 @@ func (mc *basicMirrorClient) DeleteMirrorPod(podFullName string) error { return err } glog.V(4).Infof("Deleting a mirror pod %q", podFullName) - if err := mc.apiserverClient.Pods(namespace).Delete(name, nil); err != nil { + if err := mc.apiserverClient.Pods(namespace).Delete(name, api.NewDeleteOptions(0)); err != nil { glog.Errorf("Failed deleting a mirror pod %q: %v", podFullName, err) } return nil diff --git a/pkg/kubelet/rkt/rkt.go b/pkg/kubelet/rkt/rkt.go index 92d718636b..bb9ef9147b 100644 --- a/pkg/kubelet/rkt/rkt.go +++ b/pkg/kubelet/rkt/rkt.go @@ -676,11 +676,11 @@ func (r *runtime) GetPods(all bool) ([]*kubecontainer.Pod, error) { } // KillPod invokes 'systemctl kill' to kill the unit that runs the pod. -func (r *runtime) KillPod(pod kubecontainer.Pod) error { - glog.V(4).Infof("Rkt is killing pod: name %q.", pod.Name) +func (r *runtime) KillPod(pod *api.Pod, runningPod kubecontainer.Pod) error { + glog.V(4).Infof("Rkt is killing pod: name %q.", runningPod.Name) // TODO(yifan): More graceful stop. 
Replace with StopUnit and wait for a timeout. - r.systemd.KillUnit(makePodServiceFileName(pod.ID), int32(syscall.SIGKILL)) + r.systemd.KillUnit(makePodServiceFileName(runningPod.ID), int32(syscall.SIGKILL)) return r.systemd.Reload() } @@ -887,7 +887,7 @@ func (r *runtime) SyncPod(pod *api.Pod, runningPod kubecontainer.Pod, podStatus if restartPod { // TODO(yifan): Handle network plugin. - if err := r.KillPod(runningPod); err != nil { + if err := r.KillPod(pod, runningPod); err != nil { return err } if err := r.RunPod(pod); err != nil { diff --git a/pkg/kubelet/status_manager.go b/pkg/kubelet/status_manager.go index 9f7f45e1b0..448c5bd585 100644 --- a/pkg/kubelet/status_manager.go +++ b/pkg/kubelet/status_manager.go @@ -24,6 +24,7 @@ import ( "github.com/golang/glog" "k8s.io/kubernetes/pkg/api" + "k8s.io/kubernetes/pkg/api/errors" "k8s.io/kubernetes/pkg/client" kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" kubeletTypes "k8s.io/kubernetes/pkg/kubelet/types" @@ -122,7 +123,7 @@ func (s *statusManager) SetPodStatus(pod *api.Pod, status api.PodStatus) { // Currently this routine is not called for the same pod from multiple // workers and/or the kubelet but dropping the lock before sending the // status down the channel feels like an easy way to get a bullet in foot. - if !found || !isStatusEqual(&oldStatus, &status) { + if !found || !isStatusEqual(&oldStatus, &status) || pod.DeletionTimestamp != nil { s.podStatuses[podFullName] = status s.podStatusChannel <- podStatusSyncRequest{pod, status} } else { @@ -130,6 +131,29 @@ func (s *statusManager) SetPodStatus(pod *api.Pod, status api.PodStatus) { } } +// TerminatePods resets the container status for the provided pods to terminated and triggers +// a status update. This function may not enqueue all the provided pods, in which case it will +// return false +func (s *statusManager) TerminatePods(pods []*api.Pod) bool { + sent := true + s.podStatusesLock.Lock() + defer s.podStatusesLock.Unlock() + for _, pod := range pods { + for i := range pod.Status.ContainerStatuses { + pod.Status.ContainerStatuses[i].State = api.ContainerState{ + Terminated: &api.ContainerStateTerminated{}, + } + } + select { + case s.podStatusChannel <- podStatusSyncRequest{pod, pod.Status}: + default: + sent = false + glog.V(4).Infof("Termination notice for %q was dropped because the status channel is full", kubeletUtil.FormatPodName(pod)) + } + } + return sent +} + func (s *statusManager) DeletePodStatus(podFullName string) { s.podStatusesLock.Lock() defer s.podStatusesLock.Unlock() @@ -161,13 +185,33 @@ func (s *statusManager) syncBatch() error { } // TODO: make me easier to express from client code statusPod, err = s.kubeClient.Pods(statusPod.Namespace).Get(statusPod.Name) + if errors.IsNotFound(err) { + glog.V(3).Infof("Pod %q was deleted on the server", pod.Name) + return nil + } if err == nil { + if len(pod.UID) > 0 && statusPod.UID != pod.UID { + glog.V(3).Infof("Pod %q was deleted and then recreated, skipping status update", kubeletUtil.FormatPodName(pod)) + return nil + } statusPod.Status = status - _, err = s.kubeClient.Pods(pod.Namespace).UpdateStatus(statusPod) // TODO: handle conflict as a retry, make that easier too. 
+ statusPod, err = s.kubeClient.Pods(pod.Namespace).UpdateStatus(statusPod) if err == nil { glog.V(3).Infof("Status for pod %q updated successfully", kubeletUtil.FormatPodName(pod)) - return nil + + if pod.DeletionTimestamp == nil { + return nil + } + if !notRunning(pod.Status.ContainerStatuses) { + glog.V(3).Infof("Pod %q is terminated, but some pods are still running", pod.Name) + return nil + } + if err := s.kubeClient.Pods(statusPod.Namespace).Delete(statusPod.Name, api.NewDeleteOptions(0)); err == nil { + glog.V(3).Infof("Pod %q fully terminated and removed from etcd", statusPod.Name) + s.DeletePodStatus(podFullName) + return nil + } } } @@ -181,3 +225,14 @@ func (s *statusManager) syncBatch() error { go s.DeletePodStatus(podFullName) return fmt.Errorf("error updating status for pod %q: %v", kubeletUtil.FormatPodName(pod), err) } + +// notRunning returns true if every status is terminated or waiting, or the status list +// is empty. +func notRunning(statuses []api.ContainerStatus) bool { + for _, status := range statuses { + if status.State.Terminated == nil && status.State.Waiting == nil { + return false + } + } + return true +} diff --git a/pkg/kubelet/status_manager_test.go b/pkg/kubelet/status_manager_test.go index 585d2ffd34..22e9327502 100644 --- a/pkg/kubelet/status_manager_test.go +++ b/pkg/kubelet/status_manager_test.go @@ -153,8 +153,21 @@ func TestUnchangedStatus(t *testing.T) { verifyUpdates(t, syncer, 1) } +func TestSyncBatchIgnoresNotFound(t *testing.T) { + syncer := newTestStatusManager() + syncer.SetPodStatus(testPod, getRandomPodStatus()) + err := syncer.syncBatch() + if err != nil { + t.Errorf("unexpected syncing error: %v", err) + } + verifyActions(t, syncer.kubeClient, []testclient.Action{ + testclient.GetActionImpl{ActionImpl: testclient.ActionImpl{Verb: "get", Resource: "pods"}}, + }) +} + func TestSyncBatch(t *testing.T) { syncer := newTestStatusManager() + syncer.kubeClient = testclient.NewSimpleFake(testPod) syncer.SetPodStatus(testPod, getRandomPodStatus()) err := syncer.syncBatch() if err != nil { @@ -167,6 +180,22 @@ func TestSyncBatch(t *testing.T) { ) } +func TestSyncBatchChecksMismatchedUID(t *testing.T) { + syncer := newTestStatusManager() + testPod.UID = "first" + differentPod := *testPod + differentPod.UID = "second" + syncer.kubeClient = testclient.NewSimpleFake(testPod) + syncer.SetPodStatus(&differentPod, getRandomPodStatus()) + err := syncer.syncBatch() + if err != nil { + t.Errorf("unexpected syncing error: %v", err) + } + verifyActions(t, syncer.kubeClient, []testclient.Action{ + testclient.GetActionImpl{ActionImpl: testclient.ActionImpl{Verb: "get", Resource: "pods"}}, + }) +} + // shuffle returns a new shuffled list of container statuses. func shuffle(statuses []api.ContainerStatus) []api.ContainerStatus { numStatuses := len(statuses) diff --git a/pkg/registry/controller/etcd/etcd_test.go b/pkg/registry/controller/etcd/etcd_test.go index c826400d35..fddea31d16 100644 --- a/pkg/registry/controller/etcd/etcd_test.go +++ b/pkg/registry/controller/etcd/etcd_test.go @@ -695,7 +695,7 @@ func TestDelete(t *testing.T) { // If the controller is still around after trying to delete either the delete // failed, or we're deleting it gracefully. 
if fakeClient.Data[key].R.Node != nil { - return true + return fakeClient.Data[key].R.Node.TTL != 0 } return false } diff --git a/pkg/registry/event/etcd/etcd_test.go b/pkg/registry/event/etcd/etcd_test.go index bf3504303d..6d7403368f 100644 --- a/pkg/registry/event/etcd/etcd_test.go +++ b/pkg/registry/event/etcd/etcd_test.go @@ -37,6 +37,7 @@ var testTTL uint64 = 60 func NewTestEventStorage(t *testing.T) (*tools.FakeEtcdClient, *REST) { f := tools.NewFakeEtcdClient(t) + f.HideExpires = true f.TestIndex = true s := etcdstorage.NewEtcdStorage(f, testapi.Codec(), etcdtest.PathPrefix()) diff --git a/pkg/registry/generic/etcd/etcd.go b/pkg/registry/generic/etcd/etcd.go index 0b0d4190ac..3242d20bc7 100644 --- a/pkg/registry/generic/etcd/etcd.go +++ b/pkg/registry/generic/etcd/etcd.go @@ -341,6 +341,11 @@ func (e *Etcd) Get(ctx api.Context, name string) (runtime.Object, error) { return obj, nil } +var ( + errAlreadyDeleting = fmt.Errorf("abort delete") + errDeleteNow = fmt.Errorf("delete now") +) + // Delete removes the item from etcd. func (e *Etcd) Delete(ctx api.Context, name string, options *api.DeleteOptions) (runtime.Object, error) { key, err := e.KeyFunc(ctx, name) @@ -367,13 +372,41 @@ func (e *Etcd) Delete(ctx api.Context, name string, options *api.DeleteOptions) if pendingGraceful { return e.finalizeDelete(obj, false) } - if graceful && *options.GracePeriodSeconds != 0 { + if graceful { trace.Step("Graceful deletion") out := e.NewFunc() - if err := e.Storage.Set(key, obj, out, uint64(*options.GracePeriodSeconds)); err != nil { + lastGraceful := int64(0) + err := e.Storage.GuaranteedUpdate( + key, out, false, + storage.SimpleUpdate(func(existing runtime.Object) (runtime.Object, error) { + graceful, pendingGraceful, err := rest.BeforeDelete(e.DeleteStrategy, ctx, existing, options) + if err != nil { + return nil, err + } + if pendingGraceful { + return nil, errAlreadyDeleting + } + if !graceful { + return nil, errDeleteNow + } + lastGraceful = *options.GracePeriodSeconds + return existing, nil + }), + ) + switch err { + case nil: + if lastGraceful > 0 { + return out, nil + } + // fall through and delete immediately + case errDeleteNow: + // we've updated the object to have a zero grace period, or it's already at 0, so + // we should fall through and truly delete the object. 
+ case errAlreadyDeleting: + return e.finalizeDelete(obj, true) + default: return nil, etcderr.InterpretUpdateError(err, e.EndpointName, name) } - return e.finalizeDelete(out, true) } // delete immediately, or no graceful deletion supported diff --git a/pkg/registry/persistentvolume/etcd/etcd_test.go b/pkg/registry/persistentvolume/etcd/etcd_test.go index 07da7b165b..c7b31488d0 100644 --- a/pkg/registry/persistentvolume/etcd/etcd_test.go +++ b/pkg/registry/persistentvolume/etcd/etcd_test.go @@ -300,7 +300,7 @@ func TestEtcdUpdateStatus(t *testing.T) { key, _ := storage.KeyFunc(ctx, "foo") key = etcdtest.AddPrefix(key) pvStart := validNewPersistentVolume("foo") - fakeClient.Set(key, runtime.EncodeOrDie(latest.Codec, pvStart), 1) + fakeClient.Set(key, runtime.EncodeOrDie(latest.Codec, pvStart), 0) pvIn := &api.PersistentVolume{ ObjectMeta: api.ObjectMeta{ diff --git a/pkg/registry/persistentvolumeclaim/etcd/etcd_test.go b/pkg/registry/persistentvolumeclaim/etcd/etcd_test.go index e6fecd389e..c36d0e01e2 100644 --- a/pkg/registry/persistentvolumeclaim/etcd/etcd_test.go +++ b/pkg/registry/persistentvolumeclaim/etcd/etcd_test.go @@ -298,7 +298,7 @@ func TestEtcdUpdateStatus(t *testing.T) { key, _ := storage.KeyFunc(ctx, "foo") key = etcdtest.AddPrefix(key) pvcStart := validNewPersistentVolumeClaim("foo", api.NamespaceDefault) - fakeClient.Set(key, runtime.EncodeOrDie(latest.Codec, pvcStart), 1) + fakeClient.Set(key, runtime.EncodeOrDie(latest.Codec, pvcStart), 0) pvc := &api.PersistentVolumeClaim{ ObjectMeta: api.ObjectMeta{ diff --git a/pkg/registry/pod/etcd/etcd_test.go b/pkg/registry/pod/etcd/etcd_test.go index e05f3065e9..f565dfcabc 100644 --- a/pkg/registry/pod/etcd/etcd_test.go +++ b/pkg/registry/pod/etcd/etcd_test.go @@ -56,6 +56,7 @@ func newStorage(t *testing.T) (*REST, *BindingREST, *StatusREST, *tools.FakeEtcd } func validNewPod() *api.Pod { + grace := int64(30) return &api.Pod{ ObjectMeta: api.ObjectMeta{ Name: "foo", @@ -64,6 +65,8 @@ func validNewPod() *api.Pod { Spec: api.PodSpec{ RestartPolicy: api.RestartPolicyAlways, DNSPolicy: api.DNSClusterFirst, + + TerminationGracePeriodSeconds: &grace, Containers: []api.Container{ { Name: "foo", @@ -118,8 +121,10 @@ func TestDelete(t *testing.T) { key = etcdtest.AddPrefix(key) test := resttest.New(t, storage, fakeEtcdClient.SetError) + expectedNode := "some-node" createFn := func() runtime.Object { pod := validChangedPod() + pod.Spec.NodeName = expectedNode fakeEtcdClient.Data[key] = tools.EtcdResponseWithError{ R: &etcd.Response{ Node: &etcd.Node{ @@ -134,8 +139,17 @@ func TestDelete(t *testing.T) { if fakeEtcdClient.Data[key].R.Node == nil { return false } - return fakeEtcdClient.Data[key].R.Node.TTL == 30 + obj, err := latest.Codec.Decode([]byte(fakeEtcdClient.Data[key].R.Node.Value)) + if err != nil { + return false + } + pod := obj.(*api.Pod) + t.Logf("found object %#v", pod.ObjectMeta) + return pod.DeletionTimestamp != nil && pod.DeletionGracePeriodSeconds != nil && *pod.DeletionGracePeriodSeconds != 0 } + test.TestDeleteGraceful(createFn, 30, gracefulSetFn) + + expectedNode = "" test.TestDelete(createFn, gracefulSetFn) } @@ -1027,6 +1041,7 @@ func TestEtcdUpdateScheduled(t *testing.T) { }, }), 1) + grace := int64(30) podIn := api.Pod{ ObjectMeta: api.ObjectMeta{ Name: "foo", @@ -1048,6 +1063,8 @@ func TestEtcdUpdateScheduled(t *testing.T) { }, RestartPolicy: api.RestartPolicyAlways, DNSPolicy: api.DNSClusterFirst, + + TerminationGracePeriodSeconds: &grace, }, } _, _, err := registry.Update(ctx, &podIn) @@ -1088,7 +1105,7 @@ func 
TestEtcdUpdateStatus(t *testing.T) { }, }, } - fakeClient.Set(key, runtime.EncodeOrDie(latest.Codec, &podStart), 1) + fakeClient.Set(key, runtime.EncodeOrDie(latest.Codec, &podStart), 0) podIn := api.Pod{ ObjectMeta: api.ObjectMeta{ @@ -1117,6 +1134,8 @@ func TestEtcdUpdateStatus(t *testing.T) { expected := podStart expected.ResourceVersion = "2" + grace := int64(30) + expected.Spec.TerminationGracePeriodSeconds = &grace expected.Spec.RestartPolicy = api.RestartPolicyAlways expected.Spec.DNSPolicy = api.DNSClusterFirst expected.Spec.Containers[0].ImagePullPolicy = api.PullIfNotPresent diff --git a/pkg/registry/pod/rest.go b/pkg/registry/pod/rest.go index e62d7ab27a..699d9cddec 100644 --- a/pkg/registry/pod/rest.go +++ b/pkg/registry/pod/rest.go @@ -81,15 +81,49 @@ func (podStrategy) ValidateUpdate(ctx api.Context, obj, old runtime.Object) fiel return append(errorList, validation.ValidatePodUpdate(obj.(*api.Pod), old.(*api.Pod))...) } +// AllowUnconditionalUpdate allows pods to be overwritten func (podStrategy) AllowUnconditionalUpdate() bool { return true } -// CheckGracefulDelete allows a pod to be gracefully deleted. +// CheckGracefulDelete allows a pod to be gracefully deleted. It updates the DeleteOptions to +// reflect the desired grace value. func (podStrategy) CheckGracefulDelete(obj runtime.Object, options *api.DeleteOptions) bool { + if options == nil { + return false + } + pod := obj.(*api.Pod) + period := int64(0) + // user has specified a value + if options.GracePeriodSeconds != nil { + period = *options.GracePeriodSeconds + } else { + // use the default value if set, or delete the pod immediately (0) + if pod.Spec.TerminationGracePeriodSeconds != nil { + period = *pod.Spec.TerminationGracePeriodSeconds + } + } + // if the pod is not scheduled, delete immediately + if len(pod.Spec.NodeName) == 0 { + period = 0 + } + // ensure the options and the pod are in sync + options.GracePeriodSeconds = &period + return true +} + +type podStrategyWithoutGraceful struct { + podStrategy +} + +// CheckGracefulDelete prohibits graceful deletion. +func (podStrategyWithoutGraceful) CheckGracefulDelete(obj runtime.Object, options *api.DeleteOptions) bool { return false } +// StrategyWithoutGraceful implements the legacy instant delete behavior.
+var StrategyWithoutGraceful = podStrategyWithoutGraceful{Strategy} + type podStatusStrategy struct { podStrategy } @@ -100,6 +134,7 @@ func (podStatusStrategy) PrepareForUpdate(obj, old runtime.Object) { newPod := obj.(*api.Pod) oldPod := old.(*api.Pod) newPod.Spec = oldPod.Spec + newPod.DeletionTimestamp = nil } func (podStatusStrategy) ValidateUpdate(ctx api.Context, obj, old runtime.Object) fielderrors.ValidationErrorList { diff --git a/pkg/registry/resourcequota/etcd/etcd_test.go b/pkg/registry/resourcequota/etcd/etcd_test.go index 05198ebdf9..d99ae1f091 100644 --- a/pkg/registry/resourcequota/etcd/etcd_test.go +++ b/pkg/registry/resourcequota/etcd/etcd_test.go @@ -389,7 +389,7 @@ func TestEtcdUpdateStatus(t *testing.T) { key, _ := registry.KeyFunc(ctx, "foo") key = etcdtest.AddPrefix(key) resourcequotaStart := validNewResourceQuota() - fakeClient.Set(key, runtime.EncodeOrDie(latest.Codec, resourcequotaStart), 1) + fakeClient.Set(key, runtime.EncodeOrDie(latest.Codec, resourcequotaStart), 0) resourcequotaIn := &api.ResourceQuota{ ObjectMeta: api.ObjectMeta{ diff --git a/pkg/storage/cacher_test.go b/pkg/storage/cacher_test.go index 0603ec3465..edbcaf32ca 100644 --- a/pkg/storage/cacher_test.go +++ b/pkg/storage/cacher_test.go @@ -54,11 +54,13 @@ func newTestCacher(client tools.EtcdClient) *storage.Cacher { } func makeTestPod(name string) *api.Pod { + gracePeriod := int64(30) return &api.Pod{ ObjectMeta: api.ObjectMeta{Namespace: "ns", Name: name}, Spec: api.PodSpec{ - DNSPolicy: api.DNSClusterFirst, - RestartPolicy: api.RestartPolicyAlways, + TerminationGracePeriodSeconds: &gracePeriod, + DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, }, } } diff --git a/pkg/storage/etcd/etcd_helper.go b/pkg/storage/etcd/etcd_helper.go index 09efa9819c..f4a0747235 100644 --- a/pkg/storage/etcd/etcd_helper.go +++ b/pkg/storage/etcd/etcd_helper.go @@ -394,14 +394,21 @@ func (h *etcdHelper) GuaranteedUpdate(key string, ptrToType runtime.Object, igno ttl := uint64(0) if node != nil { index = node.ModifiedIndex - if node.TTL > 0 { + if node.TTL != 0 { ttl = uint64(node.TTL) } + if node.Expiration != nil && ttl == 0 { + ttl = 1 + } } else if res != nil { index = res.EtcdIndex } if newTTL != nil { + if ttl != 0 && *newTTL == 0 { + // TODO: remove this after we have verified this is no longer an issue + glog.V(4).Infof("GuaranteedUpdate is clearing TTL for %q, may not be intentional", key) + } ttl = *newTTL } diff --git a/pkg/storage/etcd/etcd_helper_test.go b/pkg/storage/etcd/etcd_helper_test.go index 4cb6ef73d3..c84b66f808 100644 --- a/pkg/storage/etcd/etcd_helper_test.go +++ b/pkg/storage/etcd/etcd_helper_test.go @@ -123,28 +123,32 @@ func TestList(t *testing.T) { }, }, } + grace := int64(30) expect := api.PodList{ ListMeta: api.ListMeta{ResourceVersion: "10"}, Items: []api.Pod{ { ObjectMeta: api.ObjectMeta{Name: "bar", ResourceVersion: "2"}, Spec: api.PodSpec{ - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, }, }, { ObjectMeta: api.ObjectMeta{Name: "baz", ResourceVersion: "3"}, Spec: api.PodSpec{ - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, }, }, { ObjectMeta: api.ObjectMeta{Name: "foo", ResourceVersion: "1"}, Spec: api.PodSpec{ - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: 
api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, }, }, }, @@ -206,6 +210,7 @@ func TestListAcrossDirectories(t *testing.T) { }, }, } + grace := int64(30) expect := api.PodList{ ListMeta: api.ListMeta{ResourceVersion: "10"}, Items: []api.Pod{ @@ -213,22 +218,25 @@ func TestListAcrossDirectories(t *testing.T) { { ObjectMeta: api.ObjectMeta{Name: "baz", ResourceVersion: "3"}, Spec: api.PodSpec{ - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, }, }, { ObjectMeta: api.ObjectMeta{Name: "foo", ResourceVersion: "1"}, Spec: api.PodSpec{ - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, }, }, { ObjectMeta: api.ObjectMeta{Name: "bar", ResourceVersion: "2"}, Spec: api.PodSpec{ - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, }, }, }, @@ -278,28 +286,32 @@ func TestListExcludesDirectories(t *testing.T) { }, }, } + grace := int64(30) expect := api.PodList{ ListMeta: api.ListMeta{ResourceVersion: "10"}, Items: []api.Pod{ { ObjectMeta: api.ObjectMeta{Name: "bar", ResourceVersion: "2"}, Spec: api.PodSpec{ - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, }, }, { ObjectMeta: api.ObjectMeta{Name: "baz", ResourceVersion: "3"}, Spec: api.PodSpec{ - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, }, }, { ObjectMeta: api.ObjectMeta{Name: "foo", ResourceVersion: "1"}, Spec: api.PodSpec{ - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, }, }, }, @@ -319,11 +331,13 @@ func TestGet(t *testing.T) { fakeClient := tools.NewFakeEtcdClient(t) helper := newEtcdHelper(fakeClient, testapi.Codec(), etcdtest.PathPrefix()) key := etcdtest.AddPrefix("/some/key") + grace := int64(30) expect := api.Pod{ ObjectMeta: api.ObjectMeta{Name: "foo"}, Spec: api.PodSpec{ - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, }, } fakeClient.Set(key, runtime.EncodeOrDie(testapi.Codec(), &expect), 0) diff --git a/pkg/storage/etcd/etcd_watcher.go b/pkg/storage/etcd/etcd_watcher.go index 93fdd16829..4ab51cabbd 100644 --- a/pkg/storage/etcd/etcd_watcher.go +++ b/pkg/storage/etcd/etcd_watcher.go @@ -38,6 +38,7 @@ const ( EtcdSet = "set" EtcdCAS = "compareAndSwap" EtcdDelete = "delete" + EtcdExpire = "expire" ) // TransformFunc attempts to convert an object to another object for use with a watcher. 
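The next hunk makes the watcher translate etcd TTL expirations into the same watch event as explicit deletions. The following is a minimal, self-contained sketch of that mapping for illustration only; the type and function names here are invented, while the real watcher dispatches an *etcd.Response to sendAdd/sendModify/sendDelete:

package main

import "fmt"

// watchEvent stands in for watch.EventType in this sketch.
type watchEvent string

const (
	added    watchEvent = "ADDED"
	modified watchEvent = "MODIFIED"
	deleted  watchEvent = "DELETED"
)

// eventForAction maps an etcd action string to the watch event it should produce.
// With "expire" handled, a key removed because its TTL ran out looks to watchers
// exactly like a key that was deleted explicitly.
func eventForAction(action string) (watchEvent, bool) {
	switch action {
	case "create":
		return added, true
	case "set", "compareAndSwap":
		return modified, true
	case "delete", "expire":
		return deleted, true
	default:
		return "", false
	}
}

func main() {
	for _, action := range []string{"set", "delete", "expire"} {
		event, _ := eventForAction(action)
		fmt.Printf("%s -> %s\n", action, event)
	}
}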
@@ -353,7 +354,7 @@ func (w *etcdWatcher) sendResult(res *etcd.Response) { w.sendAdd(res) case EtcdSet, EtcdCAS: w.sendModify(res) - case EtcdDelete: + case EtcdDelete, EtcdExpire: w.sendDelete(res) default: glog.Errorf("unknown action: %v", res.Action) diff --git a/pkg/tools/fake_etcd_client.go b/pkg/tools/fake_etcd_client.go index 22b8a59e76..87a30bd912 100644 --- a/pkg/tools/fake_etcd_client.go +++ b/pkg/tools/fake_etcd_client.go @@ -20,6 +20,7 @@ import ( "errors" "sort" "sync" + "time" "github.com/coreos/go-etcd/etcd" ) @@ -52,6 +53,8 @@ type FakeEtcdClient struct { TestIndex bool ChangeIndex uint64 LastSetTTL uint64 + // Will avoid setting the expires header on objects to make comparison easier + HideExpires bool Machines []string // Will become valid after Watch is called; tester may write to it. Tester may @@ -175,6 +178,11 @@ func (f *FakeEtcdClient) setLocked(key, value string, ttl uint64) (*etcd.Respons prevResult := f.Data[key] createdIndex := prevResult.R.Node.CreatedIndex f.t.Logf("updating %v, index %v -> %v (ttl: %d)", key, createdIndex, i, ttl) + var expires *time.Time + if !f.HideExpires && ttl > 0 { + now := time.Now() + expires = &now + } result := EtcdResponseWithError{ R: &etcd.Response{ Node: &etcd.Node{ @@ -182,6 +190,7 @@ func (f *FakeEtcdClient) setLocked(key, value string, ttl uint64) (*etcd.Respons CreatedIndex: createdIndex, ModifiedIndex: i, TTL: int64(ttl), + Expiration: expires, }, }, } diff --git a/plugin/pkg/scheduler/factory/factory_test.go b/plugin/pkg/scheduler/factory/factory_test.go index a5795b4e38..22f05d186d 100644 --- a/plugin/pkg/scheduler/factory/factory_test.go +++ b/plugin/pkg/scheduler/factory/factory_test.go @@ -132,11 +132,13 @@ func PriorityTwo(pod *api.Pod, podLister algorithm.PodLister, minionLister algor } func TestDefaultErrorFunc(t *testing.T) { + grace := int64(30) testPod := &api.Pod{ ObjectMeta: api.ObjectMeta{Name: "foo", Namespace: "bar"}, Spec: api.PodSpec{ - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, }, } handler := util.FakeHandler{ diff --git a/test/e2e/kubectl.go b/test/e2e/kubectl.go index 02d289ba25..a79946bd5e 100644 --- a/test/e2e/kubectl.go +++ b/test/e2e/kubectl.go @@ -791,7 +791,7 @@ func getUDData(jpgExpected string, ns string) func(*client.Client, string) error if strings.Contains(data.Image, jpgExpected) { return nil } else { - return errors.New(fmt.Sprintf("data served up in container is innaccurate, %s didn't contain %s", data, jpgExpected)) + return errors.New(fmt.Sprintf("data served up in container is inaccurate, %s didn't contain %s", data, jpgExpected)) } } } diff --git a/test/e2e/pd.go b/test/e2e/pd.go index 56517247bd..6d22c6e2cd 100644 --- a/test/e2e/pd.go +++ b/test/e2e/pd.go @@ -78,8 +78,8 @@ var _ = Describe("Pod Disks", func() { By("cleaning up PD-RW test environment") // Teardown pods, PD. Ignore errors. // Teardown should do nothing unless test failed. 
- podClient.Delete(host0Pod.Name, nil) - podClient.Delete(host1Pod.Name, nil) + podClient.Delete(host0Pod.Name, api.NewDeleteOptions(0)) + podClient.Delete(host1Pod.Name, api.NewDeleteOptions(0)) detachPD(host0Name, diskName) detachPD(host1Name, diskName) deletePD(diskName) @@ -98,7 +98,7 @@ var _ = Describe("Pod Disks", func() { Logf("Wrote value: %v", testFileContents) By("deleting host0Pod") - expectNoError(podClient.Delete(host0Pod.Name, nil), "Failed to delete host0Pod") + expectNoError(podClient.Delete(host0Pod.Name, api.NewDeleteOptions(0)), "Failed to delete host0Pod") By("submitting host1Pod to kubernetes") _, err = podClient.Create(host1Pod) @@ -113,7 +113,7 @@ var _ = Describe("Pod Disks", func() { Expect(strings.TrimSpace(v)).To(Equal(strings.TrimSpace(testFileContents))) By("deleting host1Pod") - expectNoError(podClient.Delete(host1Pod.Name, nil), "Failed to delete host1Pod") + expectNoError(podClient.Delete(host1Pod.Name, api.NewDeleteOptions(0)), "Failed to delete host1Pod") By(fmt.Sprintf("deleting PD %q", diskName)) deletePDWithRetry(diskName) @@ -136,9 +136,9 @@ var _ = Describe("Pod Disks", func() { By("cleaning up PD-RO test environment") // Teardown pods, PD. Ignore errors. // Teardown should do nothing unless test failed. - podClient.Delete(rwPod.Name, nil) - podClient.Delete(host0ROPod.Name, nil) - podClient.Delete(host1ROPod.Name, nil) + podClient.Delete(rwPod.Name, api.NewDeleteOptions(0)) + podClient.Delete(host0ROPod.Name, api.NewDeleteOptions(0)) + podClient.Delete(host1ROPod.Name, api.NewDeleteOptions(0)) detachPD(host0Name, diskName) detachPD(host1Name, diskName) @@ -149,7 +149,7 @@ var _ = Describe("Pod Disks", func() { _, err = podClient.Create(rwPod) expectNoError(err, "Failed to create rwPod") expectNoError(framework.WaitForPodRunning(rwPod.Name)) - expectNoError(podClient.Delete(rwPod.Name, nil), "Failed to delete host0Pod") + expectNoError(podClient.Delete(rwPod.Name, api.NewDeleteOptions(0)), "Failed to delete host0Pod") expectNoError(waitForPDDetach(diskName, host0Name)) By("submitting host0ROPod to kubernetes") @@ -165,10 +165,10 @@ var _ = Describe("Pod Disks", func() { expectNoError(framework.WaitForPodRunning(host1ROPod.Name)) By("deleting host0ROPod") - expectNoError(podClient.Delete(host0ROPod.Name, nil), "Failed to delete host0ROPod") + expectNoError(podClient.Delete(host0ROPod.Name, api.NewDeleteOptions(0)), "Failed to delete host0ROPod") By("deleting host1ROPod") - expectNoError(podClient.Delete(host1ROPod.Name, nil), "Failed to delete host1ROPod") + expectNoError(podClient.Delete(host1ROPod.Name, api.NewDeleteOptions(0)), "Failed to delete host1ROPod") By(fmt.Sprintf("deleting PD %q", diskName)) deletePDWithRetry(diskName) diff --git a/test/e2e/pods.go b/test/e2e/pods.go index 4fc7b2d70d..c28758a36f 100644 --- a/test/e2e/pods.go +++ b/test/e2e/pods.go @@ -43,7 +43,7 @@ func runLivenessTest(c *client.Client, ns string, podDescr *api.Pod, expectResta // At the end of the test, clean up by removing the pod. defer func() { By("deleting the pod") - c.Pods(ns).Delete(podDescr.Name, nil) + c.Pods(ns).Delete(podDescr.Name, api.NewDeleteOptions(0)) }() // Wait until the pod is not pending. 
(Here we need to check for something other than @@ -86,15 +86,14 @@ func runLivenessTest(c *client.Client, ns string, podDescr *api.Pod, expectResta func testHostIP(c *client.Client, ns string, pod *api.Pod) { podClient := c.Pods(ns) By("creating pod") - defer podClient.Delete(pod.Name, nil) - _, err := podClient.Create(pod) - if err != nil { + defer podClient.Delete(pod.Name, api.NewDeleteOptions(0)) + if _, err := podClient.Create(pod); err != nil { Failf("Failed to create pod: %v", err) } By("ensuring that pod is running and has a hostIP") // Wait for the pods to enter the running state. Waiting loops until the pods // are running so non-running pods cause a timeout for this test. - err = waitForPodRunningInNamespace(c, pod.Name, ns) + err := waitForPodRunningInNamespace(c, pod.Name, ns) Expect(err).NotTo(HaveOccurred()) // Try to make sure we get a hostIP for each pod. hostIPTimeout := 2 * time.Minute @@ -222,7 +221,7 @@ var _ = Describe("Pods", func() { // We call defer here in case there is a problem with // the test so we can ensure that we clean up after // ourselves - defer podClient.Delete(pod.Name, nil) + defer podClient.Delete(pod.Name, api.NewDeleteOptions(0)) _, err = podClient.Create(pod) if err != nil { Failf("Failed to create pod: %v", err) @@ -235,7 +234,7 @@ var _ = Describe("Pods", func() { } Expect(len(pods.Items)).To(Equal(1)) - By("veryfying pod creation was observed") + By("verifying pod creation was observed") select { case event, _ := <-w.ResultChan(): if event.Type != watch.Added { @@ -245,7 +244,7 @@ var _ = Describe("Pods", func() { Fail("Timeout while waiting for pod creation") } - By("deleting the pod") + By("deleting the pod gracefully") if err := podClient.Delete(pod.Name, nil); err != nil { Failf("Failed to delete pod: %v", err) } @@ -253,11 +252,13 @@ var _ = Describe("Pods", func() { By("verifying pod deletion was observed") deleted := false timeout := false + var lastPod *api.Pod timer := time.After(podStartTimeout) for !deleted && !timeout { select { case event, _ := <-w.ResultChan(): if event.Type == watch.Deleted { + lastPod = event.Object.(*api.Pod) deleted = true } case <-timer: @@ -268,6 +269,9 @@ var _ = Describe("Pods", func() { Fail("Failed to observe pod deletion") } + Expect(lastPod.DeletionTimestamp).ToNot(BeNil()) + Expect(lastPod.Spec.TerminationGracePeriodSeconds).ToNot(BeZero()) + pods, err = podClient.List(labels.SelectorFromSet(labels.Set(map[string]string{"time": value})), fields.Everything()) if err != nil { Fail(fmt.Sprintf("Failed to list pods to verify deletion: %v", err)) @@ -312,7 +316,7 @@ var _ = Describe("Pods", func() { By("submitting the pod to kubernetes") defer func() { By("deleting the pod") - podClient.Delete(pod.Name, nil) + podClient.Delete(pod.Name, api.NewDeleteOptions(0)) }() pod, err := podClient.Create(pod) if err != nil { @@ -376,7 +380,7 @@ var _ = Describe("Pods", func() { }, }, } - defer framework.Client.Pods(framework.Namespace.Name).Delete(serverPod.Name, nil) + defer framework.Client.Pods(framework.Namespace.Name).Delete(serverPod.Name, api.NewDeleteOptions(0)) _, err := framework.Client.Pods(framework.Namespace.Name).Create(serverPod) if err != nil { Failf("Failed to create serverPod: %v", err) @@ -600,7 +604,7 @@ var _ = Describe("Pods", func() { // We call defer here in case there is a problem with // the test so we can ensure that we clean up after // ourselves - podClient.Delete(pod.Name) + podClient.Delete(pod.Name, api.NewDeleteOptions(0)) }() By("waiting for the pod to start running") @@ -673,7 
+677,7 @@ var _ = Describe("Pods", func() { // We call defer here in case there is a problem with // the test so we can ensure that we clean up after // ourselves - podClient.Delete(pod.Name) + podClient.Delete(pod.Name, api.NewDeleteOptions(0)) }() By("waiting for the pod to start running") diff --git a/test/e2e/util.go b/test/e2e/util.go index 7209e271df..3a11849779 100644 --- a/test/e2e/util.go +++ b/test/e2e/util.go @@ -831,20 +831,24 @@ func expectNoError(err error, explain ...interface{}) { ExpectWithOffset(1, err).NotTo(HaveOccurred(), explain...) } -// Stops everything from filePath from namespace ns and checks if everything maching selectors from the given namespace is correctly stopped. +// Stops everything from filePath from namespace ns and checks if everything matching selectors from the given namespace is correctly stopped. func cleanup(filePath string, ns string, selectors ...string) { - By("using stop to clean up resources") + By("using delete to clean up resources") var nsArg string if ns != "" { nsArg = fmt.Sprintf("--namespace=%s", ns) } - runKubectl("stop", "-f", filePath, nsArg) + runKubectl("stop", "--grace-period=0", "-f", filePath, nsArg) for _, selector := range selectors { - resources := runKubectl("get", "pods,rc,svc", "-l", selector, "--no-headers", nsArg) + resources := runKubectl("get", "rc,svc", "-l", selector, "--no-headers", nsArg) if resources != "" { Failf("Resources left running after stop:\n%s", resources) } + pods := runKubectl("get", "pods", "-l", selector, nsArg, "-t", "{{ range .items }}{{ if not .metadata.deletionTimestamp }}{{ .metadata.name }}{{ \"\\n\" }}{{ end }}{{ end }}") + if pods != "" { + Failf("Pods left unterminated after stop:\n%s", pods) + } } } diff --git a/test/images/network-tester/slow-pod.json b/test/images/network-tester/slow-pod.json new file mode 100644 index 0000000000..8fce984aed --- /dev/null +++ b/test/images/network-tester/slow-pod.json @@ -0,0 +1,28 @@ +{ + "kind": "Pod", + "apiVersion": "v1", + "metadata": { + "name": "slow-pod", + "labels": { + "name": "nettest" + } + }, + "spec": { + "containers": [ + { + "name": "webserver", + "image": "gcr.io/google_containers/nettest:1.5", + "args": [ + "-service=nettest", + "-delay-shutdown=10" + ], + "ports": [ + { + "containerPort": 8080, + "protocol": "TCP" + } + ] + } + ] + } +} diff --git a/test/images/network-tester/slow-rc.json b/test/images/network-tester/slow-rc.json new file mode 100644 index 0000000000..d70a145555 --- /dev/null +++ b/test/images/network-tester/slow-rc.json @@ -0,0 +1,42 @@ +{ + "kind": "ReplicationController", + "apiVersion": "v1", + "metadata": { + "name": "slow-rc", + "labels": { + "name": "nettest" + } + }, + "spec": { + "replicas": 8, + "selector": { + "name": "nettest" + }, + "template": { + "metadata": { + "labels": { + "name": "nettest" + } + }, + "spec": { + "terminationGracePeriodSeconds": 5, + "containers": [ + { + "name": "webserver", + "image": "gcr.io/google_containers/nettest:1.5", + "args": [ + "-service=nettest", + "-delay-shutdown=10" + ], + "ports": [ + { + "containerPort": 8080, + "protocol": "TCP" + } + ] + } + ] + } + } + } +} diff --git a/test/images/network-tester/webserver.go b/test/images/network-tester/webserver.go index 8f6e6a999f..9f6e906312 100644 --- a/test/images/network-tester/webserver.go +++ b/test/images/network-tester/webserver.go @@ -39,7 +39,9 @@ import ( "math/rand" "net/http" "os" + "os/signal" "sync" + "syscall" "time" "k8s.io/kubernetes/pkg/client" @@ -47,10 +49,11 @@ import ( ) var ( - port = flag.Int("port", 
8080, "Port number to serve at.") - peerCount = flag.Int("peers", 8, "Must find at least this many peers for the test to pass.") - service = flag.String("service", "nettest", "Service to find other network test pods in.") - namespace = flag.String("namespace", "default", "Namespace of this pod. TODO: kubernetes should make this discoverable.") + port = flag.Int("port", 8080, "Port number to serve at.") + peerCount = flag.Int("peers", 8, "Must find at least this many peers for the test to pass.") + service = flag.String("service", "nettest", "Service to find other network test pods in.") + namespace = flag.String("namespace", "default", "Namespace of this pod. TODO: kubernetes should make this discoverable.") + delayShutdown = flag.Int("delay-shutdown", 0, "Number of seconds to delay shutdown when receiving SIGTERM.") ) // State tracks the internal state of our little http server. @@ -178,6 +181,17 @@ func main() { log.Fatalf("Error getting hostname: %v", err) } + if *delayShutdown > 0 { + termCh := make(chan os.Signal) + signal.Notify(termCh, syscall.SIGTERM) + go func() { + <-termCh + log.Printf("Sleeping %d seconds before exit ...", *delayShutdown) + time.Sleep(time.Duration(*delayShutdown) * time.Second) + os.Exit(0) + }() + } + state := State{ Hostname: hostname, StillContactingPeers: true, diff --git a/test/integration/auth_test.go b/test/integration/auth_test.go index 9662724fb6..3c1d931087 100644 --- a/test/integration/auth_test.go +++ b/test/integration/auth_test.go @@ -232,7 +232,7 @@ var deleteNow string = ` { "kind": "DeleteOptions", "apiVersion": "` + testapi.Version() + `", - "gracePeriodSeconds": null%s + "gracePeriodSeconds": 0%s } ` diff --git a/test/integration/etcd_tools_test.go b/test/integration/etcd_tools_test.go index da278a5be6..60b9408edb 100644 --- a/test/integration/etcd_tools_test.go +++ b/test/integration/etcd_tools_test.go @@ -80,6 +80,59 @@ func TestGet(t *testing.T) { }) } +func TestWriteTTL(t *testing.T) { + client := framework.NewEtcdClient() + etcdStorage := etcd.NewEtcdStorage(client, testapi.Codec(), "") + framework.WithEtcdKey(func(key string) { + testObject := api.ServiceAccount{ObjectMeta: api.ObjectMeta{Name: "foo"}} + if err := etcdStorage.Set(key, &testObject, nil, 0); err != nil { + t.Fatalf("unexpected error: %v", err) + } + result := &api.ServiceAccount{} + err := etcdStorage.GuaranteedUpdate(key, result, false, func(obj runtime.Object, res storage.ResponseMeta) (runtime.Object, *uint64, error) { + if in, ok := obj.(*api.ServiceAccount); !ok || in.Name != "foo" { + t.Fatalf("unexpected existing object: %v", obj) + } + if res.TTL != 0 { + t.Fatalf("unexpected TTL: %#v", res) + } + ttl := uint64(10) + out := &api.ServiceAccount{ObjectMeta: api.ObjectMeta{Name: "out"}} + return out, &ttl, nil + }) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if result.Name != "out" { + t.Errorf("unexpected response: %#v", result) + } + if res, err := client.Get(key, false, false); err != nil || res == nil || res.Node.TTL != 10 { + t.Fatalf("unexpected get: %v %#v", err, res) + } + + result = &api.ServiceAccount{} + err = etcdStorage.GuaranteedUpdate(key, result, false, func(obj runtime.Object, res storage.ResponseMeta) (runtime.Object, *uint64, error) { + if in, ok := obj.(*api.ServiceAccount); !ok || in.Name != "out" { + t.Fatalf("unexpected existing object: %v", obj) + } + if res.TTL <= 1 { + t.Fatalf("unexpected TTL: %#v", res) + } + out := &api.ServiceAccount{ObjectMeta: api.ObjectMeta{Name: "out2"}} + return out, nil, nil + }) + if err != 
nil { + t.Fatalf("unexpected error: %v", err) + } + if result.Name != "out2" { + t.Errorf("unexpected response: %#v", result) + } + if res, err := client.Get(key, false, false); err != nil || res == nil || res.Node.TTL <= 1 { + t.Fatalf("unexpected get: %v %#v", err, res) + } + }) +} + func TestWatch(t *testing.T) { client := framework.NewEtcdClient() etcdStorage := etcd.NewEtcdStorage(client, testapi.Codec(), etcdtest.PathPrefix()) diff --git a/test/integration/scheduler_test.go b/test/integration/scheduler_test.go index 796dc4a7a1..0761cbcdf0 100644 --- a/test/integration/scheduler_test.go +++ b/test/integration/scheduler_test.go @@ -277,7 +277,7 @@ func DoTestUnschedulableNodes(t *testing.T, restClient *client.Client, nodeStore t.Logf("Test %d: Pod got scheduled on a schedulable node", i) } - err = restClient.Pods(api.NamespaceDefault).Delete(myPod.Name, nil) + err = restClient.Pods(api.NamespaceDefault).Delete(myPod.Name, api.NewDeleteOptions(0)) if err != nil { t.Errorf("Failed to delete pod: %v", err) }
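The test changes above keep returning to the same distinction: deleting a pod without specifying a grace period now goes through the graceful path using the pod's terminationGracePeriodSeconds (defaulted to 30 seconds), so tests that need the old behavior pass api.NewDeleteOptions(0) to delete immediately. Below is a rough, self-contained sketch of the resolution rule; it mirrors the branch structure of CheckGracefulDelete shown earlier using simplified stand-in types, not the real api package or the apiserver's full delete path:

package main

import "fmt"

type deleteOptions struct {
	gracePeriodSeconds *int64
}

type podSpec struct {
	nodeName                      string
	terminationGracePeriodSeconds *int64
}

// newDeleteOptions mirrors api.NewDeleteOptions: wrap an explicit grace period, where 0 means delete now.
func newDeleteOptions(grace int64) *deleteOptions {
	return &deleteOptions{gracePeriodSeconds: &grace}
}

// effectiveGracePeriod mirrors CheckGracefulDelete: an explicit value in the options wins,
// otherwise the pod's spec default applies, and unscheduled pods are deleted immediately.
// The second return value reports whether a graceful deletion happens at all.
func effectiveGracePeriod(opts *deleteOptions, spec podSpec) (int64, bool) {
	if opts == nil {
		return 0, false
	}
	period := int64(0)
	if opts.gracePeriodSeconds != nil {
		period = *opts.gracePeriodSeconds
	} else if spec.terminationGracePeriodSeconds != nil {
		period = *spec.terminationGracePeriodSeconds
	}
	if len(spec.nodeName) == 0 {
		period = 0
	}
	return period, true
}

func main() {
	def := int64(30)
	scheduled := podSpec{nodeName: "node-1", terminationGracePeriodSeconds: &def}

	p, _ := effectiveGracePeriod(&deleteOptions{}, scheduled) // no explicit value: spec default applies
	fmt.Println(p)                                            // 30
	p, _ = effectiveGracePeriod(newDeleteOptions(0), scheduled)
	fmt.Println(p) // 0: delete immediately
	p, _ = effectiveGracePeriod(newDeleteOptions(5), scheduled)
	fmt.Println(p) // 5: caller-supplied grace period
}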