Merge pull request #9165 from smarterclayton/graceful

Enable graceful deletion using reconciliation loops in the Kubelet without TTL
2015-08-18 10:01:40 -07:00 · 2015-08-18 10:01:40 -07:00 · 4f856b595d
parent d78525a83b 266e6226e5
commit 4f856b595d
78 changed files with 1499 additions and 457 deletions
--- a/api/swagger-spec/v1.json
+++ b/api/swagger-spec/v1.json
@ -11017,6 +11017,11 @@
      "type": "string",
      "description": "RFC 3339 date and time at which the object will be deleted; populated by the system when a graceful deletion is requested, read-only; if not set, graceful deletion of the object has not been requested; see http://releases.k8s.io/HEAD/docs/devel/api-conventions.md#metadata"
     },
+     "deletionGracePeriodSeconds": {
+      "type": "integer",
+      "format": "int64",
+      "description": "number of seconds allowed for this object to gracefully terminate before it will be removed from the system; only set when deletionTimestamp is also set, read-only; may only be shortened"
+     },
     "labels": {
      "type": "any",
      "description": "map of string keys and values that can be used to organize and categorize objects; may match selectors of replication controllers and services; see http://releases.k8s.io/HEAD/docs/user-guide/labels.md"
@ -12327,7 +12332,7 @@
     "terminationGracePeriodSeconds": {
      "type": "integer",
      "format": "int64",
-      "description": "optional duration in seconds the pod needs to terminate gracefully; may be decreased in delete request; value must be non-negative integer; the value zero indicates delete immediately; if this value is not set, the default grace period will be used instead; the grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal; set this value longer than the expected cleanup time for your process"
+      "description": "optional duration in seconds the pod needs to terminate gracefully; may be decreased in delete request; value must be non-negative integer; the value zero indicates delete immediately; if this value is not set, the default grace period will be used instead; the grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal; set this value longer than the expected cleanup time for your process; defaults to 30 seconds"
     },
     "activeDeadlineSeconds": {
      "type": "integer",
--- a/cluster/saltbase/salt/fluentd-es/fluentd-es.yaml
+++ b/cluster/saltbase/salt/fluentd-es/fluentd-es.yaml
@ -18,6 +18,7 @@ spec:
      mountPath: /varlog
    - name: containers
      mountPath: /var/lib/docker/containers
+  terminationGracePeriodSeconds: 30
  volumes:
  - name: varlog
    hostPath:
--- a/cluster/saltbase/salt/fluentd-gcp/fluentd-gcp.yaml
+++ b/cluster/saltbase/salt/fluentd-gcp/fluentd-gcp.yaml
@ -19,6 +19,7 @@ spec:
      mountPath: /varlog
    - name: containers
      mountPath: /var/lib/docker/containers
+  terminationGracePeriodSeconds: 30
  volumes:
  - name: varlog
    hostPath:
--- a/cmd/integration/integration.go
+++ b/cmd/integration/integration.go
@ -196,7 +196,7 @@ func startComponents(firstManifestURL, secondManifestURL, apiVersion string) (st
 	// TODO: Write an integration test for the replication controllers watch.
 	go controllerManager.Run(3, util.NeverStop)

-	nodeController := nodecontroller.NewNodeController(nil, cl, 5*time.Minute, nodecontroller.NewPodEvictor(util.NewFakeRateLimiter()),
+	nodeController := nodecontroller.NewNodeController(nil, cl, 5*time.Minute, util.NewFakeRateLimiter(),
 		40*time.Second, 60*time.Second, 5*time.Second, nil, false)
 	nodeController.Run(5 * time.Second)
 	cadvisorInterface := new(cadvisor.Fake)
@ -872,18 +872,25 @@ func runSchedulerNoPhantomPodsTest(client *client.Client) {

 	// Delete a pod to free up room.
 	glog.Infof("Deleting pod %v", bar.Name)
-	err = client.Pods(api.NamespaceDefault).Delete(bar.Name, nil)
+	err = client.Pods(api.NamespaceDefault).Delete(bar.Name, api.NewDeleteOptions(0))
 	if err != nil {
 		glog.Fatalf("FAILED: couldn't delete pod %q: %v", bar.Name, err)
 	}

+	//TODO: reenable once fake_docker_client handles deletion cleanly
+	//time.Sleep(2 * time.Second)
+
 	pod.ObjectMeta.Name = "phantom.baz"
 	baz, err := client.Pods(api.NamespaceDefault).Create(pod)
 	if err != nil {
 		glog.Fatalf("Failed to create pod: %v, %v", pod, err)
 	}
 	if err := wait.Poll(time.Second, time.Second*60, podRunning(client, baz.Namespace, baz.Name)); err != nil {
-		glog.Fatalf("FAILED: (Scheduler probably didn't process deletion of 'phantom.bar') Pod never started running: %v", err)
+		if pod, perr := client.Pods(api.NamespaceDefault).Get("phantom.bar"); perr == nil {
+			glog.Fatalf("FAILED: 'phantom.bar' was never deleted: %#v", pod)
+		} else {
+			glog.Fatalf("FAILED: (Scheduler probably didn't process deletion of 'phantom.bar') Pod never started running: %v", err)
+		}
 	}

 	glog.Info("Scheduler doesn't make phantom pods: test passed.")
--- a/cmd/kube-controller-manager/app/controllermanager.go
+++ b/cmd/kube-controller-manager/app/controllermanager.go
@ -196,7 +196,7 @@ func (s *CMServer) Run(_ []string) error {
 	}

 	nodeController := nodecontroller.NewNodeController(cloud, kubeClient,
-		s.PodEvictionTimeout, nodecontroller.NewPodEvictor(util.NewTokenBucketRateLimiter(s.DeletingPodsQps, s.DeletingPodsBurst)),
+		s.PodEvictionTimeout, util.NewTokenBucketRateLimiter(s.DeletingPodsQps, s.DeletingPodsBurst),
 		s.NodeMonitorGracePeriod, s.NodeStartupGracePeriod, s.NodeMonitorPeriod, &s.ClusterCIDR, s.AllocateNodeCIDRs)
 	nodeController.Run(s.NodeSyncPeriod)

--- a/contrib/mesos/pkg/controllermanager/controllermanager.go
+++ b/contrib/mesos/pkg/controllermanager/controllermanager.go
@ -123,7 +123,7 @@ func (s *CMServer) Run(_ []string) error {
 	}

 	nodeController := nodecontroller.NewNodeController(cloud, kubeClient,
-		s.PodEvictionTimeout, nodecontroller.NewPodEvictor(util.NewTokenBucketRateLimiter(s.DeletingPodsQps, s.DeletingPodsBurst)),
+		s.PodEvictionTimeout, util.NewTokenBucketRateLimiter(s.DeletingPodsQps, s.DeletingPodsBurst),
 		s.NodeMonitorGracePeriod, s.NodeStartupGracePeriod, s.NodeMonitorPeriod, (*net.IPNet)(&s.ClusterCIDR), s.AllocateNodeCIDRs)
 	nodeController.Run(s.NodeSyncPeriod)

--- a/docs/getting-started-guides/logging.md
+++ b/docs/getting-started-guides/logging.md
@ -182,6 +182,7 @@ spec:
      mountPath: /varlog
    - name: containers
      mountPath: /var/lib/docker/containers
+  terminationGracePeriodSeconds: 30
  volumes:
  - name: varlog
    hostPath:
--- a/hack/lib/test.sh
+++ b/hack/lib/test.sh
@ -23,7 +23,7 @@ readonly   red=$(tput setaf 1)
 readonly green=$(tput setaf 2)

 kube::test::clear_all() {
-  kubectl delete "${kube_flags[@]}" rc,pods --all
+  kubectl delete "${kube_flags[@]}" rc,pods --all --grace-period=0
 }

 kube::test::get_object_assert() {
--- a/hack/test-cmd.sh
+++ b/hack/test-cmd.sh
@ -246,7 +246,7 @@ runTests() {
  # Pre-condition: valid-pod POD is running
  kube::test::get_object_assert pods "{{range.items}}{{$id_field}}:{{end}}" 'valid-pod:'
  # Command
-  kubectl delete pod valid-pod "${kube_flags[@]}"
+  kubectl delete pod valid-pod "${kube_flags[@]}" --grace-period=0
  # Post-condition: no POD is running
  kube::test::get_object_assert pods "{{range.items}}{{$id_field}}:{{end}}" ''

@ -262,7 +262,7 @@ runTests() {
  # Pre-condition: valid-pod POD is running
  kube::test::get_object_assert pods "{{range.items}}{{$id_field}}:{{end}}" 'valid-pod:'
  # Command
-  kubectl delete -f docs/admin/limitrange/valid-pod.yaml "${kube_flags[@]}"
+  kubectl delete -f docs/admin/limitrange/valid-pod.yaml "${kube_flags[@]}" --grace-period=0
  # Post-condition: no POD is running
  kube::test::get_object_assert pods "{{range.items}}{{$id_field}}:{{end}}" ''

@ -278,7 +278,7 @@ runTests() {
  # Pre-condition: valid-pod POD is running
  kube::test::get_object_assert "pods -l'name in (valid-pod)'" '{{range.items}}{{$id_field}}:{{end}}' 'valid-pod:'
  # Command
-  kubectl delete pods -l'name in (valid-pod)' "${kube_flags[@]}"
+  kubectl delete pods -l'name in (valid-pod)' "${kube_flags[@]}" --grace-period=0
  # Post-condition: no POD is running
  kube::test::get_object_assert "pods -l'name in (valid-pod)'" '{{range.items}}{{$id_field}}:{{end}}' ''

@ -310,7 +310,7 @@ runTests() {
  # Pre-condition: valid-pod POD is running
  kube::test::get_object_assert pods "{{range.items}}{{$id_field}}:{{end}}" 'valid-pod:'
  # Command
-  kubectl delete --all pods "${kube_flags[@]}" # --all remove all the pods
+  kubectl delete --all pods "${kube_flags[@]}" --grace-period=0 # --all remove all the pods
  # Post-condition: no POD is running
  kube::test::get_object_assert "pods -l'name in (valid-pod)'" '{{range.items}}{{$id_field}}:{{end}}' ''

@ -327,7 +327,7 @@ runTests() {
  # Pre-condition: valid-pod and redis-proxy PODs are running
  kube::test::get_object_assert pods "{{range.items}}{{$id_field}}:{{end}}" 'redis-proxy:valid-pod:'
  # Command
-  kubectl delete pods valid-pod redis-proxy "${kube_flags[@]}" # delete multiple pods at once
+  kubectl delete pods valid-pod redis-proxy "${kube_flags[@]}" --grace-period=0 # delete multiple pods at once
  # Post-condition: no POD is running
  kube::test::get_object_assert pods "{{range.items}}{{$id_field}}:{{end}}" ''

@ -344,7 +344,7 @@ runTests() {
  # Pre-condition: valid-pod and redis-proxy PODs are running
  kube::test::get_object_assert pods "{{range.items}}{{$id_field}}:{{end}}" 'redis-proxy:valid-pod:'
  # Command
-  kubectl stop pods valid-pod redis-proxy "${kube_flags[@]}" # stop multiple pods at once
+  kubectl stop pods valid-pod redis-proxy "${kube_flags[@]}" --grace-period=0 # stop multiple pods at once
  # Post-condition: no POD is running
  kube::test::get_object_assert pods "{{range.items}}{{$id_field}}:{{end}}" ''

@ -368,7 +368,7 @@ runTests() {
  # Pre-condition: valid-pod POD is running
  kube::test::get_object_assert pods "{{range.items}}{{$id_field}}:{{end}}" 'valid-pod:'
  # Command
-  kubectl delete pods -lnew-name=new-valid-pod "${kube_flags[@]}"
+  kubectl delete pods -lnew-name=new-valid-pod --grace-period=0 "${kube_flags[@]}"
  # Post-condition: no POD is running
  kube::test::get_object_assert pods "{{range.items}}{{$id_field}}:{{end}}" ''

@ -419,7 +419,7 @@ runTests() {
  # Pre-condition: valid-pod POD is running
  kube::test::get_object_assert pods "{{range.items}}{{$id_field}}:{{end}}" 'valid-pod:'
  # Command
-  kubectl delete pods -l'name in (valid-pod-super-sayan)' "${kube_flags[@]}"
+  kubectl delete pods -l'name in (valid-pod-super-sayan)' --grace-period=0 "${kube_flags[@]}"
  # Post-condition: no POD is running
  kube::test::get_object_assert pods "{{range.items}}{{$id_field}}:{{end}}" ''

@ -455,7 +455,7 @@ runTests() {
  # Pre-condition: valid-pod POD is running
  kube::test::get_object_assert 'pods --namespace=other' "{{range.items}}{{$id_field}}:{{end}}" 'valid-pod:'
  # Command
-  kubectl delete "${kube_flags[@]}" pod --namespace=other valid-pod
+  kubectl delete "${kube_flags[@]}" pod --namespace=other valid-pod --grace-period=0
  # Post-condition: no POD is running
  kube::test::get_object_assert 'pods --namespace=other' "{{range.items}}{{$id_field}}:{{end}}" ''

--- a/hack/verify-flags/known-flags.txt
+++ b/hack/verify-flags/known-flags.txt
@ -51,6 +51,7 @@ current-release-pr
 current-replicas
 default-container-cpu-limit
 default-container-mem-limit
+delay-shutdown
 deleting-pods-burst
 deleting-pods-qps
 deployment-label-key
--- a/pkg/api/deep_copy_generated.go
+++ b/pkg/api/deep_copy_generated.go
@ -1011,6 +1011,12 @@ func deepCopy_api_ObjectMeta(in ObjectMeta, out *ObjectMeta, c *conversion.Clone
 	} else {
 		out.DeletionTimestamp = nil
 	}
+	if in.DeletionGracePeriodSeconds != nil {
+		out.DeletionGracePeriodSeconds = new(int64)
+		*out.DeletionGracePeriodSeconds = *in.DeletionGracePeriodSeconds
+	} else {
+		out.DeletionGracePeriodSeconds = nil
+	}
 	if in.Labels != nil {
 		out.Labels = make(map[string]string)
 		for key, val := range in.Labels {
--- a/pkg/api/rest/create.go
+++ b/pkg/api/rest/create.go
@ -59,6 +59,8 @@ func BeforeCreate(strategy RESTCreateStrategy, ctx api.Context, obj runtime.Obje
 	} else {
 		objectMeta.Namespace = api.NamespaceNone
 	}
+	objectMeta.DeletionTimestamp = nil
+	objectMeta.DeletionGracePeriodSeconds = nil
 	strategy.PrepareForCreate(obj)
 	api.FillObjectMetaSystemFields(ctx, objectMeta)
 	api.GenerateName(strategy, objectMeta)
--- a/pkg/api/rest/delete.go
+++ b/pkg/api/rest/delete.go
@ -17,8 +17,11 @@ limitations under the License.
 package rest

 import (
+	"time"
+
 	"k8s.io/kubernetes/pkg/api"
 	"k8s.io/kubernetes/pkg/runtime"
+	"k8s.io/kubernetes/pkg/util"
 )

 // RESTDeleteStrategy defines deletion behavior on an object that follows Kubernetes
@ -40,12 +43,41 @@ func BeforeDelete(strategy RESTDeleteStrategy, ctx api.Context, obj runtime.Obje
 	if strategy == nil {
 		return false, false, nil
 	}
-	_, _, kerr := objectMetaAndKind(strategy, obj)
+	objectMeta, _, kerr := objectMetaAndKind(strategy, obj)
 	if kerr != nil {
 		return false, false, kerr
 	}
+
+	// if the object is already being deleted
+	if objectMeta.DeletionTimestamp != nil {
+		// if we are already being deleted, we may only shorten the deletion grace period
+		// this means the object was gracefully deleted previously but deletionGracePeriodSeconds was not set,
+		// so we force deletion immediately
+		if objectMeta.DeletionGracePeriodSeconds == nil {
+			return false, false, nil
+		}
+		// only a shorter grace period may be provided by a user
+		if options.GracePeriodSeconds != nil {
+			period := int64(*options.GracePeriodSeconds)
+			if period > *objectMeta.DeletionGracePeriodSeconds {
+				return false, true, nil
+			}
+			now := util.NewTime(util.Now().Add(time.Second * time.Duration(*options.GracePeriodSeconds)))
+			objectMeta.DeletionTimestamp = &now
+			objectMeta.DeletionGracePeriodSeconds = &period
+			options.GracePeriodSeconds = &period
+			return true, false, nil
+		}
+		// graceful deletion is pending, do nothing
+		options.GracePeriodSeconds = objectMeta.DeletionGracePeriodSeconds
+		return false, true, nil
+	}
+
 	if !strategy.CheckGracefulDelete(obj, options) {
 		return false, false, nil
 	}
+	now := util.NewTime(util.Now().Add(time.Second * time.Duration(*options.GracePeriodSeconds)))
+	objectMeta.DeletionTimestamp = &now
+	objectMeta.DeletionGracePeriodSeconds = options.GracePeriodSeconds
 	return true, false, nil
 }
--- a/pkg/api/rest/resttest/resttest.go
+++ b/pkg/api/rest/resttest/resttest.go
@ -114,17 +114,20 @@ func (t *Tester) TestUpdate(valid runtime.Object, existing, older runtime.Object

 // Test deleting an object.
 func (t *Tester) TestDelete(createFn func() runtime.Object, wasGracefulFn func() bool, invalid ...runtime.Object) {
-	t.testDeleteNonExist(createFn)
-	t.testDeleteNoGraceful(createFn, wasGracefulFn)
-	t.testDeleteInvokesValidation(invalid...)
+	t.TestDeleteNonExist(createFn)
+	t.TestDeleteNoGraceful(createFn, wasGracefulFn)
+	t.TestDeleteInvokesValidation(invalid...)
 	// TODO: Test delete namespace mismatch rejection
 	// once #5684 is fixed.
 }

 // Test graceful deletion.
 func (t *Tester) TestDeleteGraceful(createFn func() runtime.Object, expectedGrace int64, wasGracefulFn func() bool) {
-	t.testDeleteGracefulHasDefault(createFn(), expectedGrace, wasGracefulFn)
-	t.testDeleteGracefulUsesZeroOnNil(createFn(), 0)
+	t.TestDeleteGracefulHasDefault(createFn(), expectedGrace, wasGracefulFn)
+	t.TestDeleteGracefulWithValue(createFn(), expectedGrace, wasGracefulFn)
+	t.TestDeleteGracefulUsesZeroOnNil(createFn(), 0)
+	t.TestDeleteGracefulExtend(createFn(), expectedGrace, wasGracefulFn)
+	t.TestDeleteGracefulImmediate(createFn(), expectedGrace, wasGracefulFn)
 }

 // Test getting object.
@ -298,7 +301,7 @@ func (t *Tester) testUpdateFailsOnVersion(older runtime.Object) {
 // =============================================================================
 // Deletion tests.

-func (t *Tester) testDeleteInvokesValidation(invalid ...runtime.Object) {
+func (t *Tester) TestDeleteInvokesValidation(invalid ...runtime.Object) {
 	for i, obj := range invalid {
 		objectMeta := t.getObjectMetaOrFail(obj)
 		ctx := t.TestContext()
@ -309,7 +312,7 @@ func (t *Tester) testDeleteInvokesValidation(invalid ...runtime.Object) {
 	}
 }

-func (t *Tester) testDeleteNonExist(createFn func() runtime.Object) {
+func (t *Tester) TestDeleteNonExist(createFn func() runtime.Object) {
 	existing := createFn()
 	objectMeta := t.getObjectMetaOrFail(existing)
 	context := t.TestContext()
@ -322,7 +325,10 @@ func (t *Tester) testDeleteNonExist(createFn func() runtime.Object) {
 	})
 }

-func (t *Tester) testDeleteNoGraceful(createFn func() runtime.Object, wasGracefulFn func() bool) {
+// =============================================================================
+// Graceful Deletion tests.
+
+func (t *Tester) TestDeleteNoGraceful(createFn func() runtime.Object, wasGracefulFn func() bool) {
 	existing := createFn()
 	objectMeta := t.getObjectMetaOrFail(existing)
 	ctx := api.WithNamespace(t.TestContext(), objectMeta.Namespace)
@ -338,25 +344,142 @@ func (t *Tester) testDeleteNoGraceful(createFn func() runtime.Object, wasGracefu
 	}
 }

-// =============================================================================
-// Graceful Deletion tests.
-
-func (t *Tester) testDeleteGracefulHasDefault(existing runtime.Object, expectedGrace int64, wasGracefulFn func() bool) {
+func (t *Tester) TestDeleteGracefulHasDefault(existing runtime.Object, expectedGrace int64, wasGracefulFn func() bool) {
 	objectMeta := t.getObjectMetaOrFail(existing)
 	ctx := api.WithNamespace(t.TestContext(), objectMeta.Namespace)
 	_, err := t.storage.(rest.GracefulDeleter).Delete(ctx, objectMeta.Name, &api.DeleteOptions{})
 	if err != nil {
 		t.Errorf("unexpected error: %v", err)
 	}
-	if _, err := t.storage.(rest.Getter).Get(ctx, objectMeta.Name); err != nil {
+	if !wasGracefulFn() {
+		t.Errorf("did not gracefully delete resource")
+		return
+	}
+	object, err := t.storage.(rest.Getter).Get(ctx, objectMeta.Name)
+	if err != nil {
 		t.Errorf("unexpected error, object should exist: %v", err)
+		return
+	}
+	objectMeta, err = api.ObjectMetaFor(object)
+	if err != nil {
+		t.Fatalf("object does not have ObjectMeta: %v\n%#v", err, object)
+	}
+	if objectMeta.DeletionTimestamp == nil {
+		t.Errorf("did not set deletion timestamp")
+	}
+	if objectMeta.DeletionGracePeriodSeconds == nil {
+		t.Fatalf("did not set deletion grace period seconds")
+	}
+	if *objectMeta.DeletionGracePeriodSeconds != expectedGrace {
+		t.Errorf("actual grace period does not match expected: %d", *objectMeta.DeletionGracePeriodSeconds)
+	}
+}
+
+func (t *Tester) TestDeleteGracefulWithValue(existing runtime.Object, expectedGrace int64, wasGracefulFn func() bool) {
+	objectMeta, err := api.ObjectMetaFor(existing)
+	if err != nil {
+		t.Fatalf("object does not have ObjectMeta: %v\n%#v", err, existing)
+	}
+
+	ctx := api.WithNamespace(t.TestContext(), objectMeta.Namespace)
+	_, err = t.storage.(rest.GracefulDeleter).Delete(ctx, objectMeta.Name, api.NewDeleteOptions(expectedGrace+2))
+	if err != nil {
+		t.Errorf("unexpected error: %v", err)
 	}
 	if !wasGracefulFn() {
 		t.Errorf("did not gracefully delete resource")
 	}
+	object, err := t.storage.(rest.Getter).Get(ctx, objectMeta.Name)
+	if err != nil {
+		t.Errorf("unexpected error, object should exist: %v", err)
+	}
+	objectMeta, err = api.ObjectMetaFor(object)
+	if err != nil {
+		t.Fatalf("object does not have ObjectMeta: %v\n%#v", err, object)
+	}
+	if objectMeta.DeletionTimestamp == nil {
+		t.Errorf("did not set deletion timestamp")
+	}
+	if objectMeta.DeletionGracePeriodSeconds == nil {
+		t.Fatalf("did not set deletion grace period seconds")
+	}
+	if *objectMeta.DeletionGracePeriodSeconds != expectedGrace+2 {
+		t.Errorf("actual grace period does not match expected: %d", *objectMeta.DeletionGracePeriodSeconds)
+	}
 }

-func (t *Tester) testDeleteGracefulUsesZeroOnNil(existing runtime.Object, expectedGrace int64) {
+func (t *Tester) TestDeleteGracefulExtend(existing runtime.Object, expectedGrace int64, wasGracefulFn func() bool) {
+	objectMeta, err := api.ObjectMetaFor(existing)
+	if err != nil {
+		t.Fatalf("object does not have ObjectMeta: %v\n%#v", err, existing)
+	}
+
+	ctx := api.WithNamespace(t.TestContext(), objectMeta.Namespace)
+	_, err = t.storage.(rest.GracefulDeleter).Delete(ctx, objectMeta.Name, api.NewDeleteOptions(expectedGrace))
+	if err != nil {
+		t.Errorf("unexpected error: %v", err)
+	}
+	if !wasGracefulFn() {
+		t.Errorf("did not gracefully delete resource")
+	}
+	// second delete duration is ignored
+	_, err = t.storage.(rest.GracefulDeleter).Delete(ctx, objectMeta.Name, api.NewDeleteOptions(expectedGrace+2))
+	if err != nil {
+		t.Errorf("unexpected error: %v", err)
+	}
+	object, err := t.storage.(rest.Getter).Get(ctx, objectMeta.Name)
+	if err != nil {
+		t.Errorf("unexpected error, object should exist: %v", err)
+	}
+	objectMeta, err = api.ObjectMetaFor(object)
+	if err != nil {
+		t.Fatalf("object does not have ObjectMeta: %v\n%#v", err, object)
+	}
+	if objectMeta.DeletionTimestamp == nil {
+		t.Errorf("did not set deletion timestamp")
+	}
+	if objectMeta.DeletionGracePeriodSeconds == nil {
+		t.Fatalf("did not set deletion grace period seconds")
+	}
+	if *objectMeta.DeletionGracePeriodSeconds != expectedGrace {
+		t.Errorf("actual grace period does not match expected: %d", *objectMeta.DeletionGracePeriodSeconds)
+	}
+}
+
+func (t *Tester) TestDeleteGracefulImmediate(existing runtime.Object, expectedGrace int64, wasGracefulFn func() bool) {
+	objectMeta, err := api.ObjectMetaFor(existing)
+	if err != nil {
+		t.Fatalf("object does not have ObjectMeta: %v\n%#v", err, existing)
+	}
+
+	ctx := api.WithNamespace(t.TestContext(), objectMeta.Namespace)
+	_, err = t.storage.(rest.GracefulDeleter).Delete(ctx, objectMeta.Name, api.NewDeleteOptions(expectedGrace))
+	if err != nil {
+		t.Errorf("unexpected error: %v", err)
+	}
+	if !wasGracefulFn() {
+		t.Errorf("did not gracefully delete resource")
+	}
+	// second delete is immediate, resource is deleted
+	out, err := t.storage.(rest.GracefulDeleter).Delete(ctx, objectMeta.Name, api.NewDeleteOptions(0))
+	if err != nil {
+		t.Errorf("unexpected error: %v", err)
+	}
+	_, err = t.storage.(rest.Getter).Get(ctx, objectMeta.Name)
+	if !errors.IsNotFound(err) {
+		t.Errorf("unexpected error, object should be deleted immediately: %v", err)
+	}
+	objectMeta, err = api.ObjectMetaFor(out)
+	if err != nil {
+		t.Errorf("unexpected error: %v", err)
+		return
+	}
+	if objectMeta.DeletionTimestamp == nil || objectMeta.DeletionGracePeriodSeconds == nil || *objectMeta.DeletionGracePeriodSeconds != 0 {
+		t.Errorf("unexpected deleted meta: %#v", objectMeta)
+	}
+}
+
+func (t *Tester) TestDeleteGracefulUsesZeroOnNil(existing runtime.Object, expectedGrace int64) {
 	objectMeta := t.getObjectMetaOrFail(existing)
 	ctx := api.WithNamespace(t.TestContext(), objectMeta.Namespace)
 	_, err := t.storage.(rest.GracefulDeleter).Delete(ctx, objectMeta.Name, nil)
@ -364,7 +487,7 @@ func (t *Tester) testDeleteGracefulUsesZeroOnNil(existing runtime.Object, expect
 		t.Errorf("unexpected error: %v", err)
 	}
 	if _, err := t.storage.(rest.Getter).Get(ctx, objectMeta.Name); !errors.IsNotFound(err) {
-		t.Errorf("unexpected error, object should exist: %v", err)
+		t.Errorf("unexpected error, object should not exist: %v", err)
 	}
 }

--- a/pkg/api/serialization_test.go
+++ b/pkg/api/serialization_test.go
@ -151,6 +151,7 @@ func TestRoundTripTypes(t *testing.T) {
 }

 func TestEncode_Ptr(t *testing.T) {
+	grace := int64(30)
 	pod := &api.Pod{
 		ObjectMeta: api.ObjectMeta{
 			Labels: map[string]string{"name": "foo"},
@ -158,6 +159,8 @@ func TestEncode_Ptr(t *testing.T) {
 		Spec: api.PodSpec{
 			RestartPolicy: api.RestartPolicyAlways,
 			DNSPolicy:     api.DNSClusterFirst,
+
+			TerminationGracePeriodSeconds: &grace,
 		},
 	}
 	obj := runtime.Object(pod)
--- a/pkg/api/testing/fuzzer.go
+++ b/pkg/api/testing/fuzzer.go
@ -89,6 +89,15 @@ func FuzzerFor(t *testing.T, version string, src rand.Source) *fuzz.Fuzzer {
 			j.LabelSelector, _ = labels.Parse("a=b")
 			j.FieldSelector, _ = fields.ParseSelector("a=b")
 		},
+		func(j *api.PodSpec, c fuzz.Continue) {
+			c.FuzzNoCustom(j)
+			// has a default value
+			ttl := int64(30)
+			if c.RandBool() {
+				ttl = int64(c.Uint32())
+			}
+			j.TerminationGracePeriodSeconds = &ttl
+		},
 		func(j *api.PodPhase, c fuzz.Continue) {
 			statuses := []api.PodPhase{api.PodPending, api.PodRunning, api.PodFailed, api.PodUnknown}
 			*j = statuses[c.Rand.Intn(len(statuses))]
--- a/pkg/api/types.go
+++ b/pkg/api/types.go
@ -143,6 +143,10 @@ type ObjectMeta struct {
 	// will send a hard termination signal to the container.
 	DeletionTimestamp *util.Time `json:"deletionTimestamp,omitempty"`

+	// DeletionGracePeriodSeconds records the graceful deletion value set when graceful deletion
+	// was requested. Represents the most recent grace period, and may only be shortened once set.
+	DeletionGracePeriodSeconds *int64 `json:"deletionGracePeriodSeconds,omitempty"`
+
 	// Labels are key value pairs that may be used to scope and select individual resources.
 	// Label keys are of the form:
 	//     label-key ::= prefixed-name | name
--- a/pkg/api/v1/conversion_generated.go
+++ b/pkg/api/v1/conversion_generated.go
@ -1176,6 +1176,12 @@ func convert_api_ObjectMeta_To_v1_ObjectMeta(in *api.ObjectMeta, out *ObjectMeta
 	} else {
 		out.DeletionTimestamp = nil
 	}
+	if in.DeletionGracePeriodSeconds != nil {
+		out.DeletionGracePeriodSeconds = new(int64)
+		*out.DeletionGracePeriodSeconds = *in.DeletionGracePeriodSeconds
+	} else {
+		out.DeletionGracePeriodSeconds = nil
+	}
 	if in.Labels != nil {
 		out.Labels = make(map[string]string)
 		for key, val := range in.Labels {
@ -3591,6 +3597,12 @@ func convert_v1_ObjectMeta_To_api_ObjectMeta(in *ObjectMeta, out *api.ObjectMeta
 	} else {
 		out.DeletionTimestamp = nil
 	}
+	if in.DeletionGracePeriodSeconds != nil {
+		out.DeletionGracePeriodSeconds = new(int64)
+		*out.DeletionGracePeriodSeconds = *in.DeletionGracePeriodSeconds
+	} else {
+		out.DeletionGracePeriodSeconds = nil
+	}
 	if in.Labels != nil {
 		out.Labels = make(map[string]string)
 		for key, val := range in.Labels {
--- a/pkg/api/v1/deep_copy_generated.go
+++ b/pkg/api/v1/deep_copy_generated.go
@ -1010,6 +1010,12 @@ func deepCopy_v1_ObjectMeta(in ObjectMeta, out *ObjectMeta, c *conversion.Cloner
 	} else {
 		out.DeletionTimestamp = nil
 	}
+	if in.DeletionGracePeriodSeconds != nil {
+		out.DeletionGracePeriodSeconds = new(int64)
+		*out.DeletionGracePeriodSeconds = *in.DeletionGracePeriodSeconds
+	} else {
+		out.DeletionGracePeriodSeconds = nil
+	}
 	if in.Labels != nil {
 		out.Labels = make(map[string]string)
 		for key, val := range in.Labels {
--- a/pkg/api/v1/defaults.go
+++ b/pkg/api/v1/defaults.go
@ -113,6 +113,10 @@ func addDefaultingFuncs() {
 			if obj.HostNetwork {
 				defaultHostNetworkPorts(&obj.Containers)
 			}
+			if obj.TerminationGracePeriodSeconds == nil {
+				period := int64(DefaultTerminationGracePeriodSeconds)
+				obj.TerminationGracePeriodSeconds = &period
+			}
 		},
 		func(obj *Probe) {
 			if obj.TimeoutSeconds == 0 {
--- a/pkg/api/v1/types.go
+++ b/pkg/api/v1/types.go
@ -141,6 +141,10 @@ type ObjectMeta struct {
 	// will send a hard termination signal to the container.
 	DeletionTimestamp *util.Time `json:"deletionTimestamp,omitempty" description:"RFC 3339 date and time at which the object will be deleted; populated by the system when a graceful deletion is requested, read-only; if not set, graceful deletion of the object has not been requested; see http://releases.k8s.io/HEAD/docs/devel/api-conventions.md#metadata"`

+	// DeletionGracePeriodSeconds records the graceful deletion value set when graceful deletion
+	// was requested. Represents the most recent grace period, and may only be shortened once set.
+	DeletionGracePeriodSeconds *int64 `json:"deletionGracePeriodSeconds,omitempty" description:"number of seconds allowed for this object to gracefully terminate before it will be removed from the system; only set when deletionTimestamp is also set, read-only; may only be shortened"`
+
 	// Labels are key value pairs that may be used to scope and select individual resources.
 	// TODO: replace map[string]string with labels.LabelSet type
 	Labels map[string]string `json:"labels,omitempty" description:"map of string keys and values that can be used to organize and categorize objects; may match selectors of replication controllers and services; see http://releases.k8s.io/HEAD/docs/user-guide/labels.md"`
@ -858,6 +862,8 @@ const (
 	// DNSDefault indicates that the pod should use the default (as
 	// determined by kubelet) DNS settings.
 	DNSDefault DNSPolicy = "Default"
+
+	DefaultTerminationGracePeriodSeconds = 30
 )

 // PodSpec is a description of a pod
@ -872,7 +878,7 @@ type PodSpec struct {
 	// The grace period is the duration in seconds after the processes running in the pod are sent
 	// a termination signal and the time when the processes are forcibly halted with a kill signal.
 	// Set this value longer than the expected cleanup time for your process.
-	TerminationGracePeriodSeconds *int64 `json:"terminationGracePeriodSeconds,omitempty" description:"optional duration in seconds the pod needs to terminate gracefully; may be decreased in delete request; value must be non-negative integer; the value zero indicates delete immediately; if this value is not set, the default grace period will be used instead; the grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal; set this value longer than the expected cleanup time for your process"`
+	TerminationGracePeriodSeconds *int64 `json:"terminationGracePeriodSeconds,omitempty" description:"optional duration in seconds the pod needs to terminate gracefully; may be decreased in delete request; value must be non-negative integer; the value zero indicates delete immediately; if this value is not set, the default grace period will be used instead; the grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal; set this value longer than the expected cleanup time for your process; defaults to 30 seconds"`
 	ActiveDeadlineSeconds         *int64 `json:"activeDeadlineSeconds,omitempty" description:"optional duration in seconds the pod may be active on the node relative to StartTime before the system will actively try to mark it failed and kill associated containers; value must be a positive integer"`
 	// Optional: Set DNS policy.  Defaults to "ClusterFirst"
 	DNSPolicy DNSPolicy `json:"dnsPolicy,omitempty" description:"DNS policy for containers within the pod; one of 'ClusterFirst' or 'Default'"`
--- a/pkg/api/validation/validation.go
+++ b/pkg/api/validation/validation.go
@ -265,6 +265,16 @@ func ValidateObjectMetaUpdate(new, old *api.ObjectMeta) errs.ValidationErrorList
 	} else {
 		new.CreationTimestamp = old.CreationTimestamp
 	}
+	// an object can never remove a deletion timestamp or clear/change grace period seconds
+	if !old.DeletionTimestamp.IsZero() {
+		new.DeletionTimestamp = old.DeletionTimestamp
+	}
+	if old.DeletionGracePeriodSeconds != nil && new.DeletionGracePeriodSeconds == nil {
+		new.DeletionGracePeriodSeconds = old.DeletionGracePeriodSeconds
+	}
+	if new.DeletionGracePeriodSeconds != nil && old.DeletionGracePeriodSeconds != nil && *new.DeletionGracePeriodSeconds != *old.DeletionGracePeriodSeconds {
+		allErrs = append(allErrs, errs.NewFieldInvalid("deletionGracePeriodSeconds", new.DeletionGracePeriodSeconds, "field is immutable; may only be changed via deletion"))
+	}

 	// Reject updates that don't specify a resource version
 	if new.ResourceVersion == "" {
--- a/pkg/api/validation/validation_test.go
+++ b/pkg/api/validation/validation_test.go
@ -1317,6 +1317,9 @@ func TestValidatePod(t *testing.T) {
 }

 func TestValidatePodUpdate(t *testing.T) {
+	now := util.Now()
+	grace := int64(30)
+	grace2 := int64(31)
 	tests := []struct {
 		a       api.Pod
 		b       api.Pod
@ -1403,6 +1406,30 @@ func TestValidatePodUpdate(t *testing.T) {
 			false,
 			"more containers",
 		},
+		{
+			api.Pod{
+				ObjectMeta: api.ObjectMeta{Name: "foo", DeletionTimestamp: &now},
+				Spec:       api.PodSpec{Containers: []api.Container{{Image: "foo:V1"}}},
+			},
+			api.Pod{
+				ObjectMeta: api.ObjectMeta{Name: "foo"},
+				Spec:       api.PodSpec{Containers: []api.Container{{Image: "foo:V1"}}},
+			},
+			true,
+			"deletion timestamp filled out",
+		},
+		{
+			api.Pod{
+				ObjectMeta: api.ObjectMeta{Name: "foo", DeletionTimestamp: &now, DeletionGracePeriodSeconds: &grace},
+				Spec:       api.PodSpec{Containers: []api.Container{{Image: "foo:V1"}}},
+			},
+			api.Pod{
+				ObjectMeta: api.ObjectMeta{Name: "foo", DeletionTimestamp: &now, DeletionGracePeriodSeconds: &grace2},
+				Spec:       api.PodSpec{Containers: []api.Container{{Image: "foo:V1"}}},
+			},
+			false,
+			"deletion grace period seconds cleared",
+		},
 		{
 			api.Pod{
 				ObjectMeta: api.ObjectMeta{Name: "foo"},
--- a/pkg/controller/controller_utils.go
+++ b/pkg/controller/controller_utils.go
@ -322,7 +322,8 @@ func FilterActivePods(pods []api.Pod) []*api.Pod {
 	var result []*api.Pod
 	for i := range pods {
 		if api.PodSucceeded != pods[i].Status.Phase &&
-			api.PodFailed != pods[i].Status.Phase {
+			api.PodFailed != pods[i].Status.Phase &&
+			pods[i].DeletionTimestamp == nil {
 			result = append(result, &pods[i])
 		}
 	}
--- a/pkg/controller/endpoint/endpoints_controller.go
+++ b/pkg/controller/endpoint/endpoints_controller.go
@ -310,7 +310,11 @@ func (e *EndpointController) syncService(key string) {
 				continue
 			}
 			if len(pod.Status.PodIP) == 0 {
-				glog.V(4).Infof("Failed to find an IP for pod %s/%s", pod.Namespace, pod.Name)
+				glog.V(5).Infof("Failed to find an IP for pod %s/%s", pod.Namespace, pod.Name)
+				continue
+			}
+			if pod.DeletionTimestamp != nil {
+				glog.V(5).Infof("Pod is being deleted %s/%s", pod.Namespace, pod.Name)
 				continue
 			}

--- a/pkg/controller/node/nodecontroller.go
+++ b/pkg/controller/node/nodecontroller.go
@ -89,8 +89,11 @@ type NodeController struct {
 	nodeStatusMap map[string]nodeStatusData
 	now           func() util.Time
 	// worker that evicts pods from unresponsive nodes.
-	podEvictor         *PodEvictor
+	podEvictor         *RateLimitedTimedQueue
+	terminationEvictor *RateLimitedTimedQueue
 	podEvictionTimeout time.Duration
+	// The maximum duration before a pod evicted from a node can be forcefully terminated.
+	maximumGracePeriod time.Duration
 	recorder           record.EventRecorder
 }

@ -99,7 +102,7 @@ func NewNodeController(
 	cloud cloudprovider.Interface,
 	kubeClient client.Interface,
 	podEvictionTimeout time.Duration,
-	podEvictor *PodEvictor,
+	podEvictionLimiter util.RateLimiter,
 	nodeMonitorGracePeriod time.Duration,
 	nodeStartupGracePeriod time.Duration,
 	nodeMonitorPeriod time.Duration,
@ -123,7 +126,9 @@ func NewNodeController(
 		kubeClient:             kubeClient,
 		recorder:               recorder,
 		podEvictionTimeout:     podEvictionTimeout,
-		podEvictor:             podEvictor,
+		maximumGracePeriod:     5 * time.Minute,
+		podEvictor:             NewRateLimitedTimedQueue(podEvictionLimiter, false),
+		terminationEvictor:     NewRateLimitedTimedQueue(podEvictionLimiter, false),
 		nodeStatusMap:          make(map[string]nodeStatusData),
 		nodeMonitorGracePeriod: nodeMonitorGracePeriod,
 		nodeMonitorPeriod:      nodeMonitorPeriod,
@ -145,38 +150,36 @@ func (nc *NodeController) Run(period time.Duration) {
 	}, nc.nodeMonitorPeriod)

 	go util.Forever(func() {
-		nc.podEvictor.TryEvict(func(nodeName string) { nc.deletePods(nodeName) })
+		nc.podEvictor.Try(func(value TimedValue) (bool, time.Duration) {
+			remaining, err := nc.deletePods(value.Value)
+			if err != nil {
+				util.HandleError(fmt.Errorf("unable to evict node %q: %v", value.Value, err))
+				return false, 0
+			}
+			if remaining {
+				glog.V(2).Infof("Pods terminating on %q", value.Value)
+				nc.terminationEvictor.Add(value.Value)
+			}
+			return true, 0
+		})
 	}, nodeEvictionPeriod)
-}

-// We observed a Node deletion in etcd. Currently we only need to remove Pods that
-// were assigned to it.
-func (nc *NodeController) deleteNode(nodeID string) error {
-	return nc.deletePods(nodeID)
-}
-
-// deletePods will delete all pods from master running on given node.
-func (nc *NodeController) deletePods(nodeID string) error {
-	glog.V(2).Infof("Delete all pods from %v", nodeID)
-	pods, err := nc.kubeClient.Pods(api.NamespaceAll).List(labels.Everything(),
-		fields.OneTermEqualSelector(client.PodHost, nodeID))
-	if err != nil {
-		return err
-	}
-	nc.recordNodeEvent(nodeID, "DeletingAllPods", fmt.Sprintf("Deleting all Pods from Node %v.", nodeID))
-	for _, pod := range pods.Items {
-		// Defensive check, also needed for tests.
-		if pod.Spec.NodeName != nodeID {
-			continue
-		}
-		glog.V(2).Infof("Delete pod %v", pod.Name)
-		nc.recorder.Eventf(&pod, "NodeControllerEviction", "Deleting Pod %s from Node %s", pod.Name, nodeID)
-		if err := nc.kubeClient.Pods(pod.Namespace).Delete(pod.Name, nil); err != nil {
-			glog.Errorf("Error deleting pod %v: %v", pod.Name, err)
-		}
-	}
-
-	return nil
+	// TODO: replace with a controller that ensures pods that are terminating complete
+	// in a particular time period
+	go util.Forever(func() {
+		nc.terminationEvictor.Try(func(value TimedValue) (bool, time.Duration) {
+			remaining, err := nc.terminatePods(value.Value, value.Added)
+			if err != nil {
+				util.HandleError(fmt.Errorf("unable to terminate pods on node %q: %v", value.Value, err))
+				return false, 0
+			}
+			if remaining != 0 {
+				glog.V(2).Infof("Pods still terminating on %q, estimated completion %s", value.Value, remaining)
+				return false, remaining
+			}
+			return true, 0
+		})
+	}, nodeEvictionPeriod)
 }

 // Generates num pod CIDRs that could be assigned to nodes.
@ -271,18 +274,18 @@ func (nc *NodeController) monitorNodeStatus() error {
 			// Check eviction timeout against decisionTimestamp
 			if lastReadyCondition.Status == api.ConditionFalse &&
 				decisionTimestamp.After(nc.nodeStatusMap[node.Name].readyTransitionTimestamp.Add(nc.podEvictionTimeout)) {
-				if nc.podEvictor.AddNodeToEvict(node.Name) {
+				if nc.podEvictor.Add(node.Name) {
 					glog.Infof("Adding pods to evict: %v is later than %v + %v", decisionTimestamp, nc.nodeStatusMap[node.Name].readyTransitionTimestamp, nc.podEvictionTimeout)
 				}
 			}
 			if lastReadyCondition.Status == api.ConditionUnknown &&
 				decisionTimestamp.After(nc.nodeStatusMap[node.Name].probeTimestamp.Add(nc.podEvictionTimeout-gracePeriod)) {
-				if nc.podEvictor.AddNodeToEvict(node.Name) {
+				if nc.podEvictor.Add(node.Name) {
 					glog.Infof("Adding pods to evict2: %v is later than %v + %v", decisionTimestamp, nc.nodeStatusMap[node.Name].readyTransitionTimestamp, nc.podEvictionTimeout-gracePeriod)
 				}
 			}
 			if lastReadyCondition.Status == api.ConditionTrue {
-				if nc.podEvictor.RemoveNodeToEvict(node.Name) {
+				if nc.podEvictor.Remove(node.Name) {
 					glog.Infof("Pods on %v won't be evicted", node.Name)
 				}
 			}
@ -302,8 +305,8 @@ func (nc *NodeController) monitorNodeStatus() error {
 				}
 				if _, err := instances.ExternalID(node.Name); err != nil && err == cloudprovider.InstanceNotFound {
 					glog.Infof("Deleting node (no longer present in cloud provider): %s", node.Name)
-					nc.recordNodeEvent(node.Name, "DeleteingNode", fmt.Sprintf("Deleting Node %v because it's not present according to cloud provider", node.Name))
-					if err := nc.deletePods(node.Name); err != nil {
+					nc.recordNodeEvent(node.Name, "DeletingNode", fmt.Sprintf("Deleting Node %v because it's not present according to cloud provider", node.Name))
+					if _, err := nc.deletePods(node.Name); err != nil {
 						glog.Errorf("Unable to delete pods from node %s: %v", node.Name, err)
 						continue
 					}
@ -311,6 +314,7 @@ func (nc *NodeController) monitorNodeStatus() error {
 						glog.Errorf("Unable to delete node %s: %v", node.Name, err)
 						continue
 					}
+					nc.podEvictor.Add(node.Name)
 				}
 			}
 		}
@ -502,3 +506,88 @@ func (nc *NodeController) tryUpdateNodeStatus(node *api.Node) (time.Duration, ap

 	return gracePeriod, lastReadyCondition, readyCondition, err
 }
+
+// We observed a Node deletion in etcd. Currently we only need to remove Pods that
+// were assigned to it.
+func (nc *NodeController) deleteNode(nodeID string) error {
+	nc.podEvictor.Add(nodeID)
+	return nil
+}
+
+// deletePods will delete all pods from master running on given node, and return true
+// if any pods were deleted.
+func (nc *NodeController) deletePods(nodeID string) (bool, error) {
+	remaining := false
+	glog.V(2).Infof("Delete all pods from %s", nodeID)
+	pods, err := nc.kubeClient.Pods(api.NamespaceAll).List(labels.Everything(),
+		fields.OneTermEqualSelector(client.PodHost, nodeID))
+	if err != nil {
+		return remaining, err
+	}
+
+	nc.recordNodeEvent(nodeID, "DeletingAllPods", fmt.Sprintf("Deleting all Pods from Node %v.", nodeID))
+
+	for _, pod := range pods.Items {
+		// Defensive check, also needed for tests.
+		if pod.Spec.NodeName != nodeID {
+			continue
+		}
+		// if the pod has already been deleted, ignore it
+		if pod.DeletionGracePeriodSeconds != nil {
+			continue
+		}
+
+		glog.V(2).Infof("Delete pod %v", pod.Name)
+		nc.recorder.Eventf(&pod, "NodeControllerEviction", "Deleting Pod %s from Node %s", pod.Name, nodeID)
+		if err := nc.kubeClient.Pods(pod.Namespace).Delete(pod.Name, nil); err != nil {
+			return false, err
+		}
+		remaining = true
+	}
+	return remaining, nil
+}
+
+// terminatePods will ensure all pods on the given node that are in terminating state are eventually
+// cleaned up
+func (nc *NodeController) terminatePods(nodeID string, since time.Time) (time.Duration, error) {
+	remaining := time.Duration(0)
+	glog.V(2).Infof("Terminating all pods on %s", nodeID)
+	pods, err := nc.kubeClient.Pods(api.NamespaceAll).List(labels.Everything(),
+		fields.OneTermEqualSelector(client.PodHost, nodeID))
+	if err != nil {
+		return remaining, err
+	}
+
+	nc.recordNodeEvent(nodeID, "TerminatingAllPods", fmt.Sprintf("Terminating all Pods on Node %s.", nodeID))
+
+	now := time.Now()
+	elapsed := now.Sub(since)
+	for _, pod := range pods.Items {
+		// Defensive check, also needed for tests.
+		if pod.Spec.NodeName != nodeID {
+			continue
+		}
+		// only clean terminated pods
+		if pod.DeletionGracePeriodSeconds == nil {
+			continue
+		}
+
+		grace := time.Duration(*pod.DeletionGracePeriodSeconds) * time.Second
+		if grace > nc.maximumGracePeriod {
+			grace = nc.maximumGracePeriod
+		}
+		next := grace - elapsed
+		if next < 0 {
+			glog.V(2).Infof("Removing pod %v after %s grace period", pod.Name, grace)
+			nc.recordNodeEvent(nodeID, "TerminatingEvictedPod", fmt.Sprintf("Pod %s has exceeded the grace period for deletion after being evicted from Node %q and is being force killed", pod.Name, nodeID))
+			if err := nc.kubeClient.Pods(pod.Namespace).Delete(pod.Name, api.NewDeleteOptions(0)); err != nil {
+				glog.Errorf("Error completing deletion of pod %s: %v", pod.Name, err)
+				next = 1
+			}
+		}
+		if remaining < next {
+			remaining = next
+		}
+	}
+	return remaining, nil
+}
--- a/pkg/controller/node/nodecontroller_test.go
+++ b/pkg/controller/node/nodecontroller_test.go
@ -324,9 +324,8 @@ func TestMonitorNodeStatusEvictPods(t *testing.T) {
 	}

 	for _, item := range table {
-		podEvictor := NewPodEvictor(util.NewFakeRateLimiter())
 		nodeController := NewNodeController(nil, item.fakeNodeHandler,
-			evictionTimeout, podEvictor, testNodeMonitorGracePeriod,
+			evictionTimeout, util.NewFakeRateLimiter(), testNodeMonitorGracePeriod,
 			testNodeStartupGracePeriod, testNodeMonitorPeriod, nil, false)
 		nodeController.now = func() util.Time { return fakeNow }
 		if err := nodeController.monitorNodeStatus(); err != nil {
@ -340,7 +339,17 @@ func TestMonitorNodeStatusEvictPods(t *testing.T) {
 			t.Errorf("unexpected error: %v", err)
 		}

-		podEvictor.TryEvict(func(nodeName string) { nodeController.deletePods(nodeName) })
+		nodeController.podEvictor.Try(func(value TimedValue) (bool, time.Duration) {
+			remaining, _ := nodeController.deletePods(value.Value)
+			if remaining {
+				nodeController.terminationEvictor.Add(value.Value)
+			}
+			return true, 0
+		})
+		nodeController.podEvictor.Try(func(value TimedValue) (bool, time.Duration) {
+			nodeController.terminatePods(value.Value, value.Added)
+			return true, 0
+		})
 		podEvicted := false
 		for _, action := range item.fakeNodeHandler.Actions() {
 			if action.GetVerb() == "delete" && action.GetResource() == "pods" {
@ -531,7 +540,7 @@ func TestMonitorNodeStatusUpdateStatus(t *testing.T) {
 	}

 	for _, item := range table {
-		nodeController := NewNodeController(nil, item.fakeNodeHandler, 5*time.Minute, NewPodEvictor(util.NewFakeRateLimiter()),
+		nodeController := NewNodeController(nil, item.fakeNodeHandler, 5*time.Minute, util.NewFakeRateLimiter(),
 			testNodeMonitorGracePeriod, testNodeStartupGracePeriod, testNodeMonitorPeriod, nil, false)
 		nodeController.now = func() util.Time { return fakeNow }
 		if err := nodeController.monitorNodeStatus(); err != nil {
@ -610,7 +619,7 @@ func TestNodeDeletion(t *testing.T) {
 		Fake: testclient.NewSimpleFake(&api.PodList{Items: []api.Pod{*newPod("pod0", "node0"), *newPod("pod1", "node1")}}),
 	}

-	nodeController := NewNodeController(nil, fakeNodeHandler, 5*time.Minute, NewPodEvictor(util.NewFakeRateLimiter()),
+	nodeController := NewNodeController(nil, fakeNodeHandler, 5*time.Minute, util.NewFakeRateLimiter(),
 		testNodeMonitorGracePeriod, testNodeStartupGracePeriod, testNodeMonitorPeriod, nil, false)
 	nodeController.now = func() util.Time { return fakeNow }
 	if err := nodeController.monitorNodeStatus(); err != nil {
@ -620,6 +629,10 @@ func TestNodeDeletion(t *testing.T) {
 	if err := nodeController.monitorNodeStatus(); err != nil {
 		t.Errorf("unexpected error: %v", err)
 	}
+	nodeController.podEvictor.Try(func(value TimedValue) (bool, time.Duration) {
+		nodeController.deletePods(value.Value)
+		return true, 0
+	})
 	podEvicted := false
 	for _, action := range fakeNodeHandler.Actions() {
 		if action.GetVerb() == "delete" && action.GetResource() == "pods" {
--- a/pkg/controller/node/podevictor.go
+++ b/pkg/controller/node/podevictor.go
@ -1,129 +0,0 @@
-/*
-Copyright 2015 The Kubernetes Authors All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package nodecontroller
-
-import (
-	"sync"
-
-	"k8s.io/kubernetes/pkg/util"
-
-	"github.com/golang/glog"
-)
-
-// A FIFO queue which additionally guarantees that any element can be added only once until
-// it is removed.
-type UniqueQueue struct {
-	lock  sync.Mutex
-	queue []string
-	set   util.StringSet
-}
-
-// Entity responsible for evicting Pods from inserted Nodes. It uses RateLimiter to avoid
-// evicting everything at once. Note that we rate limit eviction of Nodes not individual Pods.
-type PodEvictor struct {
-	queue                   UniqueQueue
-	deletingPodsRateLimiter util.RateLimiter
-}
-
-// Adds a new value to the queue if it wasn't added before, or was explicitly removed by the
-// Remove call. Returns true if new value was added.
-func (q *UniqueQueue) Add(value string) bool {
-	q.lock.Lock()
-	defer q.lock.Unlock()
-
-	if !q.set.Has(value) {
-		q.queue = append(q.queue, value)
-		q.set.Insert(value)
-		return true
-	} else {
-		return false
-	}
-}
-
-// Removes the value from the queue, so Get() call won't return it, and allow subsequent addition
-// of the given value. If the value is not present does nothing and returns false.
-func (q *UniqueQueue) Remove(value string) bool {
-	q.lock.Lock()
-	defer q.lock.Unlock()
-
-	q.set.Delete(value)
-	for i, val := range q.queue {
-		if val == value {
-			if i > 0 && i < len(q.queue)-1 {
-				q.queue = append(q.queue[0:i], q.queue[i+1:len(q.queue)]...)
-			} else if i > 0 {
-				q.queue = q.queue[0 : len(q.queue)-1]
-			} else {
-				q.queue = q.queue[1:len(q.queue)]
-			}
-			return true
-		}
-	}
-	return false
-}
-
-// Returns the oldest added value that wasn't returned yet.
-func (q *UniqueQueue) Get() (string, bool) {
-	q.lock.Lock()
-	defer q.lock.Unlock()
-	if len(q.queue) == 0 {
-		return "", false
-	}
-
-	result := q.queue[0]
-	q.queue = q.queue[1:len(q.queue)]
-	return result, true
-}
-
-// Creates new PodEvictor which will use given RateLimiter to oversee eviction.
-func NewPodEvictor(deletingPodsRateLimiter util.RateLimiter) *PodEvictor {
-	return &PodEvictor{
-		queue: UniqueQueue{
-			queue: make([]string, 0),
-			set:   util.NewStringSet(),
-		},
-		deletingPodsRateLimiter: deletingPodsRateLimiter,
-	}
-}
-
-// Tries to evict all Pods from previously inserted Nodes. Ends prematurely if RateLimiter forbids any eviction.
-// Each Node is processed only once, as long as it's not Removed, i.e. calling multiple AddNodeToEvict does not result
-// with multiple evictions as long as RemoveNodeToEvict is not called.
-func (pe *PodEvictor) TryEvict(delFunc func(string)) {
-	val, ok := pe.queue.Get()
-	for ok {
-		if pe.deletingPodsRateLimiter.CanAccept() {
-			glog.Infof("PodEvictor is evicting Pods on Node: %v", val)
-			delFunc(val)
-		} else {
-			glog.V(1).Info("PodEvictor is rate limitted.")
-			break
-		}
-		val, ok = pe.queue.Get()
-	}
-}
-
-// Adds Node to the Evictor to be processed later. Won't add the same Node second time if it was already
-// added and not removed.
-func (pe *PodEvictor) AddNodeToEvict(nodeName string) bool {
-	return pe.queue.Add(nodeName)
-}
-
-// Removes Node from the Evictor. The Node won't be processed until added again.
-func (pe *PodEvictor) RemoveNodeToEvict(nodeName string) bool {
-	return pe.queue.Remove(nodeName)
-}
--- a/pkg/controller/node/rate_limited_queue.go
+++ b/pkg/controller/node/rate_limited_queue.go
@ -0,0 +1,183 @@
+/*
+Copyright 2015 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package nodecontroller
+
+import (
+	"container/heap"
+	"sync"
+	"time"
+
+	"k8s.io/kubernetes/pkg/util"
+)
+
+// TimedValue is a value that should be processed at a designated time.
+type TimedValue struct {
+	Value string
+	Added time.Time
+	Next  time.Time
+}
+
+// now is used to test time
+var now func() time.Time = time.Now
+
+// TimedQueue is a priority heap where the lowest Next is at the front of the queue
+type TimedQueue []*TimedValue
+
+func (h TimedQueue) Len() int           { return len(h) }
+func (h TimedQueue) Less(i, j int) bool { return h[i].Next.Before(h[j].Next) }
+func (h TimedQueue) Swap(i, j int)      { h[i], h[j] = h[j], h[i] }
+
+func (h *TimedQueue) Push(x interface{}) {
+	*h = append(*h, x.(*TimedValue))
+}
+
+func (h *TimedQueue) Pop() interface{} {
+	old := *h
+	n := len(old)
+	x := old[n-1]
+	*h = old[0 : n-1]
+	return x
+}
+
+// A FIFO queue which additionally guarantees that any element can be added only once until
+// it is removed.
+type UniqueQueue struct {
+	lock  sync.Mutex
+	queue TimedQueue
+	set   util.StringSet
+}
+
+// Adds a new value to the queue if it wasn't added before, or was explicitly removed by the
+// Remove call. Returns true if new value was added.
+func (q *UniqueQueue) Add(value TimedValue) bool {
+	q.lock.Lock()
+	defer q.lock.Unlock()
+
+	if q.set.Has(value.Value) {
+		return false
+	}
+	heap.Push(&q.queue, &value)
+	q.set.Insert(value.Value)
+	return true
+}
+
+// Removes the value from the queue, so Get() call won't return it, and allow subsequent addition
+// of the given value. If the value is not present does nothing and returns false.
+func (q *UniqueQueue) Remove(value string) bool {
+	q.lock.Lock()
+	defer q.lock.Unlock()
+
+	q.set.Delete(value)
+	for i, val := range q.queue {
+		if val.Value == value {
+			if i > 0 && i < len(q.queue)-1 {
+				q.queue = append(q.queue[0:i], q.queue[i+1:len(q.queue)]...)
+			} else if i > 0 {
+				q.queue = q.queue[0 : len(q.queue)-1]
+			} else {
+				q.queue = q.queue[1:len(q.queue)]
+			}
+			return true
+		}
+	}
+	return false
+}
+
+// Returns the oldest added value that wasn't returned yet.
+func (q *UniqueQueue) Get() (TimedValue, bool) {
+	q.lock.Lock()
+	defer q.lock.Unlock()
+	if len(q.queue) == 0 {
+		return TimedValue{}, false
+	}
+	result := q.queue.Pop().(*TimedValue)
+	q.set.Delete(result.Value)
+	return *result, true
+}
+
+// RateLimitedTimedQueue is a unique item priority queue ordered by the expected next time
+// of execution. It is also rate limited.
+type RateLimitedTimedQueue struct {
+	queue   UniqueQueue
+	limiter util.RateLimiter
+	leak    bool
+}
+
+// Creates new queue which will use given RateLimiter to oversee execution. If leak is true,
+// items which are rate limited will be leakped. Otherwise, rate limited items will be requeued.
+func NewRateLimitedTimedQueue(limiter util.RateLimiter, leak bool) *RateLimitedTimedQueue {
+	return &RateLimitedTimedQueue{
+		queue: UniqueQueue{
+			queue: TimedQueue{},
+			set:   util.NewStringSet(),
+		},
+		limiter: limiter,
+		leak:    leak,
+	}
+}
+
+// ActionFunc takes a timed value and returns false if the item must be retried, with an optional
+// time.Duration if some minimum wait interval should be used.
+type ActionFunc func(TimedValue) (bool, time.Duration)
+
+// Try processes the queue. Ends prematurely if RateLimiter forbids an action and leak is true.
+// Otherwise, requeues the item to be processed. Each value is processed once if fn returns true,
+// otherwise it is added back to the queue. The returned remaining is used to identify the minimum
+// time to execute the next item in the queue.
+func (q *RateLimitedTimedQueue) Try(fn ActionFunc) {
+	val, ok := q.queue.Get()
+	for ok {
+		// rate limit the queue checking
+		if q.leak {
+			if !q.limiter.CanAccept() {
+				break
+			}
+		} else {
+			q.limiter.Accept()
+		}
+
+		now := now()
+		if now.Before(val.Next) {
+			q.queue.Add(val)
+			val, ok = q.queue.Get()
+			// we do not sleep here because other values may be added at the front of the queue
+			continue
+		}
+
+		if ok, wait := fn(val); !ok {
+			val.Next = now.Add(wait + 1)
+			q.queue.Add(val)
+		}
+		val, ok = q.queue.Get()
+	}
+}
+
+// Adds value to the queue to be processed. Won't add the same value a second time if it was already
+// added and not removed.
+func (q *RateLimitedTimedQueue) Add(value string) bool {
+	now := now()
+	return q.queue.Add(TimedValue{
+		Value: value,
+		Added: now,
+		Next:  now,
+	})
+}
+
+// Removes Node from the Evictor. The Node won't be processed until added again.
+func (q *RateLimitedTimedQueue) Remove(value string) bool {
+	return q.queue.Remove(value)
+}
--- a/pkg/controller/node/rate_limited_queue_test.go
+++ b/pkg/controller/node/rate_limited_queue_test.go
@ -17,14 +17,16 @@ limitations under the License.
 package nodecontroller

 import (
+	"reflect"
 	"testing"
+	"time"

 	"k8s.io/kubernetes/pkg/util"
 )

-func CheckQueueEq(lhs, rhs []string) bool {
+func CheckQueueEq(lhs []string, rhs TimedQueue) bool {
 	for i := 0; i < len(lhs); i++ {
-		if rhs[i] != lhs[i] {
+		if rhs[i].Value != lhs[i] {
 			return false
 		}
 	}
@ -36,10 +38,10 @@ func CheckSetEq(lhs, rhs util.StringSet) bool {
 }

 func TestAddNode(t *testing.T) {
-	evictor := NewPodEvictor(util.NewFakeRateLimiter())
-	evictor.AddNodeToEvict("first")
-	evictor.AddNodeToEvict("second")
-	evictor.AddNodeToEvict("third")
+	evictor := NewRateLimitedTimedQueue(util.NewFakeRateLimiter(), true)
+	evictor.Add("first")
+	evictor.Add("second")
+	evictor.Add("third")

 	queuePattern := []string{"first", "second", "third"}
 	if len(evictor.queue.queue) != len(queuePattern) {
@ -59,11 +61,11 @@ func TestAddNode(t *testing.T) {
 }

 func TestDelNode(t *testing.T) {
-	evictor := NewPodEvictor(util.NewFakeRateLimiter())
-	evictor.AddNodeToEvict("first")
-	evictor.AddNodeToEvict("second")
-	evictor.AddNodeToEvict("third")
-	evictor.RemoveNodeToEvict("first")
+	evictor := NewRateLimitedTimedQueue(util.NewFakeRateLimiter(), true)
+	evictor.Add("first")
+	evictor.Add("second")
+	evictor.Add("third")
+	evictor.Remove("first")

 	queuePattern := []string{"second", "third"}
 	if len(evictor.queue.queue) != len(queuePattern) {
@ -81,11 +83,11 @@ func TestDelNode(t *testing.T) {
 		t.Errorf("Invalid map. Got %v, expected %v", evictor.queue.set, setPattern)
 	}

-	evictor = NewPodEvictor(util.NewFakeRateLimiter())
-	evictor.AddNodeToEvict("first")
-	evictor.AddNodeToEvict("second")
-	evictor.AddNodeToEvict("third")
-	evictor.RemoveNodeToEvict("second")
+	evictor = NewRateLimitedTimedQueue(util.NewFakeRateLimiter(), true)
+	evictor.Add("first")
+	evictor.Add("second")
+	evictor.Add("third")
+	evictor.Remove("second")

 	queuePattern = []string{"first", "third"}
 	if len(evictor.queue.queue) != len(queuePattern) {
@ -103,11 +105,11 @@ func TestDelNode(t *testing.T) {
 		t.Errorf("Invalid map. Got %v, expected %v", evictor.queue.set, setPattern)
 	}

-	evictor = NewPodEvictor(util.NewFakeRateLimiter())
-	evictor.AddNodeToEvict("first")
-	evictor.AddNodeToEvict("second")
-	evictor.AddNodeToEvict("third")
-	evictor.RemoveNodeToEvict("third")
+	evictor = NewRateLimitedTimedQueue(util.NewFakeRateLimiter(), true)
+	evictor.Add("first")
+	evictor.Add("second")
+	evictor.Add("third")
+	evictor.Remove("third")

 	queuePattern = []string{"first", "second"}
 	if len(evictor.queue.queue) != len(queuePattern) {
@ -126,15 +128,18 @@ func TestDelNode(t *testing.T) {
 	}
 }

-func TestEvictNode(t *testing.T) {
-	evictor := NewPodEvictor(util.NewFakeRateLimiter())
-	evictor.AddNodeToEvict("first")
-	evictor.AddNodeToEvict("second")
-	evictor.AddNodeToEvict("third")
-	evictor.RemoveNodeToEvict("second")
+func TestTry(t *testing.T) {
+	evictor := NewRateLimitedTimedQueue(util.NewFakeRateLimiter(), true)
+	evictor.Add("first")
+	evictor.Add("second")
+	evictor.Add("third")
+	evictor.Remove("second")

 	deletedMap := util.NewStringSet()
-	evictor.TryEvict(func(nodeName string) { deletedMap.Insert(nodeName) })
+	evictor.Try(func(value TimedValue) (bool, time.Duration) {
+		deletedMap.Insert(value.Value)
+		return true, 0
+	})

 	setPattern := util.NewStringSet("first", "third")
 	if len(deletedMap) != len(setPattern) {
@ -144,3 +149,35 @@ func TestEvictNode(t *testing.T) {
 		t.Errorf("Invalid map. Got %v, expected %v", deletedMap, setPattern)
 	}
 }
+
+func TestTryOrdering(t *testing.T) {
+	evictor := NewRateLimitedTimedQueue(util.NewFakeRateLimiter(), false)
+	evictor.Add("first")
+	evictor.Add("second")
+	evictor.Add("third")
+
+	order := []string{}
+	count := 0
+	queued := false
+	evictor.Try(func(value TimedValue) (bool, time.Duration) {
+		count++
+		if value.Added.IsZero() {
+			t.Fatalf("added should not be zero")
+		}
+		if value.Next.IsZero() {
+			t.Fatalf("next should not be zero")
+		}
+		if !queued && value.Value == "second" {
+			queued = true
+			return false, time.Millisecond
+		}
+		order = append(order, value.Value)
+		return true, 0
+	})
+	if reflect.DeepEqual(order, []string{"first", "third", "second"}) {
+		t.Fatalf("order was wrong: %v", order)
+	}
+	if count != 4 {
+		t.Fatalf("unexpected iterations: %d", count)
+	}
+}
--- a/pkg/controller/replication/replication_controller.go
+++ b/pkg/controller/replication/replication_controller.go
@ -213,6 +213,12 @@ func (rm *ReplicationManager) getPodController(pod *api.Pod) *api.ReplicationCon
 // When a pod is created, enqueue the controller that manages it and update it's expectations.
 func (rm *ReplicationManager) addPod(obj interface{}) {
 	pod := obj.(*api.Pod)
+	if pod.DeletionTimestamp != nil {
+		// on a restart of the controller manager, it's possible a new pod shows up in a state that
+		// is already pending deletion. Prevent the pod from being a creation observation.
+		rm.deletePod(pod)
+		return
+	}
 	if rc := rm.getPodController(pod); rc != nil {
 		rcKey, err := controller.KeyFunc(rc)
 		if err != nil {
@ -234,6 +240,15 @@ func (rm *ReplicationManager) updatePod(old, cur interface{}) {
 	}
 	// TODO: Write a unittest for this case
 	curPod := cur.(*api.Pod)
+	if curPod.DeletionTimestamp != nil {
+		// when a pod is deleted gracefully it's deletion timestamp is first modified to reflect a grace period,
+		// and after such time has passed, the kubelet actually deletes it from the store. We receive an update
+		// for modification of the deletion timestamp and expect an rc to create more replicas asap, not wait
+		// until the kubelet actually deletes the pod. This is different from the Phase of a pod changing, because
+		// an rc never initiates a phase change, and so is never asleep waiting for the same.
+		rm.deletePod(curPod)
+		return
+	}
 	if rc := rm.getPodController(curPod); rc != nil {
 		rm.enqueueController(rc)
 	}
--- a/pkg/expapi/deep_copy_generated.go
+++ b/pkg/expapi/deep_copy_generated.go
@ -53,6 +53,12 @@ func deepCopy_api_ObjectMeta(in api.ObjectMeta, out *api.ObjectMeta, c *conversi
 	} else {
 		out.DeletionTimestamp = nil
 	}
+	if in.DeletionGracePeriodSeconds != nil {
+		out.DeletionGracePeriodSeconds = new(int64)
+		*out.DeletionGracePeriodSeconds = *in.DeletionGracePeriodSeconds
+	} else {
+		out.DeletionGracePeriodSeconds = nil
+	}
 	if in.Labels != nil {
 		out.Labels = make(map[string]string)
 		for key, val := range in.Labels {
--- a/pkg/expapi/v1/conversion_generated.go
+++ b/pkg/expapi/v1/conversion_generated.go
@ -57,6 +57,12 @@ func convert_api_ObjectMeta_To_v1_ObjectMeta(in *api.ObjectMeta, out *v1.ObjectM
 	} else {
 		out.DeletionTimestamp = nil
 	}
+	if in.DeletionGracePeriodSeconds != nil {
+		out.DeletionGracePeriodSeconds = new(int64)
+		*out.DeletionGracePeriodSeconds = *in.DeletionGracePeriodSeconds
+	} else {
+		out.DeletionGracePeriodSeconds = nil
+	}
 	if in.Labels != nil {
 		out.Labels = make(map[string]string)
 		for key, val := range in.Labels {
@ -115,6 +121,12 @@ func convert_v1_ObjectMeta_To_api_ObjectMeta(in *v1.ObjectMeta, out *api.ObjectM
 	} else {
 		out.DeletionTimestamp = nil
 	}
+	if in.DeletionGracePeriodSeconds != nil {
+		out.DeletionGracePeriodSeconds = new(int64)
+		*out.DeletionGracePeriodSeconds = *in.DeletionGracePeriodSeconds
+	} else {
+		out.DeletionGracePeriodSeconds = nil
+	}
 	if in.Labels != nil {
 		out.Labels = make(map[string]string)
 		for key, val := range in.Labels {
--- a/pkg/expapi/v1/deep_copy_generated.go
+++ b/pkg/expapi/v1/deep_copy_generated.go
@ -70,6 +70,12 @@ func deepCopy_v1_ObjectMeta(in v1.ObjectMeta, out *v1.ObjectMeta, c *conversion.
 	} else {
 		out.DeletionTimestamp = nil
 	}
+	if in.DeletionGracePeriodSeconds != nil {
+		out.DeletionGracePeriodSeconds = new(int64)
+		*out.DeletionGracePeriodSeconds = *in.DeletionGracePeriodSeconds
+	} else {
+		out.DeletionGracePeriodSeconds = nil
+	}
 	if in.Labels != nil {
 		out.Labels = make(map[string]string)
 		for key, val := range in.Labels {
--- a/pkg/kubectl/cmd/get_test.go
+++ b/pkg/kubectl/cmd/get_test.go
@ -38,6 +38,7 @@ import (
 )

 func testData() (*api.PodList, *api.ServiceList, *api.ReplicationControllerList) {
+	grace := int64(30)
 	pods := &api.PodList{
 		ListMeta: api.ListMeta{
 			ResourceVersion: "15",
@ -46,15 +47,17 @@ func testData() (*api.PodList, *api.ServiceList, *api.ReplicationControllerList)
 			{
 				ObjectMeta: api.ObjectMeta{Name: "foo", Namespace: "test", ResourceVersion: "10"},
 				Spec: api.PodSpec{
-					RestartPolicy: api.RestartPolicyAlways,
-					DNSPolicy:     api.DNSClusterFirst,
+					RestartPolicy:                 api.RestartPolicyAlways,
+					DNSPolicy:                     api.DNSClusterFirst,
+					TerminationGracePeriodSeconds: &grace,
 				},
 			},
 			{
 				ObjectMeta: api.ObjectMeta{Name: "bar", Namespace: "test", ResourceVersion: "11"},
 				Spec: api.PodSpec{
-					RestartPolicy: api.RestartPolicyAlways,
-					DNSPolicy:     api.DNSClusterFirst,
+					RestartPolicy:                 api.RestartPolicyAlways,
+					DNSPolicy:                     api.DNSClusterFirst,
+					TerminationGracePeriodSeconds: &grace,
 				},
 			},
 		},
@ -536,6 +539,7 @@ func TestGetMultipleTypeObjectsWithDirectReference(t *testing.T) {
 	}
 }
 func watchTestData() ([]api.Pod, []watch.Event) {
+	grace := int64(30)
 	pods := []api.Pod{
 		{
 			ObjectMeta: api.ObjectMeta{
@ -544,8 +548,9 @@ func watchTestData() ([]api.Pod, []watch.Event) {
 				ResourceVersion: "10",
 			},
 			Spec: api.PodSpec{
-				RestartPolicy: api.RestartPolicyAlways,
-				DNSPolicy:     api.DNSClusterFirst,
+				RestartPolicy:                 api.RestartPolicyAlways,
+				DNSPolicy:                     api.DNSClusterFirst,
+				TerminationGracePeriodSeconds: &grace,
 			},
 		},
 	}
@ -559,8 +564,9 @@ func watchTestData() ([]api.Pod, []watch.Event) {
 					ResourceVersion: "11",
 				},
 				Spec: api.PodSpec{
-					RestartPolicy: api.RestartPolicyAlways,
-					DNSPolicy:     api.DNSClusterFirst,
+					RestartPolicy:                 api.RestartPolicyAlways,
+					DNSPolicy:                     api.DNSClusterFirst,
+					TerminationGracePeriodSeconds: &grace,
 				},
 			},
 		},
@ -573,8 +579,9 @@ func watchTestData() ([]api.Pod, []watch.Event) {
 					ResourceVersion: "12",
 				},
 				Spec: api.PodSpec{
-					RestartPolicy: api.RestartPolicyAlways,
-					DNSPolicy:     api.DNSClusterFirst,
+					RestartPolicy:                 api.RestartPolicyAlways,
+					DNSPolicy:                     api.DNSClusterFirst,
+					TerminationGracePeriodSeconds: &grace,
 				},
 			},
 		},
--- a/pkg/kubectl/cmd/util/helpers_test.go
+++ b/pkg/kubectl/cmd/util/helpers_test.go
@ -34,6 +34,7 @@ import (
 )

 func TestMerge(t *testing.T) {
+	grace := int64(30)
 	tests := []struct {
 		obj       runtime.Object
 		fragment  string
@ -54,8 +55,9 @@ func TestMerge(t *testing.T) {
 					Name: "foo",
 				},
 				Spec: api.PodSpec{
-					RestartPolicy: api.RestartPolicyAlways,
-					DNSPolicy:     api.DNSClusterFirst,
+					RestartPolicy:                 api.RestartPolicyAlways,
+					DNSPolicy:                     api.DNSClusterFirst,
+					TerminationGracePeriodSeconds: &grace,
 				},
 			},
 		},
@ -122,8 +124,9 @@ func TestMerge(t *testing.T) {
 							VolumeSource: api.VolumeSource{EmptyDir: &api.EmptyDirVolumeSource{}},
 						},
 					},
-					RestartPolicy: api.RestartPolicyAlways,
-					DNSPolicy:     api.DNSClusterFirst,
+					RestartPolicy:                 api.RestartPolicyAlways,
+					DNSPolicy:                     api.DNSClusterFirst,
+					TerminationGracePeriodSeconds: &grace,
 				},
 			},
 		},
--- a/pkg/kubectl/describe.go
+++ b/pkg/kubectl/describe.go
@ -417,7 +417,12 @@ func describePod(pod *api.Pod, rcs []api.ReplicationController, events *api.Even
 		fmt.Fprintf(out, "Image(s):\t%s\n", makeImageList(&pod.Spec))
 		fmt.Fprintf(out, "Node:\t%s\n", pod.Spec.NodeName+"/"+pod.Status.HostIP)
 		fmt.Fprintf(out, "Labels:\t%s\n", formatLabels(pod.Labels))
-		fmt.Fprintf(out, "Status:\t%s\n", string(pod.Status.Phase))
+		if pod.DeletionTimestamp != nil {
+			fmt.Fprintf(out, "Status:\tTerminating (expires %s)\n", pod.DeletionTimestamp.Time.Format(time.RFC1123Z))
+			fmt.Fprintf(out, "Termination Grace Period:\t%ds\n", pod.DeletionGracePeriodSeconds)
+		} else {
+			fmt.Fprintf(out, "Status:\t%s\n", string(pod.Status.Phase))
+		}
 		fmt.Fprintf(out, "Reason:\t%s\n", pod.Status.Reason)
 		fmt.Fprintf(out, "Message:\t%s\n", pod.Status.Message)
 		fmt.Fprintf(out, "IP:\t%s\n", pod.Status.PodIP)
--- a/pkg/kubectl/resource/builder_test.go
+++ b/pkg/kubectl/resource/builder_test.go
@ -83,6 +83,7 @@ func fakeClientWith(testName string, t *testing.T, data map[string]string) Clien
 }

 func testData() (*api.PodList, *api.ServiceList) {
+	grace := int64(30)
 	pods := &api.PodList{
 		ListMeta: api.ListMeta{
 			ResourceVersion: "15",
@ -91,15 +92,17 @@ func testData() (*api.PodList, *api.ServiceList) {
 			{
 				ObjectMeta: api.ObjectMeta{Name: "foo", Namespace: "test", ResourceVersion: "10"},
 				Spec: api.PodSpec{
-					RestartPolicy: api.RestartPolicyAlways,
-					DNSPolicy:     api.DNSClusterFirst,
+					RestartPolicy:                 api.RestartPolicyAlways,
+					DNSPolicy:                     api.DNSClusterFirst,
+					TerminationGracePeriodSeconds: &grace,
 				},
 			},
 			{
 				ObjectMeta: api.ObjectMeta{Name: "bar", Namespace: "test", ResourceVersion: "11"},
 				Spec: api.PodSpec{
-					RestartPolicy: api.RestartPolicyAlways,
-					DNSPolicy:     api.DNSClusterFirst,
+					RestartPolicy:                 api.RestartPolicyAlways,
+					DNSPolicy:                     api.DNSClusterFirst,
+					TerminationGracePeriodSeconds: &grace,
 				},
 			},
 		},
--- a/pkg/kubectl/resource/helper_test.go
+++ b/pkg/kubectl/resource/helper_test.go
@ -128,6 +128,7 @@ func TestHelperCreate(t *testing.T) {
 		return true
 	}

+	grace := int64(30)
 	tests := []struct {
 		Resp     *http.Response
 		RespFunc client.HTTPClientFunc
@ -172,8 +173,9 @@ func TestHelperCreate(t *testing.T) {
 			ExpectObject: &api.Pod{
 				ObjectMeta: api.ObjectMeta{Name: "foo"},
 				Spec: api.PodSpec{
-					RestartPolicy: api.RestartPolicyAlways,
-					DNSPolicy:     api.DNSClusterFirst,
+					RestartPolicy:                 api.RestartPolicyAlways,
+					DNSPolicy:                     api.DNSClusterFirst,
+					TerminationGracePeriodSeconds: &grace,
 				},
 			},
 			Resp: &http.Response{StatusCode: http.StatusOK, Body: objBody(&api.Status{Status: api.StatusSuccess})},
@ -381,6 +383,7 @@ func TestHelperReplace(t *testing.T) {
 		return true
 	}

+	grace := int64(30)
 	tests := []struct {
 		Resp      *http.Response
 		RespFunc  client.HTTPClientFunc
@ -418,8 +421,9 @@ func TestHelperReplace(t *testing.T) {
 			ExpectObject: &api.Pod{
 				ObjectMeta: api.ObjectMeta{Name: "foo", ResourceVersion: "10"},
 				Spec: api.PodSpec{
-					RestartPolicy: api.RestartPolicyAlways,
-					DNSPolicy:     api.DNSClusterFirst,
+					RestartPolicy:                 api.RestartPolicyAlways,
+					DNSPolicy:                     api.DNSClusterFirst,
+					TerminationGracePeriodSeconds: &grace,
 				},
 			},
 			Overwrite: true,
--- a/pkg/kubectl/resource_printer.go
+++ b/pkg/kubectl/resource_printer.go
@ -410,6 +410,9 @@ func printPod(pod *api.Pod, w io.Writer, withNamespace bool, wide bool, columnLa
 			readyContainers++
 		}
 	}
+	if pod.DeletionTimestamp != nil {
+		reason = "Terminating"
+	}

 	if withNamespace {
 		if _, err := fmt.Fprintf(w, "%s\t", namespace); err != nil {
--- a/pkg/kubectl/rolling_updater_test.go
+++ b/pkg/kubectl/rolling_updater_test.go
@ -880,6 +880,7 @@ func TestUpdateExistingReplicationController(t *testing.T) {

 func TestUpdateWithRetries(t *testing.T) {
 	codec := testapi.Codec()
+	grace := int64(30)
 	rc := &api.ReplicationController{
 		ObjectMeta: api.ObjectMeta{Name: "rc",
 			Labels: map[string]string{
@ -897,8 +898,9 @@ func TestUpdateWithRetries(t *testing.T) {
 					},
 				},
 				Spec: api.PodSpec{
-					RestartPolicy: api.RestartPolicyAlways,
-					DNSPolicy:     api.DNSClusterFirst,
+					RestartPolicy:                 api.RestartPolicyAlways,
+					DNSPolicy:                     api.DNSClusterFirst,
+					TerminationGracePeriodSeconds: &grace,
 				},
 			},
 		},
--- a/pkg/kubelet/config/common_test.go
+++ b/pkg/kubelet/config/common_test.go
@ -31,6 +31,7 @@ import (
 func noDefault(*api.Pod) error { return nil }

 func TestDecodeSinglePod(t *testing.T) {
+	grace := int64(30)
 	pod := &api.Pod{
 		TypeMeta: api.TypeMeta{
 			APIVersion: "",
@ -41,8 +42,9 @@ func TestDecodeSinglePod(t *testing.T) {
 			Namespace: "mynamespace",
 		},
 		Spec: api.PodSpec{
-			RestartPolicy: api.RestartPolicyAlways,
-			DNSPolicy:     api.DNSClusterFirst,
+			RestartPolicy:                 api.RestartPolicyAlways,
+			DNSPolicy:                     api.DNSClusterFirst,
+			TerminationGracePeriodSeconds: &grace,
 			Containers: []api.Container{{
 				Name:                   "image",
 				Image:                  "test/image",
@ -91,6 +93,7 @@ func TestDecodeSinglePod(t *testing.T) {
 }

 func TestDecodePodList(t *testing.T) {
+	grace := int64(30)
 	pod := &api.Pod{
 		TypeMeta: api.TypeMeta{
 			APIVersion: "",
@ -101,8 +104,9 @@ func TestDecodePodList(t *testing.T) {
 			Namespace: "mynamespace",
 		},
 		Spec: api.PodSpec{
-			RestartPolicy: api.RestartPolicyAlways,
-			DNSPolicy:     api.DNSClusterFirst,
+			RestartPolicy:                 api.RestartPolicyAlways,
+			DNSPolicy:                     api.DNSClusterFirst,
+			TerminationGracePeriodSeconds: &grace,
 			Containers: []api.Container{{
 				Name:                   "image",
 				Image:                  "test/image",
--- a/pkg/kubelet/config/config.go
+++ b/pkg/kubelet/config/config.go
@ -217,9 +217,8 @@ func (s *podStorage) merge(source string, change interface{}) (adds, updates, de
 		for _, ref := range filtered {
 			name := kubecontainer.GetPodFullName(ref)
 			if existing, found := pods[name]; found {
-				if !reflect.DeepEqual(existing.Spec, ref.Spec) {
+				if checkAndUpdatePod(existing, ref) {
 					// this is an update
-					existing.Spec = ref.Spec
 					updates.Pods = append(updates.Pods, existing)
 					continue
 				}
@ -261,9 +260,8 @@ func (s *podStorage) merge(source string, change interface{}) (adds, updates, de
 			name := kubecontainer.GetPodFullName(ref)
 			if existing, found := oldPods[name]; found {
 				pods[name] = existing
-				if !reflect.DeepEqual(existing.Spec, ref.Spec) {
+				if checkAndUpdatePod(existing, ref) {
 					// this is an update
-					existing.Spec = ref.Spec
 					updates.Pods = append(updates.Pods, existing)
 					continue
 				}
@ -335,6 +333,23 @@ func filterInvalidPods(pods []*api.Pod, source string, recorder record.EventReco
 	return
 }

+// checkAndUpdatePod updates existing if ref makes a meaningful change and returns true, or
+// returns false if there was no update.
+func checkAndUpdatePod(existing, ref *api.Pod) bool {
+	// TODO: it would be better to update the whole object and only preserve certain things
+	//       like the source annotation or the UID (to ensure safety)
+	if reflect.DeepEqual(existing.Spec, ref.Spec) &&
+		reflect.DeepEqual(existing.DeletionTimestamp, ref.DeletionTimestamp) &&
+		reflect.DeepEqual(existing.DeletionGracePeriodSeconds, ref.DeletionGracePeriodSeconds) {
+		return false
+	}
+	// this is an update
+	existing.Spec = ref.Spec
+	existing.DeletionTimestamp = ref.DeletionTimestamp
+	existing.DeletionGracePeriodSeconds = ref.DeletionGracePeriodSeconds
+	return true
+}
+
 // Sync sends a copy of the current state through the update channel.
 func (s *podStorage) Sync() {
 	s.updateLock.Lock()
--- a/pkg/kubelet/config/file_test.go
+++ b/pkg/kubelet/config/file_test.go
@ -69,6 +69,7 @@ func writeTestFile(t *testing.T, dir, name string, contents string) *os.File {

 func TestReadPodsFromFile(t *testing.T) {
 	hostname := "random-test-hostname"
+	grace := int64(30)
 	var testCases = []struct {
 		desc     string
 		pod      runtime.Object
@ -98,9 +99,10 @@ func TestReadPodsFromFile(t *testing.T) {
 					SelfLink:  getSelfLink("test-"+hostname, "mynamespace"),
 				},
 				Spec: api.PodSpec{
-					NodeName:      hostname,
-					RestartPolicy: api.RestartPolicyAlways,
-					DNSPolicy:     api.DNSClusterFirst,
+					NodeName:                      hostname,
+					RestartPolicy:                 api.RestartPolicyAlways,
+					DNSPolicy:                     api.DNSClusterFirst,
+					TerminationGracePeriodSeconds: &grace,
 					Containers: []api.Container{{
 						Name:  "image",
 						Image: "test/image",
--- a/pkg/kubelet/config/http_test.go
+++ b/pkg/kubelet/config/http_test.go
@ -123,6 +123,7 @@ func TestExtractInvalidPods(t *testing.T) {
 func TestExtractPodsFromHTTP(t *testing.T) {
 	hostname := "different-value"

+	grace := int64(30)
 	var testCases = []struct {
 		desc     string
 		pods     runtime.Object
@ -156,9 +157,11 @@ func TestExtractPodsFromHTTP(t *testing.T) {
 						SelfLink: getSelfLink("foo-"+hostname, "mynamespace"),
 					},
 					Spec: api.PodSpec{
-						NodeName:      hostname,
-						RestartPolicy: api.RestartPolicyAlways,
-						DNSPolicy:     api.DNSClusterFirst,
+						NodeName:                      hostname,
+						RestartPolicy:                 api.RestartPolicyAlways,
+						DNSPolicy:                     api.DNSClusterFirst,
+						TerminationGracePeriodSeconds: &grace,
+
 						Containers: []api.Container{{
 							Name:  "1",
 							Image: "foo",
@ -209,9 +212,11 @@ func TestExtractPodsFromHTTP(t *testing.T) {
 						SelfLink: getSelfLink("foo-"+hostname, kubelet.NamespaceDefault),
 					},
 					Spec: api.PodSpec{
-						NodeName:      hostname,
-						RestartPolicy: api.RestartPolicyAlways,
-						DNSPolicy:     api.DNSClusterFirst,
+						NodeName:                      hostname,
+						RestartPolicy:                 api.RestartPolicyAlways,
+						DNSPolicy:                     api.DNSClusterFirst,
+						TerminationGracePeriodSeconds: &grace,
+
 						Containers: []api.Container{{
 							Name:  "1",
 							Image: "foo",
@ -229,9 +234,11 @@ func TestExtractPodsFromHTTP(t *testing.T) {
 						SelfLink: getSelfLink("bar-"+hostname, kubelet.NamespaceDefault),
 					},
 					Spec: api.PodSpec{
-						NodeName:      hostname,
-						RestartPolicy: api.RestartPolicyAlways,
-						DNSPolicy:     api.DNSClusterFirst,
+						NodeName:                      hostname,
+						RestartPolicy:                 api.RestartPolicyAlways,
+						DNSPolicy:                     api.DNSClusterFirst,
+						TerminationGracePeriodSeconds: &grace,
+
 						Containers: []api.Container{{
 							Name:  "2",
 							Image: "bar",
--- a/pkg/kubelet/container/fake_runtime.go
+++ b/pkg/kubelet/container/fake_runtime.go
@ -163,13 +163,13 @@ func (f *FakeRuntime) SyncPod(pod *api.Pod, _ Pod, _ api.PodStatus, _ []api.Secr
 	return f.Err
 }

-func (f *FakeRuntime) KillPod(pod Pod) error {
+func (f *FakeRuntime) KillPod(pod *api.Pod, runningPod Pod) error {
 	f.Lock()
 	defer f.Unlock()

 	f.CalledFunctions = append(f.CalledFunctions, "KillPod")
-	f.KilledPods = append(f.KilledPods, string(pod.ID))
-	for _, c := range pod.Containers {
+	f.KilledPods = append(f.KilledPods, string(runningPod.ID))
+	for _, c := range runningPod.Containers {
 		f.KilledContainers = append(f.KilledContainers, c.Name)
 	}
 	return f.Err
--- a/pkg/kubelet/container/runtime.go
+++ b/pkg/kubelet/container/runtime.go
@ -54,8 +54,8 @@ type Runtime interface {
 	GetPods(all bool) ([]*Pod, error)
 	// Syncs the running pod into the desired pod.
 	SyncPod(pod *api.Pod, runningPod Pod, podStatus api.PodStatus, pullSecrets []api.Secret) error
-	// KillPod kills all the containers of a pod.
-	KillPod(pod Pod) error
+	// KillPod kills all the containers of a pod. Pod may be nil, running pod must not be.
+	KillPod(pod *api.Pod, runningPod Pod) error
 	// GetPodStatus retrieves the status of the pod, including the information of
 	// all containers in the pod. Clients of this interface assume the containers
 	// statuses in a pod always have a deterministic ordering (eg: sorted by name).
--- a/pkg/kubelet/dockertools/manager.go
+++ b/pkg/kubelet/dockertools/manager.go
@ -56,12 +56,21 @@ import (
 const (
 	maxReasonCacheEntries = 200

-	kubernetesPodLabel       = "io.kubernetes.pod.data"
-	kubernetesContainerLabel = "io.kubernetes.container.name"
 	// ndots specifies the minimum number of dots that a domain name must contain for the resolver to consider it as FQDN (fully-qualified)
 	// we want to able to consider SRV lookup names like _dns._udp.kube-dns.default.svc to be considered relative.
 	// hence, setting ndots to be 5.
 	ndotsDNSOption = "options ndots:5\n"
+	// In order to avoid unnecessary SIGKILLs, give every container a minimum grace
+	// period after SIGTERM. Docker will guarantee the termination, but SIGTERM is
+	// potentially dangerous.
+	// TODO: evaluate whether there are scenarios in which SIGKILL is preferable to
+	// SIGTERM for certain process types, which may justify setting this to 0.
+	minimumGracePeriodInSeconds = 2
+
+	kubernetesNameLabel                   = "io.kubernetes.pod.name"
+	kubernetesPodLabel                    = "io.kubernetes.pod.data"
+	kubernetesTerminationGracePeriodLabel = "io.kubernetes.pod.terminationGracePeriod"
+	kubernetesContainerLabel              = "io.kubernetes.container.name"
 )

 // DockerManager implements the Runtime interface.
@ -588,12 +597,19 @@ func (dm *DockerManager) runContainer(
 	if len(containerHostname) > hostnameMaxLen {
 		containerHostname = containerHostname[:hostnameMaxLen]
 	}
+
+	// Pod information is recorded on the container as labels to preserve it in the event the pod is deleted
+	// while the Kubelet is down and there is no information available to recover the pod. This includes
+	// termination information like the termination grace period and the pre stop hooks.
+	// TODO: keep these labels up to date if the pod changes
 	namespacedName := types.NamespacedName{Namespace: pod.Namespace, Name: pod.Name}
 	labels := map[string]string{
-		"io.kubernetes.pod.name": namespacedName.String(),
+		kubernetesNameLabel: namespacedName.String(),
+	}
+	if pod.Spec.TerminationGracePeriodSeconds != nil {
+		labels[kubernetesTerminationGracePeriodLabel] = strconv.FormatInt(*pod.Spec.TerminationGracePeriodSeconds, 10)
 	}
 	if container.Lifecycle != nil && container.Lifecycle.PreStop != nil {
-		glog.V(1).Infof("Setting preStop hook")
 		// TODO: This is kind of hacky, we should really just encode the bits we need.
 		data, err := latest.Codec.Encode(pod)
 		if err != nil {
@ -1104,40 +1120,56 @@ func (dm *DockerManager) PortForward(pod *kubecontainer.Pod, port uint16, stream
 }

 // Kills all containers in the specified pod
-func (dm *DockerManager) KillPod(pod kubecontainer.Pod) error {
+func (dm *DockerManager) KillPod(pod *api.Pod, runningPod kubecontainer.Pod) error {
 	// Send the kills in parallel since they may take a long time. Len + 1 since there
 	// can be Len errors + the networkPlugin teardown error.
-	errs := make(chan error, len(pod.Containers)+1)
+	errs := make(chan error, len(runningPod.Containers)+1)
 	wg := sync.WaitGroup{}
-	var networkID types.UID
-	for _, container := range pod.Containers {
+	var (
+		networkContainer *kubecontainer.Container
+		networkSpec      *api.Container
+	)
+	for _, container := range runningPod.Containers {
 		wg.Add(1)
 		go func(container *kubecontainer.Container) {
 			defer util.HandleCrash()
 			defer wg.Done()

+			var containerSpec *api.Container
+			if pod != nil {
+				for i, c := range pod.Spec.Containers {
+					if c.Name == container.Name {
+						containerSpec = &pod.Spec.Containers[i]
+						break
+					}
+				}
+			}
+
 			// TODO: Handle this without signaling the pod infra container to
 			// adapt to the generic container runtime.
 			if container.Name == PodInfraContainerName {
 				// Store the container runtime for later deletion.
 				// We do this so that PreStop handlers can run in the network namespace.
-				networkID = container.ID
+				networkContainer = container
+				networkSpec = containerSpec
 				return
 			}
-			if err := dm.killContainer(container.ID); err != nil {
-				glog.Errorf("Failed to delete container: %v; Skipping pod %q", err, pod.ID)
+
+			err := dm.killContainer(container.ID, containerSpec, pod)
+			if err != nil {
+				glog.Errorf("Failed to delete container: %v; Skipping pod %q", err, runningPod.ID)
 				errs <- err
 			}
 		}(container)
 	}
 	wg.Wait()
-	if len(networkID) > 0 {
-		if err := dm.networkPlugin.TearDownPod(pod.Namespace, pod.Name, kubeletTypes.DockerID(networkID)); err != nil {
+	if networkContainer != nil {
+		if err := dm.networkPlugin.TearDownPod(runningPod.Namespace, runningPod.Name, kubeletTypes.DockerID(networkContainer.ID)); err != nil {
 			glog.Errorf("Failed tearing down the infra container: %v", err)
 			errs <- err
 		}
-		if err := dm.killContainer(networkID); err != nil {
-			glog.Errorf("Failed to delete container: %v; Skipping pod %q", err, pod.ID)
+		if err := dm.killContainer(networkContainer.ID, networkSpec, pod); err != nil {
+			glog.Errorf("Failed to delete container: %v; Skipping pod %q", err, runningPod.ID)
 			errs <- err
 		}
 	}
@ -1152,75 +1184,142 @@ func (dm *DockerManager) KillPod(pod kubecontainer.Pod) error {
 	return nil
 }

-// KillContainerInPod kills a container in the pod.
-func (dm *DockerManager) KillContainerInPod(container api.Container, pod *api.Pod) error {
-	// Locate the container.
-	pods, err := dm.GetPods(false)
-	if err != nil {
-		return err
-	}
-	targetPod := kubecontainer.Pods(pods).FindPod(kubecontainer.GetPodFullName(pod), pod.UID)
-	targetContainer := targetPod.FindContainerByName(container.Name)
-	if targetContainer == nil {
-		return fmt.Errorf("unable to find container %q in pod %q", container.Name, targetPod.Name)
-	}
-	return dm.killContainer(targetContainer.ID)
-}
-
-// TODO(vmarmol): Unexport this as it is no longer used externally.
-// KillContainer kills a container identified by containerID.
-// Internally, it invokes docker's StopContainer API with a timeout of 10s.
-// TODO: Deprecate this function in favor of KillContainerInPod.
-func (dm *DockerManager) KillContainer(containerID types.UID) error {
-	return dm.killContainer(containerID)
-}
-
-func (dm *DockerManager) killContainer(containerID types.UID) error {
-	ID := string(containerID)
-	glog.V(2).Infof("Killing container with id %q", ID)
-	inspect, err := dm.client.InspectContainer(ID)
-	if err != nil {
-		return err
-	}
-	var found bool
-	var preStop string
-	if inspect != nil && inspect.Config != nil && inspect.Config.Labels != nil {
-		preStop, found = inspect.Config.Labels[kubernetesPodLabel]
-	}
-	if found {
-		var pod api.Pod
-		err := latest.Codec.DecodeInto([]byte(preStop), &pod)
+// KillContainerInPod kills a container in the pod. It must be passed either a container ID or a container and pod,
+// and will attempt to lookup the other information if missing.
+func (dm *DockerManager) KillContainerInPod(containerID types.UID, container *api.Container, pod *api.Pod) error {
+	switch {
+	case len(containerID) == 0:
+		// Locate the container.
+		pods, err := dm.GetPods(false)
 		if err != nil {
-			glog.Errorf("Failed to decode prestop: %s, %s", preStop, ID)
-		} else {
-			name := inspect.Config.Labels[kubernetesContainerLabel]
-			var container *api.Container
+			return err
+		}
+		targetPod := kubecontainer.Pods(pods).FindPod(kubecontainer.GetPodFullName(pod), pod.UID)
+		targetContainer := targetPod.FindContainerByName(container.Name)
+		if targetContainer == nil {
+			return fmt.Errorf("unable to find container %q in pod %q", container.Name, targetPod.Name)
+		}
+		containerID = targetContainer.ID
+
+	case container == nil || pod == nil:
+		// Read information about the container from labels
+		inspect, err := dm.client.InspectContainer(string(containerID))
+		if err != nil {
+			return err
+		}
+		storedPod, storedContainer, cerr := containerAndPodFromLabels(inspect)
+		if cerr != nil {
+			glog.Errorf("unable to access pod data from container: %v", err)
+		}
+		if container == nil {
+			container = storedContainer
+		}
+		if pod == nil {
+			pod = storedPod
+		}
+	}
+	return dm.killContainer(containerID, container, pod)
+}
+
+// killContainer accepts a containerID and an optional container or pod containing shutdown policies. Invoke
+// KillContainerInPod if information must be retrieved first.
+func (dm *DockerManager) killContainer(containerID types.UID, container *api.Container, pod *api.Pod) error {
+	ID := string(containerID)
+	name := ID
+	if container != nil {
+		name = fmt.Sprintf("%s %s", name, container.Name)
+	}
+	if pod != nil {
+		name = fmt.Sprintf("%s %s/%s", name, pod.Namespace, pod.Name)
+	}
+
+	gracePeriod := int64(minimumGracePeriodInSeconds)
+	if pod != nil {
+		switch {
+		case pod.DeletionGracePeriodSeconds != nil:
+			gracePeriod = *pod.DeletionGracePeriodSeconds
+		case pod.Spec.TerminationGracePeriodSeconds != nil:
+			gracePeriod = *pod.Spec.TerminationGracePeriodSeconds
+		}
+	}
+	glog.V(2).Infof("Killing container %q with %d second grace period", name, gracePeriod)
+	start := util.Now()
+
+	if pod != nil && container != nil && container.Lifecycle != nil && container.Lifecycle.PreStop != nil {
+		glog.V(4).Infof("Running preStop hook for container %q", name)
+		// TODO: timebox PreStop execution to at most gracePeriod
+		if err := dm.runner.Run(ID, pod, container, container.Lifecycle.PreStop); err != nil {
+			glog.Errorf("preStop hook for container %q failed: %v", name, err)
+		}
+		gracePeriod -= int64(util.Now().Sub(start.Time).Seconds())
+	}
+
+	dm.readinessManager.RemoveReadiness(ID)
+
+	// always give containers a minimal shutdown window to avoid unnecessary SIGKILLs
+	if gracePeriod < minimumGracePeriodInSeconds {
+		gracePeriod = minimumGracePeriodInSeconds
+	}
+	err := dm.client.StopContainer(ID, uint(gracePeriod))
+	if _, ok := err.(*docker.ContainerNotRunning); ok && err != nil {
+		glog.V(4).Infof("Container %q has already exited", name)
+		return nil
+	}
+	if err == nil {
+		glog.V(2).Infof("Container %q exited after %s", name, util.Now().Sub(start.Time))
+	} else {
+		glog.V(2).Infof("Container %q termination failed after %s: %v", name, util.Now().Sub(start.Time), err)
+	}
+	ref, ok := dm.containerRefManager.GetRef(ID)
+	if !ok {
+		glog.Warningf("No ref for pod '%q'", name)
+	} else {
+		// TODO: pass reason down here, and state, or move this call up the stack.
+		dm.recorder.Eventf(ref, "Killing", "Killing with docker id %v", util.ShortenString(ID, 12))
+	}
+	return err
+}
+
+var errNoPodOnContainer = fmt.Errorf("no pod information labels on Docker container")
+
+// containerAndPodFromLabels tries to load the appropriate container info off of a Docker container's labels
+func containerAndPodFromLabels(inspect *docker.Container) (pod *api.Pod, container *api.Container, err error) {
+	if inspect == nil && inspect.Config == nil && inspect.Config.Labels == nil {
+		return nil, nil, errNoPodOnContainer
+	}
+	labels := inspect.Config.Labels
+
+	// the pod data may not be set
+	if body, found := labels[kubernetesPodLabel]; found {
+		pod = &api.Pod{}
+		if err = latest.Codec.DecodeInto([]byte(body), pod); err == nil {
+			name := labels[kubernetesContainerLabel]
 			for ix := range pod.Spec.Containers {
 				if pod.Spec.Containers[ix].Name == name {
 					container = &pod.Spec.Containers[ix]
 					break
 				}
 			}
-			if container != nil {
-				glog.V(1).Infof("Running preStop hook")
-				if err := dm.runner.Run(ID, &pod, container, container.Lifecycle.PreStop); err != nil {
-					glog.Errorf("failed to run preStop hook: %v", err)
-				}
-			} else {
-				glog.Errorf("unable to find container %v, %s", pod, name)
+			if container == nil {
+				err = fmt.Errorf("unable to find container %s in pod %v", name, pod)
+			}
+		} else {
+			pod = nil
+		}
+	}
+
+	// attempt to find the default grace period if we didn't commit a pod, but set the generic metadata
+	// field (the one used by kill)
+	if pod == nil {
+		if period, ok := labels[kubernetesTerminationGracePeriodLabel]; ok {
+			if seconds, err := strconv.ParseInt(period, 10, 64); err == nil {
+				pod = &api.Pod{}
+				pod.DeletionGracePeriodSeconds = &seconds
 			}
 		}
 	}
-	dm.readinessManager.RemoveReadiness(ID)
-	err = dm.client.StopContainer(ID, 10)
-	ref, ok := dm.containerRefManager.GetRef(ID)
-	if !ok {
-		glog.Warningf("No ref for pod '%v'", ID)
-	} else {
-		// TODO: pass reason down here, and state, or move this call up the stack.
-		dm.recorder.Eventf(ref, "Killing", "Killing with docker id %v", util.ShortenString(ID, 12))
-	}
-	return err
+
+	return
 }

 // Run a single container from a pod. Returns the docker container ID
@ -1253,7 +1352,7 @@ func (dm *DockerManager) runContainerInPod(pod *api.Pod, container *api.Containe
 	if container.Lifecycle != nil && container.Lifecycle.PostStart != nil {
 		handlerErr := dm.runner.Run(id, pod, container, container.Lifecycle.PostStart)
 		if handlerErr != nil {
-			dm.killContainer(types.UID(id))
+			dm.killContainer(types.UID(id), container, pod)
 			return kubeletTypes.DockerID(""), fmt.Errorf("failed to call event handler: %v", handlerErr)
 		}
 	}
@ -1534,10 +1633,10 @@ func (dm *DockerManager) SyncPod(pod *api.Pod, runningPod kubecontainer.Pod, pod

 	podFullName := kubecontainer.GetPodFullName(pod)
 	containerChanges, err := dm.computePodContainerChanges(pod, runningPod, podStatus)
-	glog.V(3).Infof("Got container changes for pod %q: %+v", podFullName, containerChanges)
 	if err != nil {
 		return err
 	}
+	glog.V(3).Infof("Got container changes for pod %q: %+v", podFullName, containerChanges)

 	if containerChanges.StartInfraContainer || (len(containerChanges.ContainersToKeep) == 0 && len(containerChanges.ContainersToStart) == 0) {
 		if len(containerChanges.ContainersToKeep) == 0 && len(containerChanges.ContainersToStart) == 0 {
@ -1547,7 +1646,7 @@ func (dm *DockerManager) SyncPod(pod *api.Pod, runningPod kubecontainer.Pod, pod
 		}

 		// Killing phase: if we want to start new infra container, or nothing is running kill everything (including infra container)
-		err = dm.KillPod(runningPod)
+		err = dm.KillPod(pod, runningPod)
 		if err != nil {
 			return err
 		}
@ -1557,7 +1656,15 @@ func (dm *DockerManager) SyncPod(pod *api.Pod, runningPod kubecontainer.Pod, pod
 			_, keep := containerChanges.ContainersToKeep[kubeletTypes.DockerID(container.ID)]
 			if !keep {
 				glog.V(3).Infof("Killing unwanted container %+v", container)
-				err = dm.KillContainer(container.ID)
+				// attempt to find the appropriate container policy
+				var podContainer *api.Container
+				for i, c := range pod.Spec.Containers {
+					if c.Name == container.Name {
+						podContainer = &pod.Spec.Containers[i]
+						break
+					}
+				}
+				err = dm.KillContainerInPod(container.ID, podContainer, pod)
 				if err != nil {
 					glog.Errorf("Error killing container: %v", err)
 				}
--- a/pkg/kubelet/dockertools/manager_test.go
+++ b/pkg/kubelet/dockertools/manager_test.go
@ -405,7 +405,7 @@ func TestKillContainerInPod(t *testing.T) {
 		manager.readinessManager.SetReadiness(c.ID, true)
 	}

-	if err := manager.KillContainerInPod(pod.Spec.Containers[0], pod); err != nil {
+	if err := manager.KillContainerInPod("", &pod.Spec.Containers[0], pod); err != nil {
 		t.Errorf("unexpected error: %v", err)
 	}
 	// Assert the container has been stopped.
@ -478,14 +478,14 @@ func TestKillContainerInPodWithPreStop(t *testing.T) {
 		manager.readinessManager.SetReadiness(c.ID, true)
 	}

-	if err := manager.KillContainerInPod(pod.Spec.Containers[0], pod); err != nil {
+	if err := manager.KillContainerInPod("", &pod.Spec.Containers[0], pod); err != nil {
 		t.Errorf("unexpected error: %v", err)
 	}
 	// Assert the container has been stopped.
 	if err := fakeDocker.AssertStopped([]string{containerToKill.ID}); err != nil {
 		t.Errorf("container was not stopped correctly: %v", err)
 	}
-	verifyCalls(t, fakeDocker, []string{"list", "inspect_container", "create_exec", "start_exec", "stop"})
+	verifyCalls(t, fakeDocker, []string{"list", "create_exec", "start_exec", "stop"})
 	if !reflect.DeepEqual(expectedCmd, fakeDocker.execCmd) {
 		t.Errorf("expected: %v, got %v", expectedCmd, fakeDocker.execCmd)
 	}
@ -522,7 +522,7 @@ func TestKillContainerInPodWithError(t *testing.T) {
 		manager.readinessManager.SetReadiness(c.ID, true)
 	}

-	if err := manager.KillContainerInPod(pod.Spec.Containers[0], pod); err == nil {
+	if err := manager.KillContainerInPod("", &pod.Spec.Containers[0], pod); err == nil {
 		t.Errorf("expected error, found nil")
 	}

@ -568,7 +568,7 @@ func replaceProber(dm *DockerManager, result probe.Result, err error) {
 // Unknown or error.
 //
 // PLEASE READ THE PROBE DOCS BEFORE CHANGING THIS TEST IF YOU ARE UNSURE HOW PROBES ARE SUPPOSED TO WORK:
-// (See https://github.com/GoogleCloudPlatform/kubernetes/blob/master/docs/pod-states.md#pod-conditions)
+// (See https://k8s.io/kubernetes/blob/master/docs/pod-states.md#pod-conditions)
 func TestProbeContainer(t *testing.T) {
 	manager, _ := newTestDockerManager()
 	dc := &docker.APIContainers{
@ -1021,7 +1021,7 @@ func TestSyncPodDeletesWithNoPodInfraContainer(t *testing.T) {

 	verifyCalls(t, fakeDocker, []string{
 		// Kill the container since pod infra container is not running.
-		"inspect_container", "stop",
+		"stop",
 		// Create pod infra container.
 		"create", "start", "inspect_container",
 		// Create container.
@ -1096,7 +1096,7 @@ func TestSyncPodDeletesDuplicate(t *testing.T) {
 		// Check the pod infra container.
 		"inspect_container",
 		// Kill the duplicated container.
-		"inspect_container", "stop",
+		"stop",
 	})
 	// Expect one of the duplicates to be killed.
 	if len(fakeDocker.Stopped) != 1 || (fakeDocker.Stopped[0] != "1234" && fakeDocker.Stopped[0] != "4567") {
@ -1150,7 +1150,7 @@ func TestSyncPodBadHash(t *testing.T) {
 		// Check the pod infra container.
 		"inspect_container",
 		// Kill and restart the bad hash container.
-		"inspect_container", "stop", "create", "start", "inspect_container",
+		"stop", "create", "start", "inspect_container",
 	})

 	if err := fakeDocker.AssertStopped([]string{"1234"}); err != nil {
@ -1208,7 +1208,7 @@ func TestSyncPodsUnhealthy(t *testing.T) {
 		// Check the pod infra container.
 		"inspect_container",
 		// Kill the unhealthy container.
-		"inspect_container", "stop",
+		"stop",
 		// Restart the unhealthy container.
 		"create", "start", "inspect_container",
 	})
@ -1443,7 +1443,7 @@ func TestSyncPodWithRestartPolicy(t *testing.T) {
 				// Check the pod infra container.
 				"inspect_container",
 				// Stop the last pod infra container.
-				"inspect_container", "stop",
+				"stop",
 			},
 			[]string{},
 			[]string{"9876"},
@ -1910,7 +1910,7 @@ func TestSyncPodEventHandlerFails(t *testing.T) {
 		// Create the container.
 		"create", "start",
 		// Kill the container since event handler fails.
-		"inspect_container", "stop",
+		"stop",
 	})

 	// TODO(yifan): Check the stopped container's name.
--- a/pkg/kubelet/kubelet.go
+++ b/pkg/kubelet/kubelet.go
@ -1120,8 +1120,8 @@ func parseResolvConf(reader io.Reader) (nameservers []string, searches []string,
 }

 // Kill all running containers in a pod (includes the pod infra container).
-func (kl *Kubelet) killPod(pod kubecontainer.Pod) error {
-	return kl.containerRuntime.KillPod(pod)
+func (kl *Kubelet) killPod(pod *api.Pod, runningPod kubecontainer.Pod) error {
+	return kl.containerRuntime.KillPod(pod, runningPod)
 }

 type empty struct{}
@ -1177,9 +1177,10 @@ func (kl *Kubelet) syncPod(pod *api.Pod, mirrorPod *api.Pod, runningPod kubecont
 	}()

 	// Kill pods we can't run.
-	err := canRunPod(pod)
-	if err != nil {
-		kl.killPod(runningPod)
+	if err := canRunPod(pod); err != nil || pod.DeletionTimestamp != nil {
+		if err := kl.killPod(pod, runningPod); err != nil {
+			util.HandleError(err)
+		}
 		return err
 	}

@ -1366,6 +1367,32 @@ func (kl *Kubelet) cleanupOrphanedVolumes(pods []*api.Pod, runningPods []*kubeco
 	return nil
 }

+// Delete any pods that are no longer running and are marked for deletion.
+func (kl *Kubelet) cleanupTerminatedPods(pods []*api.Pod, runningPods []*kubecontainer.Pod) error {
+	var terminating []*api.Pod
+	for _, pod := range pods {
+		if pod.DeletionTimestamp != nil {
+			found := false
+			for _, runningPod := range runningPods {
+				if runningPod.ID == pod.UID {
+					found = true
+					break
+				}
+			}
+			if found {
+				podFullName := kubecontainer.GetPodFullName(pod)
+				glog.V(5).Infof("Keeping terminated pod %q and uid %q, still running", podFullName, pod.UID)
+				continue
+			}
+			terminating = append(terminating, pod)
+		}
+	}
+	if !kl.statusManager.TerminatePods(terminating) {
+		return errors.New("not all pods were successfully terminated")
+	}
+	return nil
+}
+
 // pastActiveDeadline returns true if the pod has been active for more than
 // ActiveDeadlineSeconds.
 func (kl *Kubelet) pastActiveDeadline(pod *api.Pod) bool {
@ -1528,6 +1555,10 @@ func (kl *Kubelet) cleanupPods(allPods []*api.Pod, admittedPods []*api.Pod) erro
 	// Remove any orphaned mirror pods.
 	kl.podManager.DeleteOrphanedMirrorPods()

+	if err := kl.cleanupTerminatedPods(allPods, runningPods); err != nil {
+		glog.Errorf("Failed to cleanup terminated pods: %v", err)
+	}
+
 	return err
 }

@ -1550,7 +1581,7 @@ func (kl *Kubelet) killUnwantedPods(desiredPods map[types.UID]empty,
 			}()
 			glog.V(1).Infof("Killing unwanted pod %q", pod.Name)
 			// Stop the containers.
-			err = kl.killPod(*pod)
+			err = kl.killPod(nil, *pod)
 			if err != nil {
 				glog.Errorf("Failed killing the pod %q: %v", pod.Name, err)
 				return
--- a/pkg/kubelet/mirror_client.go
+++ b/pkg/kubelet/mirror_client.go
@ -64,7 +64,7 @@ func (mc *basicMirrorClient) DeleteMirrorPod(podFullName string) error {
 		return err
 	}
 	glog.V(4).Infof("Deleting a mirror pod %q", podFullName)
-	if err := mc.apiserverClient.Pods(namespace).Delete(name, nil); err != nil {
+	if err := mc.apiserverClient.Pods(namespace).Delete(name, api.NewDeleteOptions(0)); err != nil {
 		glog.Errorf("Failed deleting a mirror pod %q: %v", podFullName, err)
 	}
 	return nil
--- a/pkg/kubelet/rkt/rkt.go
+++ b/pkg/kubelet/rkt/rkt.go
@ -676,11 +676,11 @@ func (r *runtime) GetPods(all bool) ([]*kubecontainer.Pod, error) {
 }

 // KillPod invokes 'systemctl kill' to kill the unit that runs the pod.
-func (r *runtime) KillPod(pod kubecontainer.Pod) error {
-	glog.V(4).Infof("Rkt is killing pod: name %q.", pod.Name)
+func (r *runtime) KillPod(pod *api.Pod, runningPod kubecontainer.Pod) error {
+	glog.V(4).Infof("Rkt is killing pod: name %q.", runningPod.Name)

 	// TODO(yifan): More graceful stop. Replace with StopUnit and wait for a timeout.
-	r.systemd.KillUnit(makePodServiceFileName(pod.ID), int32(syscall.SIGKILL))
+	r.systemd.KillUnit(makePodServiceFileName(runningPod.ID), int32(syscall.SIGKILL))
 	return r.systemd.Reload()
 }

@ -887,7 +887,7 @@ func (r *runtime) SyncPod(pod *api.Pod, runningPod kubecontainer.Pod, podStatus

 	if restartPod {
 		// TODO(yifan): Handle network plugin.
-		if err := r.KillPod(runningPod); err != nil {
+		if err := r.KillPod(pod, runningPod); err != nil {
 			return err
 		}
 		if err := r.RunPod(pod); err != nil {
--- a/pkg/kubelet/status_manager.go
+++ b/pkg/kubelet/status_manager.go
@ -24,6 +24,7 @@ import (

 	"github.com/golang/glog"
 	"k8s.io/kubernetes/pkg/api"
+	"k8s.io/kubernetes/pkg/api/errors"
 	"k8s.io/kubernetes/pkg/client"
 	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
 	kubeletTypes "k8s.io/kubernetes/pkg/kubelet/types"
@ -122,7 +123,7 @@ func (s *statusManager) SetPodStatus(pod *api.Pod, status api.PodStatus) {
 	// Currently this routine is not called for the same pod from multiple
 	// workers and/or the kubelet but dropping the lock before sending the
 	// status down the channel feels like an easy way to get a bullet in foot.
-	if !found || !isStatusEqual(&oldStatus, &status) {
+	if !found || !isStatusEqual(&oldStatus, &status) || pod.DeletionTimestamp != nil {
 		s.podStatuses[podFullName] = status
 		s.podStatusChannel <- podStatusSyncRequest{pod, status}
 	} else {
@ -130,6 +131,29 @@ func (s *statusManager) SetPodStatus(pod *api.Pod, status api.PodStatus) {
 	}
 }

+// TerminatePods resets the container status for the provided pods to terminated and triggers
+// a status update. This function may not enqueue all the provided pods, in which case it will
+// return false
+func (s *statusManager) TerminatePods(pods []*api.Pod) bool {
+	sent := true
+	s.podStatusesLock.Lock()
+	defer s.podStatusesLock.Unlock()
+	for _, pod := range pods {
+		for i := range pod.Status.ContainerStatuses {
+			pod.Status.ContainerStatuses[i].State = api.ContainerState{
+				Terminated: &api.ContainerStateTerminated{},
+			}
+		}
+		select {
+		case s.podStatusChannel <- podStatusSyncRequest{pod, pod.Status}:
+		default:
+			sent = false
+			glog.V(4).Infof("Termination notice for %q was dropped because the status channel is full", kubeletUtil.FormatPodName(pod))
+		}
+	}
+	return sent
+}
+
 func (s *statusManager) DeletePodStatus(podFullName string) {
 	s.podStatusesLock.Lock()
 	defer s.podStatusesLock.Unlock()
@ -161,13 +185,33 @@ func (s *statusManager) syncBatch() error {
 	}
 	// TODO: make me easier to express from client code
 	statusPod, err = s.kubeClient.Pods(statusPod.Namespace).Get(statusPod.Name)
+	if errors.IsNotFound(err) {
+		glog.V(3).Infof("Pod %q was deleted on the server", pod.Name)
+		return nil
+	}
 	if err == nil {
+		if len(pod.UID) > 0 && statusPod.UID != pod.UID {
+			glog.V(3).Infof("Pod %q was deleted and then recreated, skipping status update", kubeletUtil.FormatPodName(pod))
+			return nil
+		}
 		statusPod.Status = status
-		_, err = s.kubeClient.Pods(pod.Namespace).UpdateStatus(statusPod)
 		// TODO: handle conflict as a retry, make that easier too.
+		statusPod, err = s.kubeClient.Pods(pod.Namespace).UpdateStatus(statusPod)
 		if err == nil {
 			glog.V(3).Infof("Status for pod %q updated successfully", kubeletUtil.FormatPodName(pod))
-			return nil
+
+			if pod.DeletionTimestamp == nil {
+				return nil
+			}
+			if !notRunning(pod.Status.ContainerStatuses) {
+				glog.V(3).Infof("Pod %q is terminated, but some pods are still running", pod.Name)
+				return nil
+			}
+			if err := s.kubeClient.Pods(statusPod.Namespace).Delete(statusPod.Name, api.NewDeleteOptions(0)); err == nil {
+				glog.V(3).Infof("Pod %q fully terminated and removed from etcd", statusPod.Name)
+				s.DeletePodStatus(podFullName)
+				return nil
+			}
 		}
 	}

@ -181,3 +225,14 @@ func (s *statusManager) syncBatch() error {
 	go s.DeletePodStatus(podFullName)
 	return fmt.Errorf("error updating status for pod %q: %v", kubeletUtil.FormatPodName(pod), err)
 }
+
+// notRunning returns true if every status is terminated or waiting, or the status list
+// is empty.
+func notRunning(statuses []api.ContainerStatus) bool {
+	for _, status := range statuses {
+		if status.State.Terminated == nil && status.State.Waiting == nil {
+			return false
+		}
+	}
+	return true
+}
--- a/pkg/kubelet/status_manager_test.go
+++ b/pkg/kubelet/status_manager_test.go
@ -153,8 +153,21 @@ func TestUnchangedStatus(t *testing.T) {
 	verifyUpdates(t, syncer, 1)
 }

+func TestSyncBatchIgnoresNotFound(t *testing.T) {
+	syncer := newTestStatusManager()
+	syncer.SetPodStatus(testPod, getRandomPodStatus())
+	err := syncer.syncBatch()
+	if err != nil {
+		t.Errorf("unexpected syncing error: %v", err)
+	}
+	verifyActions(t, syncer.kubeClient, []testclient.Action{
+		testclient.GetActionImpl{ActionImpl: testclient.ActionImpl{Verb: "get", Resource: "pods"}},
+	})
+}
+
 func TestSyncBatch(t *testing.T) {
 	syncer := newTestStatusManager()
+	syncer.kubeClient = testclient.NewSimpleFake(testPod)
 	syncer.SetPodStatus(testPod, getRandomPodStatus())
 	err := syncer.syncBatch()
 	if err != nil {
@ -167,6 +180,22 @@ func TestSyncBatch(t *testing.T) {
 	)
 }

+func TestSyncBatchChecksMismatchedUID(t *testing.T) {
+	syncer := newTestStatusManager()
+	testPod.UID = "first"
+	differentPod := *testPod
+	differentPod.UID = "second"
+	syncer.kubeClient = testclient.NewSimpleFake(testPod)
+	syncer.SetPodStatus(&differentPod, getRandomPodStatus())
+	err := syncer.syncBatch()
+	if err != nil {
+		t.Errorf("unexpected syncing error: %v", err)
+	}
+	verifyActions(t, syncer.kubeClient, []testclient.Action{
+		testclient.GetActionImpl{ActionImpl: testclient.ActionImpl{Verb: "get", Resource: "pods"}},
+	})
+}
+
 // shuffle returns a new shuffled list of container statuses.
 func shuffle(statuses []api.ContainerStatus) []api.ContainerStatus {
 	numStatuses := len(statuses)
--- a/pkg/registry/controller/etcd/etcd_test.go
+++ b/pkg/registry/controller/etcd/etcd_test.go
@ -695,7 +695,7 @@ func TestDelete(t *testing.T) {
 		// If the controller is still around after trying to delete either the delete
 		// failed, or we're deleting it gracefully.
 		if fakeClient.Data[key].R.Node != nil {
-			return true
+			return fakeClient.Data[key].R.Node.TTL != 0
 		}
 		return false
 	}
--- a/pkg/registry/event/etcd/etcd_test.go
+++ b/pkg/registry/event/etcd/etcd_test.go
@ -37,6 +37,7 @@ var testTTL uint64 = 60

 func NewTestEventStorage(t *testing.T) (*tools.FakeEtcdClient, *REST) {
 	f := tools.NewFakeEtcdClient(t)
+	f.HideExpires = true
 	f.TestIndex = true

 	s := etcdstorage.NewEtcdStorage(f, testapi.Codec(), etcdtest.PathPrefix())
--- a/pkg/registry/generic/etcd/etcd.go
+++ b/pkg/registry/generic/etcd/etcd.go
@ -341,6 +341,11 @@ func (e *Etcd) Get(ctx api.Context, name string) (runtime.Object, error) {
 	return obj, nil
 }

+var (
+	errAlreadyDeleting = fmt.Errorf("abort delete")
+	errDeleteNow       = fmt.Errorf("delete now")
+)
+
 // Delete removes the item from etcd.
 func (e *Etcd) Delete(ctx api.Context, name string, options *api.DeleteOptions) (runtime.Object, error) {
 	key, err := e.KeyFunc(ctx, name)
@ -367,13 +372,41 @@ func (e *Etcd) Delete(ctx api.Context, name string, options *api.DeleteOptions)
 	if pendingGraceful {
 		return e.finalizeDelete(obj, false)
 	}
-	if graceful && *options.GracePeriodSeconds != 0 {
+	if graceful {
 		trace.Step("Graceful deletion")
 		out := e.NewFunc()
-		if err := e.Storage.Set(key, obj, out, uint64(*options.GracePeriodSeconds)); err != nil {
+		lastGraceful := int64(0)
+		err := e.Storage.GuaranteedUpdate(
+			key, out, false,
+			storage.SimpleUpdate(func(existing runtime.Object) (runtime.Object, error) {
+				graceful, pendingGraceful, err := rest.BeforeDelete(e.DeleteStrategy, ctx, existing, options)
+				if err != nil {
+					return nil, err
+				}
+				if pendingGraceful {
+					return nil, errAlreadyDeleting
+				}
+				if !graceful {
+					return nil, errDeleteNow
+				}
+				lastGraceful = *options.GracePeriodSeconds
+				return existing, nil
+			}),
+		)
+		switch err {
+		case nil:
+			if lastGraceful > 0 {
+				return out, nil
+			}
+			// fall through and delete immediately
+		case errDeleteNow:
+			// we've updated the object to have a zero grace period, or it's already at 0, so
+			// we should fall through and truly delete the object.
+		case errAlreadyDeleting:
+			return e.finalizeDelete(obj, true)
+		default:
 			return nil, etcderr.InterpretUpdateError(err, e.EndpointName, name)
 		}
-		return e.finalizeDelete(out, true)
 	}

 	// delete immediately, or no graceful deletion supported
--- a/pkg/registry/persistentvolume/etcd/etcd_test.go
+++ b/pkg/registry/persistentvolume/etcd/etcd_test.go
@ -300,7 +300,7 @@ func TestEtcdUpdateStatus(t *testing.T) {
 	key, _ := storage.KeyFunc(ctx, "foo")
 	key = etcdtest.AddPrefix(key)
 	pvStart := validNewPersistentVolume("foo")
-	fakeClient.Set(key, runtime.EncodeOrDie(latest.Codec, pvStart), 1)
+	fakeClient.Set(key, runtime.EncodeOrDie(latest.Codec, pvStart), 0)

 	pvIn := &api.PersistentVolume{
 		ObjectMeta: api.ObjectMeta{
--- a/pkg/registry/persistentvolumeclaim/etcd/etcd_test.go
+++ b/pkg/registry/persistentvolumeclaim/etcd/etcd_test.go
@ -298,7 +298,7 @@ func TestEtcdUpdateStatus(t *testing.T) {
 	key, _ := storage.KeyFunc(ctx, "foo")
 	key = etcdtest.AddPrefix(key)
 	pvcStart := validNewPersistentVolumeClaim("foo", api.NamespaceDefault)
-	fakeClient.Set(key, runtime.EncodeOrDie(latest.Codec, pvcStart), 1)
+	fakeClient.Set(key, runtime.EncodeOrDie(latest.Codec, pvcStart), 0)

 	pvc := &api.PersistentVolumeClaim{
 		ObjectMeta: api.ObjectMeta{
--- a/pkg/registry/pod/etcd/etcd_test.go
+++ b/pkg/registry/pod/etcd/etcd_test.go
@ -56,6 +56,7 @@ func newStorage(t *testing.T) (*REST, *BindingREST, *StatusREST, *tools.FakeEtcd
 }

 func validNewPod() *api.Pod {
+	grace := int64(30)
 	return &api.Pod{
 		ObjectMeta: api.ObjectMeta{
 			Name:      "foo",
@ -64,6 +65,8 @@ func validNewPod() *api.Pod {
 		Spec: api.PodSpec{
 			RestartPolicy: api.RestartPolicyAlways,
 			DNSPolicy:     api.DNSClusterFirst,
+
+			TerminationGracePeriodSeconds: &grace,
 			Containers: []api.Container{
 				{
 					Name:            "foo",
@ -118,8 +121,10 @@ func TestDelete(t *testing.T) {
 	key = etcdtest.AddPrefix(key)
 	test := resttest.New(t, storage, fakeEtcdClient.SetError)

+	expectedNode := "some-node"
 	createFn := func() runtime.Object {
 		pod := validChangedPod()
+		pod.Spec.NodeName = expectedNode
 		fakeEtcdClient.Data[key] = tools.EtcdResponseWithError{
 			R: &etcd.Response{
 				Node: &etcd.Node{
@ -134,8 +139,17 @@ func TestDelete(t *testing.T) {
 		if fakeEtcdClient.Data[key].R.Node == nil {
 			return false
 		}
-		return fakeEtcdClient.Data[key].R.Node.TTL == 30
+		obj, err := latest.Codec.Decode([]byte(fakeEtcdClient.Data[key].R.Node.Value))
+		if err != nil {
+			return false
+		}
+		pod := obj.(*api.Pod)
+		t.Logf("found object %#v", pod.ObjectMeta)
+		return pod.DeletionTimestamp != nil && pod.DeletionGracePeriodSeconds != nil && *pod.DeletionGracePeriodSeconds != 0
 	}
+	test.TestDeleteGraceful(createFn, 30, gracefulSetFn)
+
+	expectedNode = ""
 	test.TestDelete(createFn, gracefulSetFn)
 }

@ -1027,6 +1041,7 @@ func TestEtcdUpdateScheduled(t *testing.T) {
 		},
 	}), 1)

+	grace := int64(30)
 	podIn := api.Pod{
 		ObjectMeta: api.ObjectMeta{
 			Name:            "foo",
@ -1048,6 +1063,8 @@ func TestEtcdUpdateScheduled(t *testing.T) {
 			},
 			RestartPolicy: api.RestartPolicyAlways,
 			DNSPolicy:     api.DNSClusterFirst,
+
+			TerminationGracePeriodSeconds: &grace,
 		},
 	}
 	_, _, err := registry.Update(ctx, &podIn)
@ -1088,7 +1105,7 @@ func TestEtcdUpdateStatus(t *testing.T) {
 			},
 		},
 	}
-	fakeClient.Set(key, runtime.EncodeOrDie(latest.Codec, &podStart), 1)
+	fakeClient.Set(key, runtime.EncodeOrDie(latest.Codec, &podStart), 0)

 	podIn := api.Pod{
 		ObjectMeta: api.ObjectMeta{
@ -1117,6 +1134,8 @@ func TestEtcdUpdateStatus(t *testing.T) {

 	expected := podStart
 	expected.ResourceVersion = "2"
+	grace := int64(30)
+	expected.Spec.TerminationGracePeriodSeconds = &grace
 	expected.Spec.RestartPolicy = api.RestartPolicyAlways
 	expected.Spec.DNSPolicy = api.DNSClusterFirst
 	expected.Spec.Containers[0].ImagePullPolicy = api.PullIfNotPresent
--- a/pkg/registry/pod/rest.go
+++ b/pkg/registry/pod/rest.go
@ -81,15 +81,49 @@ func (podStrategy) ValidateUpdate(ctx api.Context, obj, old runtime.Object) fiel
 	return append(errorList, validation.ValidatePodUpdate(obj.(*api.Pod), old.(*api.Pod))...)
 }

+// AllowUnconditionalUpdate allows pods to be overwritten
 func (podStrategy) AllowUnconditionalUpdate() bool {
 	return true
 }

-// CheckGracefulDelete allows a pod to be gracefully deleted.
+// CheckGracefulDelete allows a pod to be gracefully deleted. It updates the DeleteOptions to
+// reflect the desired grace value.
 func (podStrategy) CheckGracefulDelete(obj runtime.Object, options *api.DeleteOptions) bool {
+	if options == nil {
+		return false
+	}
+	pod := obj.(*api.Pod)
+	period := int64(0)
+	// user has specified a value
+	if options.GracePeriodSeconds != nil {
+		period = *options.GracePeriodSeconds
+	} else {
+		// use the default value if set, or deletes the pod immediately (0)
+		if pod.Spec.TerminationGracePeriodSeconds != nil {
+			period = *pod.Spec.TerminationGracePeriodSeconds
+		}
+	}
+	// if the pod is not scheduled, delete immediately
+	if len(pod.Spec.NodeName) == 0 {
+		period = 0
+	}
+	// ensure the options and the pod are in sync
+	options.GracePeriodSeconds = &period
+	return true
+}
+
+type podStrategyWithoutGraceful struct {
+	podStrategy
+}
+
+// CheckGracefulDelete prohibits graceful deletion.
+func (podStrategyWithoutGraceful) CheckGracefulDelete(obj runtime.Object, options *api.DeleteOptions) bool {
 	return false
 }

+// StrategyWithoutGraceful implements the legacy instant delele behavior.
+var StrategyWithoutGraceful = podStrategyWithoutGraceful{Strategy}
+
 type podStatusStrategy struct {
 	podStrategy
 }
@ -100,6 +134,7 @@ func (podStatusStrategy) PrepareForUpdate(obj, old runtime.Object) {
 	newPod := obj.(*api.Pod)
 	oldPod := old.(*api.Pod)
 	newPod.Spec = oldPod.Spec
+	newPod.DeletionTimestamp = nil
 }

 func (podStatusStrategy) ValidateUpdate(ctx api.Context, obj, old runtime.Object) fielderrors.ValidationErrorList {
--- a/pkg/registry/resourcequota/etcd/etcd_test.go
+++ b/pkg/registry/resourcequota/etcd/etcd_test.go
@ -389,7 +389,7 @@ func TestEtcdUpdateStatus(t *testing.T) {
 	key, _ := registry.KeyFunc(ctx, "foo")
 	key = etcdtest.AddPrefix(key)
 	resourcequotaStart := validNewResourceQuota()
-	fakeClient.Set(key, runtime.EncodeOrDie(latest.Codec, resourcequotaStart), 1)
+	fakeClient.Set(key, runtime.EncodeOrDie(latest.Codec, resourcequotaStart), 0)

 	resourcequotaIn := &api.ResourceQuota{
 		ObjectMeta: api.ObjectMeta{
--- a/pkg/storage/cacher_test.go
+++ b/pkg/storage/cacher_test.go
@ -54,11 +54,13 @@ func newTestCacher(client tools.EtcdClient) *storage.Cacher {
 }

 func makeTestPod(name string) *api.Pod {
+	gracePeriod := int64(30)
 	return &api.Pod{
 		ObjectMeta: api.ObjectMeta{Namespace: "ns", Name: name},
 		Spec: api.PodSpec{
-			DNSPolicy:     api.DNSClusterFirst,
-			RestartPolicy: api.RestartPolicyAlways,
+			TerminationGracePeriodSeconds: &gracePeriod,
+			DNSPolicy:                     api.DNSClusterFirst,
+			RestartPolicy:                 api.RestartPolicyAlways,
 		},
 	}
 }
--- a/pkg/storage/etcd/etcd_helper.go
+++ b/pkg/storage/etcd/etcd_helper.go
@ -394,14 +394,21 @@ func (h *etcdHelper) GuaranteedUpdate(key string, ptrToType runtime.Object, igno
 		ttl := uint64(0)
 		if node != nil {
 			index = node.ModifiedIndex
-			if node.TTL > 0 {
+			if node.TTL != 0 {
 				ttl = uint64(node.TTL)
 			}
+			if node.Expiration != nil && ttl == 0 {
+				ttl = 1
+			}
 		} else if res != nil {
 			index = res.EtcdIndex
 		}

 		if newTTL != nil {
+			if ttl != 0 && *newTTL == 0 {
+				// TODO: remove this after we have verified this is no longer an issue
+				glog.V(4).Infof("GuaranteedUpdate is clearing TTL for %q, may not be intentional", key)
+			}
 			ttl = *newTTL
 		}

--- a/pkg/storage/etcd/etcd_helper_test.go
+++ b/pkg/storage/etcd/etcd_helper_test.go
@ -123,28 +123,32 @@ func TestList(t *testing.T) {
 			},
 		},
 	}
+	grace := int64(30)
 	expect := api.PodList{
 		ListMeta: api.ListMeta{ResourceVersion: "10"},
 		Items: []api.Pod{
 			{
 				ObjectMeta: api.ObjectMeta{Name: "bar", ResourceVersion: "2"},
 				Spec: api.PodSpec{
-					RestartPolicy: api.RestartPolicyAlways,
-					DNSPolicy:     api.DNSClusterFirst,
+					RestartPolicy:                 api.RestartPolicyAlways,
+					DNSPolicy:                     api.DNSClusterFirst,
+					TerminationGracePeriodSeconds: &grace,
 				},
 			},
 			{
 				ObjectMeta: api.ObjectMeta{Name: "baz", ResourceVersion: "3"},
 				Spec: api.PodSpec{
-					RestartPolicy: api.RestartPolicyAlways,
-					DNSPolicy:     api.DNSClusterFirst,
+					RestartPolicy:                 api.RestartPolicyAlways,
+					DNSPolicy:                     api.DNSClusterFirst,
+					TerminationGracePeriodSeconds: &grace,
 				},
 			},
 			{
 				ObjectMeta: api.ObjectMeta{Name: "foo", ResourceVersion: "1"},
 				Spec: api.PodSpec{
-					RestartPolicy: api.RestartPolicyAlways,
-					DNSPolicy:     api.DNSClusterFirst,
+					RestartPolicy:                 api.RestartPolicyAlways,
+					DNSPolicy:                     api.DNSClusterFirst,
+					TerminationGracePeriodSeconds: &grace,
 				},
 			},
 		},
@ -206,6 +210,7 @@ func TestListAcrossDirectories(t *testing.T) {
 			},
 		},
 	}
+	grace := int64(30)
 	expect := api.PodList{
 		ListMeta: api.ListMeta{ResourceVersion: "10"},
 		Items: []api.Pod{
@ -213,22 +218,25 @@ func TestListAcrossDirectories(t *testing.T) {
 			{
 				ObjectMeta: api.ObjectMeta{Name: "baz", ResourceVersion: "3"},
 				Spec: api.PodSpec{
-					RestartPolicy: api.RestartPolicyAlways,
-					DNSPolicy:     api.DNSClusterFirst,
+					RestartPolicy:                 api.RestartPolicyAlways,
+					DNSPolicy:                     api.DNSClusterFirst,
+					TerminationGracePeriodSeconds: &grace,
 				},
 			},
 			{
 				ObjectMeta: api.ObjectMeta{Name: "foo", ResourceVersion: "1"},
 				Spec: api.PodSpec{
-					RestartPolicy: api.RestartPolicyAlways,
-					DNSPolicy:     api.DNSClusterFirst,
+					RestartPolicy:                 api.RestartPolicyAlways,
+					DNSPolicy:                     api.DNSClusterFirst,
+					TerminationGracePeriodSeconds: &grace,
 				},
 			},
 			{
 				ObjectMeta: api.ObjectMeta{Name: "bar", ResourceVersion: "2"},
 				Spec: api.PodSpec{
-					RestartPolicy: api.RestartPolicyAlways,
-					DNSPolicy:     api.DNSClusterFirst,
+					RestartPolicy:                 api.RestartPolicyAlways,
+					DNSPolicy:                     api.DNSClusterFirst,
+					TerminationGracePeriodSeconds: &grace,
 				},
 			},
 		},
@ -278,28 +286,32 @@ func TestListExcludesDirectories(t *testing.T) {
 			},
 		},
 	}
+	grace := int64(30)
 	expect := api.PodList{
 		ListMeta: api.ListMeta{ResourceVersion: "10"},
 		Items: []api.Pod{
 			{
 				ObjectMeta: api.ObjectMeta{Name: "bar", ResourceVersion: "2"},
 				Spec: api.PodSpec{
-					RestartPolicy: api.RestartPolicyAlways,
-					DNSPolicy:     api.DNSClusterFirst,
+					RestartPolicy:                 api.RestartPolicyAlways,
+					DNSPolicy:                     api.DNSClusterFirst,
+					TerminationGracePeriodSeconds: &grace,
 				},
 			},
 			{
 				ObjectMeta: api.ObjectMeta{Name: "baz", ResourceVersion: "3"},
 				Spec: api.PodSpec{
-					RestartPolicy: api.RestartPolicyAlways,
-					DNSPolicy:     api.DNSClusterFirst,
+					RestartPolicy:                 api.RestartPolicyAlways,
+					DNSPolicy:                     api.DNSClusterFirst,
+					TerminationGracePeriodSeconds: &grace,
 				},
 			},
 			{
 				ObjectMeta: api.ObjectMeta{Name: "foo", ResourceVersion: "1"},
 				Spec: api.PodSpec{
-					RestartPolicy: api.RestartPolicyAlways,
-					DNSPolicy:     api.DNSClusterFirst,
+					RestartPolicy:                 api.RestartPolicyAlways,
+					DNSPolicy:                     api.DNSClusterFirst,
+					TerminationGracePeriodSeconds: &grace,
 				},
 			},
 		},
@ -319,11 +331,13 @@ func TestGet(t *testing.T) {
 	fakeClient := tools.NewFakeEtcdClient(t)
 	helper := newEtcdHelper(fakeClient, testapi.Codec(), etcdtest.PathPrefix())
 	key := etcdtest.AddPrefix("/some/key")
+	grace := int64(30)
 	expect := api.Pod{
 		ObjectMeta: api.ObjectMeta{Name: "foo"},
 		Spec: api.PodSpec{
-			RestartPolicy: api.RestartPolicyAlways,
-			DNSPolicy:     api.DNSClusterFirst,
+			RestartPolicy:                 api.RestartPolicyAlways,
+			DNSPolicy:                     api.DNSClusterFirst,
+			TerminationGracePeriodSeconds: &grace,
 		},
 	}
 	fakeClient.Set(key, runtime.EncodeOrDie(testapi.Codec(), &expect), 0)
--- a/pkg/storage/etcd/etcd_watcher.go
+++ b/pkg/storage/etcd/etcd_watcher.go
@ -38,6 +38,7 @@ const (
 	EtcdSet    = "set"
 	EtcdCAS    = "compareAndSwap"
 	EtcdDelete = "delete"
+	EtcdExpire = "expire"
 )

 // TransformFunc attempts to convert an object to another object for use with a watcher.
@ -353,7 +354,7 @@ func (w *etcdWatcher) sendResult(res *etcd.Response) {
 		w.sendAdd(res)
 	case EtcdSet, EtcdCAS:
 		w.sendModify(res)
-	case EtcdDelete:
+	case EtcdDelete, EtcdExpire:
 		w.sendDelete(res)
 	default:
 		glog.Errorf("unknown action: %v", res.Action)
--- a/pkg/tools/fake_etcd_client.go
+++ b/pkg/tools/fake_etcd_client.go
@ -20,6 +20,7 @@ import (
 	"errors"
 	"sort"
 	"sync"
+	"time"

 	"github.com/coreos/go-etcd/etcd"
 )
@ -52,6 +53,8 @@ type FakeEtcdClient struct {
 	TestIndex   bool
 	ChangeIndex uint64
 	LastSetTTL  uint64
+	// Will avoid setting the expires header on objects to make comparison easier
+	HideExpires bool
 	Machines    []string

 	// Will become valid after Watch is called; tester may write to it. Tester may
@ -175,6 +178,11 @@ func (f *FakeEtcdClient) setLocked(key, value string, ttl uint64) (*etcd.Respons
 		prevResult := f.Data[key]
 		createdIndex := prevResult.R.Node.CreatedIndex
 		f.t.Logf("updating %v, index %v -> %v (ttl: %d)", key, createdIndex, i, ttl)
+		var expires *time.Time
+		if !f.HideExpires && ttl > 0 {
+			now := time.Now()
+			expires = &now
+		}
 		result := EtcdResponseWithError{
 			R: &etcd.Response{
 				Node: &etcd.Node{
@ -182,6 +190,7 @@ func (f *FakeEtcdClient) setLocked(key, value string, ttl uint64) (*etcd.Respons
 					CreatedIndex:  createdIndex,
 					ModifiedIndex: i,
 					TTL:           int64(ttl),
+					Expiration:    expires,
 				},
 			},
 		}
--- a/plugin/pkg/scheduler/factory/factory_test.go
+++ b/plugin/pkg/scheduler/factory/factory_test.go
@ -132,11 +132,13 @@ func PriorityTwo(pod *api.Pod, podLister algorithm.PodLister, minionLister algor
 }

 func TestDefaultErrorFunc(t *testing.T) {
+	grace := int64(30)
 	testPod := &api.Pod{
 		ObjectMeta: api.ObjectMeta{Name: "foo", Namespace: "bar"},
 		Spec: api.PodSpec{
-			RestartPolicy: api.RestartPolicyAlways,
-			DNSPolicy:     api.DNSClusterFirst,
+			RestartPolicy:                 api.RestartPolicyAlways,
+			DNSPolicy:                     api.DNSClusterFirst,
+			TerminationGracePeriodSeconds: &grace,
 		},
 	}
 	handler := util.FakeHandler{
--- a/test/e2e/kubectl.go
+++ b/test/e2e/kubectl.go
@ -791,7 +791,7 @@ func getUDData(jpgExpected string, ns string) func(*client.Client, string) error
 		if strings.Contains(data.Image, jpgExpected) {
 			return nil
 		} else {
-			return errors.New(fmt.Sprintf("data served up in container is innaccurate, %s didn't contain %s", data, jpgExpected))
+			return errors.New(fmt.Sprintf("data served up in container is inaccurate, %s didn't contain %s", data, jpgExpected))
 		}
 	}
 }
--- a/test/e2e/pd.go
+++ b/test/e2e/pd.go
@ -78,8 +78,8 @@ var _ = Describe("Pod Disks", func() {
 			By("cleaning up PD-RW test environment")
 			// Teardown pods, PD. Ignore errors.
 			// Teardown should do nothing unless test failed.
-			podClient.Delete(host0Pod.Name, nil)
-			podClient.Delete(host1Pod.Name, nil)
+			podClient.Delete(host0Pod.Name, api.NewDeleteOptions(0))
+			podClient.Delete(host1Pod.Name, api.NewDeleteOptions(0))
 			detachPD(host0Name, diskName)
 			detachPD(host1Name, diskName)
 			deletePD(diskName)
@ -98,7 +98,7 @@ var _ = Describe("Pod Disks", func() {
 		Logf("Wrote value: %v", testFileContents)

 		By("deleting host0Pod")
-		expectNoError(podClient.Delete(host0Pod.Name, nil), "Failed to delete host0Pod")
+		expectNoError(podClient.Delete(host0Pod.Name, api.NewDeleteOptions(0)), "Failed to delete host0Pod")

 		By("submitting host1Pod to kubernetes")
 		_, err = podClient.Create(host1Pod)
@ -113,7 +113,7 @@ var _ = Describe("Pod Disks", func() {
 		Expect(strings.TrimSpace(v)).To(Equal(strings.TrimSpace(testFileContents)))

 		By("deleting host1Pod")
-		expectNoError(podClient.Delete(host1Pod.Name, nil), "Failed to delete host1Pod")
+		expectNoError(podClient.Delete(host1Pod.Name, api.NewDeleteOptions(0)), "Failed to delete host1Pod")

 		By(fmt.Sprintf("deleting PD %q", diskName))
 		deletePDWithRetry(diskName)
@ -136,9 +136,9 @@ var _ = Describe("Pod Disks", func() {
 			By("cleaning up PD-RO test environment")
 			// Teardown pods, PD. Ignore errors.
 			// Teardown should do nothing unless test failed.
-			podClient.Delete(rwPod.Name, nil)
-			podClient.Delete(host0ROPod.Name, nil)
-			podClient.Delete(host1ROPod.Name, nil)
+			podClient.Delete(rwPod.Name, api.NewDeleteOptions(0))
+			podClient.Delete(host0ROPod.Name, api.NewDeleteOptions(0))
+			podClient.Delete(host1ROPod.Name, api.NewDeleteOptions(0))

 			detachPD(host0Name, diskName)
 			detachPD(host1Name, diskName)
@ -149,7 +149,7 @@ var _ = Describe("Pod Disks", func() {
 		_, err = podClient.Create(rwPod)
 		expectNoError(err, "Failed to create rwPod")
 		expectNoError(framework.WaitForPodRunning(rwPod.Name))
-		expectNoError(podClient.Delete(rwPod.Name, nil), "Failed to delete host0Pod")
+		expectNoError(podClient.Delete(rwPod.Name, api.NewDeleteOptions(0)), "Failed to delete host0Pod")
 		expectNoError(waitForPDDetach(diskName, host0Name))

 		By("submitting host0ROPod to kubernetes")
@ -165,10 +165,10 @@ var _ = Describe("Pod Disks", func() {
 		expectNoError(framework.WaitForPodRunning(host1ROPod.Name))

 		By("deleting host0ROPod")
-		expectNoError(podClient.Delete(host0ROPod.Name, nil), "Failed to delete host0ROPod")
+		expectNoError(podClient.Delete(host0ROPod.Name, api.NewDeleteOptions(0)), "Failed to delete host0ROPod")

 		By("deleting host1ROPod")
-		expectNoError(podClient.Delete(host1ROPod.Name, nil), "Failed to delete host1ROPod")
+		expectNoError(podClient.Delete(host1ROPod.Name, api.NewDeleteOptions(0)), "Failed to delete host1ROPod")

 		By(fmt.Sprintf("deleting PD %q", diskName))
 		deletePDWithRetry(diskName)
--- a/test/e2e/pods.go
+++ b/test/e2e/pods.go
@ -43,7 +43,7 @@ func runLivenessTest(c *client.Client, ns string, podDescr *api.Pod, expectResta
 	// At the end of the test, clean up by removing the pod.
 	defer func() {
 		By("deleting the pod")
-		c.Pods(ns).Delete(podDescr.Name, nil)
+		c.Pods(ns).Delete(podDescr.Name, api.NewDeleteOptions(0))
 	}()

 	// Wait until the pod is not pending. (Here we need to check for something other than
@ -86,15 +86,14 @@ func runLivenessTest(c *client.Client, ns string, podDescr *api.Pod, expectResta
 func testHostIP(c *client.Client, ns string, pod *api.Pod) {
 	podClient := c.Pods(ns)
 	By("creating pod")
-	defer podClient.Delete(pod.Name, nil)
-	_, err := podClient.Create(pod)
-	if err != nil {
+	defer podClient.Delete(pod.Name, api.NewDeleteOptions(0))
+	if _, err := podClient.Create(pod); err != nil {
 		Failf("Failed to create pod: %v", err)
 	}
 	By("ensuring that pod is running and has a hostIP")
 	// Wait for the pods to enter the running state. Waiting loops until the pods
 	// are running so non-running pods cause a timeout for this test.
-	err = waitForPodRunningInNamespace(c, pod.Name, ns)
+	err := waitForPodRunningInNamespace(c, pod.Name, ns)
 	Expect(err).NotTo(HaveOccurred())
 	// Try to make sure we get a hostIP for each pod.
 	hostIPTimeout := 2 * time.Minute
@ -222,7 +221,7 @@ var _ = Describe("Pods", func() {
 		// We call defer here in case there is a problem with
 		// the test so we can ensure that we clean up after
 		// ourselves
-		defer podClient.Delete(pod.Name, nil)
+		defer podClient.Delete(pod.Name, api.NewDeleteOptions(0))
 		_, err = podClient.Create(pod)
 		if err != nil {
 			Failf("Failed to create pod: %v", err)
@ -235,7 +234,7 @@ var _ = Describe("Pods", func() {
 		}
 		Expect(len(pods.Items)).To(Equal(1))

-		By("veryfying pod creation was observed")
+		By("verifying pod creation was observed")
 		select {
 		case event, _ := <-w.ResultChan():
 			if event.Type != watch.Added {
@ -245,7 +244,7 @@ var _ = Describe("Pods", func() {
 			Fail("Timeout while waiting for pod creation")
 		}

-		By("deleting the pod")
+		By("deleting the pod gracefully")
 		if err := podClient.Delete(pod.Name, nil); err != nil {
 			Failf("Failed to delete pod: %v", err)
 		}
@ -253,11 +252,13 @@ var _ = Describe("Pods", func() {
 		By("verifying pod deletion was observed")
 		deleted := false
 		timeout := false
+		var lastPod *api.Pod
 		timer := time.After(podStartTimeout)
 		for !deleted && !timeout {
 			select {
 			case event, _ := <-w.ResultChan():
 				if event.Type == watch.Deleted {
+					lastPod = event.Object.(*api.Pod)
 					deleted = true
 				}
 			case <-timer:
@ -268,6 +269,9 @@ var _ = Describe("Pods", func() {
 			Fail("Failed to observe pod deletion")
 		}

+		Expect(lastPod.DeletionTimestamp).ToNot(BeNil())
+		Expect(lastPod.Spec.TerminationGracePeriodSeconds).ToNot(BeZero())
+
 		pods, err = podClient.List(labels.SelectorFromSet(labels.Set(map[string]string{"time": value})), fields.Everything())
 		if err != nil {
 			Fail(fmt.Sprintf("Failed to list pods to verify deletion: %v", err))
@ -312,7 +316,7 @@ var _ = Describe("Pods", func() {
 		By("submitting the pod to kubernetes")
 		defer func() {
 			By("deleting the pod")
-			podClient.Delete(pod.Name, nil)
+			podClient.Delete(pod.Name, api.NewDeleteOptions(0))
 		}()
 		pod, err := podClient.Create(pod)
 		if err != nil {
@ -376,7 +380,7 @@ var _ = Describe("Pods", func() {
 				},
 			},
 		}
-		defer framework.Client.Pods(framework.Namespace.Name).Delete(serverPod.Name, nil)
+		defer framework.Client.Pods(framework.Namespace.Name).Delete(serverPod.Name, api.NewDeleteOptions(0))
 		_, err := framework.Client.Pods(framework.Namespace.Name).Create(serverPod)
 		if err != nil {
 			Failf("Failed to create serverPod: %v", err)
@ -600,7 +604,7 @@ var _ = Describe("Pods", func() {
 				// We call defer here in case there is a problem with
 				// the test so we can ensure that we clean up after
 				// ourselves
-				podClient.Delete(pod.Name)
+				podClient.Delete(pod.Name, api.NewDeleteOptions(0))
 			}()

 			By("waiting for the pod to start running")
@ -673,7 +677,7 @@ var _ = Describe("Pods", func() {
 				// We call defer here in case there is a problem with
 				// the test so we can ensure that we clean up after
 				// ourselves
-				podClient.Delete(pod.Name)
+				podClient.Delete(pod.Name, api.NewDeleteOptions(0))
 			}()

 			By("waiting for the pod to start running")
--- a/test/e2e/util.go
+++ b/test/e2e/util.go
@ -831,20 +831,24 @@ func expectNoError(err error, explain ...interface{}) {
 	ExpectWithOffset(1, err).NotTo(HaveOccurred(), explain...)
 }

-// Stops everything from filePath from namespace ns and checks if everything maching selectors from the given namespace is correctly stopped.
+// Stops everything from filePath from namespace ns and checks if everything matching selectors from the given namespace is correctly stopped.
 func cleanup(filePath string, ns string, selectors ...string) {
-	By("using stop to clean up resources")
+	By("using delete to clean up resources")
 	var nsArg string
 	if ns != "" {
 		nsArg = fmt.Sprintf("--namespace=%s", ns)
 	}
-	runKubectl("stop", "-f", filePath, nsArg)
+	runKubectl("stop", "--grace-period=0", "-f", filePath, nsArg)

 	for _, selector := range selectors {
-		resources := runKubectl("get", "pods,rc,svc", "-l", selector, "--no-headers", nsArg)
+		resources := runKubectl("get", "rc,svc", "-l", selector, "--no-headers", nsArg)
 		if resources != "" {
 			Failf("Resources left running after stop:\n%s", resources)
 		}
+		pods := runKubectl("get", "pods", "-l", selector, nsArg, "-t", "{{ range .items }}{{ if not .metadata.deletionTimestamp }}{{ .metadata.name }}{{ \"\\n\" }}{{ end }}{{ end }}")
+		if pods != "" {
+			Failf("Pods left unterminated after stop:\n%s", pods)
+		}
 	}
 }

--- a/test/images/network-tester/slow-pod.json
+++ b/test/images/network-tester/slow-pod.json
@ -0,0 +1,28 @@
+{
+  "kind": "Pod",
+  "apiVersion": "v1",
+  "metadata": {
+    "name": "slow-pod",
+    "labels": {
+      "name": "nettest"
+    }
+  },
+  "spec": {
+    "containers": [
+      {
+        "name": "webserver",
+        "image": "gcr.io/google_containers/nettest:1.5",
+        "args": [
+          "-service=nettest",
+          "-delay-shutdown=10"
+        ],
+        "ports": [
+          {
+            "containerPort": 8080,
+            "protocol": "TCP"
+          }
+        ]
+      }
+    ]
+  }
+}
--- a/test/images/network-tester/slow-rc.json
+++ b/test/images/network-tester/slow-rc.json
@ -0,0 +1,42 @@
+{
+  "kind": "ReplicationController",
+  "apiVersion": "v1",
+  "metadata": {
+    "name": "slow-rc",
+    "labels": {
+      "name": "nettest"
+    }
+  },
+  "spec": {
+    "replicas": 8,
+    "selector": {
+      "name": "nettest"
+    },
+    "template": {
+      "metadata": {
+        "labels": {
+          "name": "nettest"
+        }
+      },
+      "spec": {
+        "terminationGracePeriodSeconds": 5,
+        "containers": [
+          {
+            "name": "webserver",
+            "image": "gcr.io/google_containers/nettest:1.5",
+            "args": [
+              "-service=nettest",
+              "-delay-shutdown=10"
+            ],
+            "ports": [
+              {
+                "containerPort": 8080,
+                "protocol": "TCP"
+              }
+            ]
+          }
+        ]
+      }
+    }
+  }
+}
--- a/test/images/network-tester/webserver.go
+++ b/test/images/network-tester/webserver.go
@ -39,7 +39,9 @@ import (
 	"math/rand"
 	"net/http"
 	"os"
+	"os/signal"
 	"sync"
+	"syscall"
 	"time"

 	"k8s.io/kubernetes/pkg/client"
@ -47,10 +49,11 @@ import (
 )

 var (
-	port      = flag.Int("port", 8080, "Port number to serve at.")
-	peerCount = flag.Int("peers", 8, "Must find at least this many peers for the test to pass.")
-	service   = flag.String("service", "nettest", "Service to find other network test pods in.")
-	namespace = flag.String("namespace", "default", "Namespace of this pod. TODO: kubernetes should make this discoverable.")
+	port          = flag.Int("port", 8080, "Port number to serve at.")
+	peerCount     = flag.Int("peers", 8, "Must find at least this many peers for the test to pass.")
+	service       = flag.String("service", "nettest", "Service to find other network test pods in.")
+	namespace     = flag.String("namespace", "default", "Namespace of this pod. TODO: kubernetes should make this discoverable.")
+	delayShutdown = flag.Int("delay-shutdown", 0, "Number of seconds to delay shutdown when receiving SIGTERM.")
 )

 // State tracks the internal state of our little http server.
@ -178,6 +181,17 @@ func main() {
 		log.Fatalf("Error getting hostname: %v", err)
 	}

+	if *delayShutdown > 0 {
+		termCh := make(chan os.Signal)
+		signal.Notify(termCh, syscall.SIGTERM)
+		go func() {
+			<-termCh
+			log.Printf("Sleeping %d seconds before exit ...", *delayShutdown)
+			time.Sleep(time.Duration(*delayShutdown) * time.Second)
+			os.Exit(0)
+		}()
+	}
+
 	state := State{
 		Hostname:             hostname,
 		StillContactingPeers: true,
--- a/test/integration/auth_test.go
+++ b/test/integration/auth_test.go
@ -232,7 +232,7 @@ var deleteNow string = `
 {
  "kind": "DeleteOptions",
  "apiVersion": "` + testapi.Version() + `",
-  "gracePeriodSeconds": null%s
+  "gracePeriodSeconds": 0%s
 }
 `

--- a/test/integration/etcd_tools_test.go
+++ b/test/integration/etcd_tools_test.go
@ -80,6 +80,59 @@ func TestGet(t *testing.T) {
 	})
 }

+func TestWriteTTL(t *testing.T) {
+	client := framework.NewEtcdClient()
+	etcdStorage := etcd.NewEtcdStorage(client, testapi.Codec(), "")
+	framework.WithEtcdKey(func(key string) {
+		testObject := api.ServiceAccount{ObjectMeta: api.ObjectMeta{Name: "foo"}}
+		if err := etcdStorage.Set(key, &testObject, nil, 0); err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		result := &api.ServiceAccount{}
+		err := etcdStorage.GuaranteedUpdate(key, result, false, func(obj runtime.Object, res storage.ResponseMeta) (runtime.Object, *uint64, error) {
+			if in, ok := obj.(*api.ServiceAccount); !ok || in.Name != "foo" {
+				t.Fatalf("unexpected existing object: %v", obj)
+			}
+			if res.TTL != 0 {
+				t.Fatalf("unexpected TTL: %#v", res)
+			}
+			ttl := uint64(10)
+			out := &api.ServiceAccount{ObjectMeta: api.ObjectMeta{Name: "out"}}
+			return out, &ttl, nil
+		})
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if result.Name != "out" {
+			t.Errorf("unexpected response: %#v", result)
+		}
+		if res, err := client.Get(key, false, false); err != nil || res == nil || res.Node.TTL != 10 {
+			t.Fatalf("unexpected get: %v %#v", err, res)
+		}
+
+		result = &api.ServiceAccount{}
+		err = etcdStorage.GuaranteedUpdate(key, result, false, func(obj runtime.Object, res storage.ResponseMeta) (runtime.Object, *uint64, error) {
+			if in, ok := obj.(*api.ServiceAccount); !ok || in.Name != "out" {
+				t.Fatalf("unexpected existing object: %v", obj)
+			}
+			if res.TTL <= 1 {
+				t.Fatalf("unexpected TTL: %#v", res)
+			}
+			out := &api.ServiceAccount{ObjectMeta: api.ObjectMeta{Name: "out2"}}
+			return out, nil, nil
+		})
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if result.Name != "out2" {
+			t.Errorf("unexpected response: %#v", result)
+		}
+		if res, err := client.Get(key, false, false); err != nil || res == nil || res.Node.TTL <= 1 {
+			t.Fatalf("unexpected get: %v %#v", err, res)
+		}
+	})
+}
+
 func TestWatch(t *testing.T) {
 	client := framework.NewEtcdClient()
 	etcdStorage := etcd.NewEtcdStorage(client, testapi.Codec(), etcdtest.PathPrefix())
--- a/test/integration/scheduler_test.go
+++ b/test/integration/scheduler_test.go
@ -277,7 +277,7 @@ func DoTestUnschedulableNodes(t *testing.T, restClient *client.Client, nodeStore
 			t.Logf("Test %d: Pod got scheduled on a schedulable node", i)
 		}

-		err = restClient.Pods(api.NamespaceDefault).Delete(myPod.Name, nil)
+		err = restClient.Pods(api.NamespaceDefault).Delete(myPod.Name, api.NewDeleteOptions(0))
 		if err != nil {
 			t.Errorf("Failed to delete pod: %v", err)
 		}