From c58970e47ca3de6cc16a538432bc7f400fe9a84f Mon Sep 17 00:00:00 2001 From: David Ashpole Date: Thu, 23 Feb 2017 10:31:20 -0800 Subject: [PATCH] critical pods can preempt other pods to be admitted --- pkg/api/v1/resource_helpers.go | 28 + pkg/kubelet/BUILD | 2 + pkg/kubelet/events/event.go | 1 + pkg/kubelet/kubelet.go | 4 +- pkg/kubelet/kubelet_test.go | 2 +- pkg/kubelet/lifecycle/BUILD | 2 + .../admission_failure_handler_stub.go | 36 ++ pkg/kubelet/lifecycle/predicate.go | 28 +- pkg/kubelet/preemption/BUILD | 58 +++ pkg/kubelet/preemption/preemption.go | 264 ++++++++++ pkg/kubelet/preemption/preemption_test.go | 480 ++++++++++++++++++ .../scheduler/algorithm/predicates/error.go | 4 + test/e2e/framework/pods.go | 19 +- test/e2e_node/BUILD | 4 + test/e2e_node/critical_pod_test.go | 148 ++++++ 15 files changed, 1071 insertions(+), 9 deletions(-) create mode 100644 pkg/kubelet/lifecycle/admission_failure_handler_stub.go create mode 100644 pkg/kubelet/preemption/BUILD create mode 100644 pkg/kubelet/preemption/preemption.go create mode 100644 pkg/kubelet/preemption/preemption_test.go create mode 100644 test/e2e_node/critical_pod_test.go diff --git a/pkg/api/v1/resource_helpers.go b/pkg/api/v1/resource_helpers.go index e5b791e0e5..ec84232762 100644 --- a/pkg/api/v1/resource_helpers.go +++ b/pkg/api/v1/resource_helpers.go @@ -227,3 +227,31 @@ func PodRequestsAndLimits(pod *Pod) (reqs map[ResourceName]resource.Quantity, li } return } + +// finds and returns the request for a specific resource. +func GetResourceRequest(pod *Pod, resource ResourceName) int64 { + if resource == ResourcePods { + return 1 + } + totalResources := int64(0) + for _, container := range pod.Spec.Containers { + if rQuantity, ok := container.Resources.Requests[resource]; ok { + if resource == ResourceCPU { + totalResources += rQuantity.MilliValue() + } else { + totalResources += rQuantity.Value() + } + } + } + // take max_resource(sum_pod, any_init_container) + for _, container := range pod.Spec.InitContainers { + if rQuantity, ok := container.Resources.Requests[resource]; ok { + if resource == ResourceCPU && rQuantity.MilliValue() > totalResources { + totalResources = rQuantity.MilliValue() + } else if rQuantity.Value() > totalResources { + totalResources = rQuantity.Value() + } + } + } + return totalResources +} diff --git a/pkg/kubelet/BUILD b/pkg/kubelet/BUILD index 18f9967c97..66acc81e0c 100644 --- a/pkg/kubelet/BUILD +++ b/pkg/kubelet/BUILD @@ -65,6 +65,7 @@ go_library( "//pkg/kubelet/network:go_default_library", "//pkg/kubelet/pleg:go_default_library", "//pkg/kubelet/pod:go_default_library", + "//pkg/kubelet/preemption:go_default_library", "//pkg/kubelet/prober:go_default_library", "//pkg/kubelet/prober/results:go_default_library", "//pkg/kubelet/qos:go_default_library", @@ -253,6 +254,7 @@ filegroup( "//pkg/kubelet/network:all-srcs", "//pkg/kubelet/pleg:all-srcs", "//pkg/kubelet/pod:all-srcs", + "//pkg/kubelet/preemption:all-srcs", "//pkg/kubelet/prober:all-srcs", "//pkg/kubelet/qos:all-srcs", "//pkg/kubelet/remote:all-srcs", diff --git a/pkg/kubelet/events/event.go b/pkg/kubelet/events/event.go index c1fa45d552..8124962f95 100644 --- a/pkg/kubelet/events/event.go +++ b/pkg/kubelet/events/event.go @@ -28,6 +28,7 @@ const ( FailedToCreateContainer = "Failed" FailedToStartContainer = "Failed" KillingContainer = "Killing" + PreemptContainer = "Preempting" BackOffStartContainer = "BackOff" ExceededGracePeriod = "ExceededGracePeriod" diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index 31fed0bb33..999dddc2b3 
100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -75,6 +75,7 @@ import ( "k8s.io/kubernetes/pkg/kubelet/network" "k8s.io/kubernetes/pkg/kubelet/pleg" kubepod "k8s.io/kubernetes/pkg/kubelet/pod" + "k8s.io/kubernetes/pkg/kubelet/preemption" "k8s.io/kubernetes/pkg/kubelet/prober" proberesults "k8s.io/kubernetes/pkg/kubelet/prober/results" "k8s.io/kubernetes/pkg/kubelet/remote" @@ -783,7 +784,8 @@ func NewMainKubelet(kubeCfg *componentconfig.KubeletConfiguration, kubeDeps *Kub klet.AddPodSyncLoopHandler(activeDeadlineHandler) klet.AddPodSyncHandler(activeDeadlineHandler) - klet.admitHandlers.AddPodAdmitHandler(lifecycle.NewPredicateAdmitHandler(klet.getNodeAnyWay)) + criticalPodAdmissionHandler := preemption.NewCriticalPodAdmissionHandler(klet.getActivePods, killPodNow(klet.podWorkers, kubeDeps.Recorder), kubeDeps.Recorder) + klet.admitHandlers.AddPodAdmitHandler(lifecycle.NewPredicateAdmitHandler(klet.getNodeAnyWay, criticalPodAdmissionHandler)) // apply functional Option's for _, opt := range kubeDeps.Options { opt(klet) diff --git a/pkg/kubelet/kubelet_test.go b/pkg/kubelet/kubelet_test.go index b342c61235..0904369464 100644 --- a/pkg/kubelet/kubelet_test.go +++ b/pkg/kubelet/kubelet_test.go @@ -248,7 +248,7 @@ func newTestKubeletWithImageList( kubelet.evictionManager = evictionManager kubelet.admitHandlers.AddPodAdmitHandler(evictionAdmitHandler) // Add this as cleanup predicate pod admitter - kubelet.admitHandlers.AddPodAdmitHandler(lifecycle.NewPredicateAdmitHandler(kubelet.getNodeAnyWay)) + kubelet.admitHandlers.AddPodAdmitHandler(lifecycle.NewPredicateAdmitHandler(kubelet.getNodeAnyWay, lifecycle.NewAdmissionFailureHandlerStub())) plug := &volumetest.FakeVolumePlugin{PluginName: "fake", Host: nil} kubelet.volumePluginMgr, err = diff --git a/pkg/kubelet/lifecycle/BUILD b/pkg/kubelet/lifecycle/BUILD index e7f9c84721..7ee5ab965a 100644 --- a/pkg/kubelet/lifecycle/BUILD +++ b/pkg/kubelet/lifecycle/BUILD @@ -11,6 +11,7 @@ load( go_library( name = "go_default_library", srcs = [ + "admission_failure_handler_stub.go", "doc.go", "fake_handler_runner.go", "handlers.go", @@ -24,6 +25,7 @@ go_library( "//pkg/kubelet/types:go_default_library", "//pkg/kubelet/util/format:go_default_library", "//pkg/security/apparmor:go_default_library", + "//plugin/pkg/scheduler/algorithm:go_default_library", "//plugin/pkg/scheduler/algorithm/predicates:go_default_library", "//plugin/pkg/scheduler/schedulercache:go_default_library", "//vendor:github.com/golang/glog", diff --git a/pkg/kubelet/lifecycle/admission_failure_handler_stub.go b/pkg/kubelet/lifecycle/admission_failure_handler_stub.go new file mode 100644 index 0000000000..19266d4ced --- /dev/null +++ b/pkg/kubelet/lifecycle/admission_failure_handler_stub.go @@ -0,0 +1,36 @@ +/* +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package lifecycle + +import ( + "k8s.io/kubernetes/pkg/api/v1" + "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm" +) + +// AdmissionFailureHandlerStub is an AdmissionFailureHandler that does not perform any handling of admission failure. +// It simply passes the failure on. +type AdmissionFailureHandlerStub struct{} + +var _ AdmissionFailureHandler = &AdmissionFailureHandlerStub{} + +func NewAdmissionFailureHandlerStub() *AdmissionFailureHandlerStub { + return &AdmissionFailureHandlerStub{} +} + +func (n *AdmissionFailureHandlerStub) HandleAdmissionFailure(pod *v1.Pod, failureReasons []algorithm.PredicateFailureReason) (bool, []algorithm.PredicateFailureReason, error) { + return false, failureReasons, nil +} diff --git a/pkg/kubelet/lifecycle/predicate.go b/pkg/kubelet/lifecycle/predicate.go index 6c8f24b7ef..82e255caa1 100644 --- a/pkg/kubelet/lifecycle/predicate.go +++ b/pkg/kubelet/lifecycle/predicate.go @@ -22,20 +22,30 @@ import ( "github.com/golang/glog" "k8s.io/kubernetes/pkg/api/v1" "k8s.io/kubernetes/pkg/kubelet/util/format" + "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm" "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm/predicates" "k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache" ) type getNodeAnyWayFuncType func() (*v1.Node, error) + +// AdmissionFailureHandler is an interface which defines how to deal with a failure to admit a pod. +// This allows for the graceful handling of pod admission failure. +type AdmissionFailureHandler interface { + HandleAdmissionFailure(pod *v1.Pod, failureReasons []algorithm.PredicateFailureReason) (bool, []algorithm.PredicateFailureReason, error) +} + type predicateAdmitHandler struct { - getNodeAnyWayFunc getNodeAnyWayFuncType + getNodeAnyWayFunc getNodeAnyWayFuncType + admissionFailureHandler AdmissionFailureHandler } var _ PodAdmitHandler = &predicateAdmitHandler{} -func NewPredicateAdmitHandler(getNodeAnyWayFunc getNodeAnyWayFuncType) *predicateAdmitHandler { +func NewPredicateAdmitHandler(getNodeAnyWayFunc getNodeAnyWayFuncType, admissionFailureHandler AdmissionFailureHandler) *predicateAdmitHandler { return &predicateAdmitHandler{ getNodeAnyWayFunc, + admissionFailureHandler, } } @@ -59,10 +69,22 @@ func (w *predicateAdmitHandler) Admit(attrs *PodAdmitAttributes) PodAdmitResult glog.Warningf("Failed to admit pod %v - %s", format.Pod(pod), message) return PodAdmitResult{ Admit: fit, - Reason: "UnexpectedError", + Reason: "UnexpectedAdmissionError", Message: message, } } + if !fit { + fit, reasons, err = w.admissionFailureHandler.HandleAdmissionFailure(pod, reasons) + if err != nil { + message := fmt.Sprintf("Unexpected error while attempting to recover from admission failure: %v", err) + glog.Warningf("Failed to admit pod %v - %s", format.Pod(pod), message) + return PodAdmitResult{ + Admit: fit, + Reason: "UnexpectedAdmissionError", + Message: message, + } + } + } if !fit { var reason string var message string diff --git a/pkg/kubelet/preemption/BUILD b/pkg/kubelet/preemption/BUILD new file mode 100644 index 0000000000..45c43cb8aa --- /dev/null +++ b/pkg/kubelet/preemption/BUILD @@ -0,0 +1,58 @@ +package(default_visibility = ["//visibility:public"]) + +licenses(["notice"]) + +load( + "@io_bazel_rules_go//go:def.bzl", + "go_library", + "go_test", +) + +go_library( + name = "go_default_library", + srcs = ["preemption.go"], + tags = ["automanaged"], + deps = [ + "//pkg/api/v1:go_default_library", + "//pkg/features:go_default_library", + "//pkg/kubelet/events:go_default_library", + "//pkg/kubelet/eviction:go_default_library", 
+ "//pkg/kubelet/lifecycle:go_default_library", + "//pkg/kubelet/qos:go_default_library", + "//pkg/kubelet/types:go_default_library", + "//pkg/kubelet/util/format:go_default_library", + "//plugin/pkg/scheduler/algorithm:go_default_library", + "//plugin/pkg/scheduler/algorithm/predicates:go_default_library", + "//vendor:github.com/golang/glog", + "//vendor:k8s.io/apiserver/pkg/util/feature", + "//vendor:k8s.io/client-go/tools/record", + ], +) + +filegroup( + name = "package-srcs", + srcs = glob(["**"]), + tags = ["automanaged"], + visibility = ["//visibility:private"], +) + +filegroup( + name = "all-srcs", + srcs = [":package-srcs"], + tags = ["automanaged"], +) + +go_test( + name = "go_default_test", + srcs = ["preemption_test.go"], + library = ":go_default_library", + tags = ["automanaged"], + deps = [ + "//pkg/api:go_default_library", + "//pkg/api/v1:go_default_library", + "//pkg/kubelet/types:go_default_library", + "//vendor:k8s.io/apimachinery/pkg/api/resource", + "//vendor:k8s.io/apimachinery/pkg/apis/meta/v1", + "//vendor:k8s.io/client-go/tools/record", + ], +) diff --git a/pkg/kubelet/preemption/preemption.go b/pkg/kubelet/preemption/preemption.go new file mode 100644 index 0000000000..f5ab045cc6 --- /dev/null +++ b/pkg/kubelet/preemption/preemption.go @@ -0,0 +1,264 @@ +/* +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package preemption + +import ( + "fmt" + "math" + + "github.com/golang/glog" + utilfeature "k8s.io/apiserver/pkg/util/feature" + "k8s.io/client-go/tools/record" + "k8s.io/kubernetes/pkg/api/v1" + "k8s.io/kubernetes/pkg/features" + "k8s.io/kubernetes/pkg/kubelet/events" + "k8s.io/kubernetes/pkg/kubelet/eviction" + "k8s.io/kubernetes/pkg/kubelet/lifecycle" + "k8s.io/kubernetes/pkg/kubelet/qos" + kubetypes "k8s.io/kubernetes/pkg/kubelet/types" + "k8s.io/kubernetes/pkg/kubelet/util/format" + "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm" + "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm/predicates" +) + +const message = "Preempted in order to admit critical pod" + +// CriticalPodAdmissionFailureHandler is an AdmissionFailureHandler that handles admission failure for Critical Pods. +// If the ONLY admission failures are due to insufficient resources, then CriticalPodAdmissionHandler evicts pods +// so that the critical pod can be admitted. For evictions, the CriticalPodAdmissionHandler evicts a set of pods that +// frees up the required resource requests. The set of pods is designed to minimize impact, and is prioritized according to the ordering: +// minimal impact for guaranteed pods > minimal impact for burstable pods > minimal impact for besteffort pods. +// minimal impact is defined as follows: fewest pods evicted > fewest total requests of pods. +// finding the fewest total requests of pods is considered besteffort. 
+type CriticalPodAdmissionHandler struct {
+	getPodsFunc eviction.ActivePodsFunc
+	killPodFunc eviction.KillPodFunc
+	recorder    record.EventRecorder
+}
+
+var _ lifecycle.AdmissionFailureHandler = &CriticalPodAdmissionHandler{}
+
+func NewCriticalPodAdmissionHandler(getPodsFunc eviction.ActivePodsFunc, killPodFunc eviction.KillPodFunc, recorder record.EventRecorder) *CriticalPodAdmissionHandler {
+	return &CriticalPodAdmissionHandler{
+		getPodsFunc: getPodsFunc,
+		killPodFunc: killPodFunc,
+		recorder:    recorder,
+	}
+}
+
+// HandleAdmissionFailure gracefully handles admission rejection and, in some cases,
+// evicts pods to allow admission of the pod despite its previous failure.
+func (c *CriticalPodAdmissionHandler) HandleAdmissionFailure(pod *v1.Pod, failureReasons []algorithm.PredicateFailureReason) (bool, []algorithm.PredicateFailureReason, error) {
+	if !kubetypes.IsCriticalPod(pod) || !utilfeature.DefaultFeatureGate.Enabled(features.ExperimentalCriticalPodAnnotation) {
+		return false, failureReasons, nil
+	}
+	// InsufficientResourceError is not a reason to reject a critical pod.
+	// Instead of rejecting, we free up resources to admit it, if no other reasons for rejection exist.
+	nonResourceReasons := []algorithm.PredicateFailureReason{}
+	resourceReasons := []*admissionRequirement{}
+	for _, reason := range failureReasons {
+		if r, ok := reason.(*predicates.InsufficientResourceError); ok {
+			resourceReasons = append(resourceReasons, &admissionRequirement{
+				resourceName: r.ResourceName,
+				quantity:     r.GetInsufficientAmount(),
+			})
+		} else {
+			nonResourceReasons = append(nonResourceReasons, reason)
+		}
+	}
+	if len(nonResourceReasons) > 0 {
+		// Return only reasons that are not resource related, since critical pods cannot fail admission for resource reasons.
+		return false, nonResourceReasons, nil
+	}
+	err := c.evictPodsToFreeRequests(admissionRequirementList(resourceReasons))
+	// if no error is returned, preemption succeeded and the pod is safe to admit.
+	return err == nil, nil, err
+}
+
+// evictPodsToFreeRequests takes a list of insufficient resources, and attempts to free them by evicting pods
+// based on requests. For example, if the only insufficient resource is 200MB of memory, this function could
+// evict a pod with request=250MB.
+func (c *CriticalPodAdmissionHandler) evictPodsToFreeRequests(insufficientResources admissionRequirementList) error {
+	podsToPreempt, err := getPodsToPreempt(c.getPodsFunc(), insufficientResources)
+	if err != nil {
+		return fmt.Errorf("preemption: error finding a set of pods to preempt: %v", err)
+	}
+	glog.Infof("preemption: attempting to evict pods %v, in order to free up resources: %s", podsToPreempt, insufficientResources.toString())
+	for _, pod := range podsToPreempt {
+		status := v1.PodStatus{
+			Phase:   v1.PodFailed,
+			Message: message,
+			Reason:  events.PreemptContainer,
+		}
+		// record that we are evicting the pod
+		c.recorder.Eventf(pod, v1.EventTypeWarning, events.PreemptContainer, message)
+		// this is a blocking call and should only return when the pod and its containers are killed.
+		err := c.killPodFunc(pod, status, nil)
+		if err != nil {
+			return fmt.Errorf("preemption: error evicting pod %s: %v", format.Pod(pod), err)
+		}
+		glog.Infof("preemption: pod %s evicted successfully", format.Pod(pod))
+	}
+	return nil
+}
+
+// getPodsToPreempt returns a list of pods that could be preempted to free requests >= requirements
+func getPodsToPreempt(pods []*v1.Pod, requirements admissionRequirementList) ([]*v1.Pod, error) {
+	bestEffortPods, burstablePods, guaranteedPods := sortPodsByQOS(pods)
+
+	// make sure that pods exist to reclaim the requirements
+	unableToMeetRequirements := requirements.subtract(append(append(bestEffortPods, burstablePods...), guaranteedPods...)...)
+	if len(unableToMeetRequirements) > 0 {
+		return nil, fmt.Errorf("no set of running pods found to reclaim resources: %v", unableToMeetRequirements.toString())
+	}
+	// find the guaranteed pods we would need to evict if we already evicted ALL burstable and besteffort pods.
+	guaranteedToEvict, err := getPodsToPreemptByDistance(guaranteedPods, requirements.subtract(append(bestEffortPods, burstablePods...)...))
+	if err != nil {
+		return nil, err
+	}
+	// Find the burstable pods we would need to evict if we already evicted ALL besteffort pods, and the required guaranteed pods.
+	burstableToEvict, err := getPodsToPreemptByDistance(burstablePods, requirements.subtract(append(bestEffortPods, guaranteedToEvict...)...))
+	if err != nil {
+		return nil, err
+	}
+	// Find the besteffort pods we would need to evict if we already evicted the required guaranteed and burstable pods.
+	bestEffortToEvict, err := getPodsToPreemptByDistance(bestEffortPods, requirements.subtract(append(burstableToEvict, guaranteedToEvict...)...))
+	if err != nil {
+		return nil, err
+	}
+	return append(append(bestEffortToEvict, burstableToEvict...), guaranteedToEvict...), nil
+}
+
+// getPodsToPreemptByDistance finds a set of pods whose combined requests cover the admission requirements.
+// It chooses pods that minimize "distance" to the requirements.
+// If more than one pod fulfills the remaining requirements equally well,
+// it chooses the pod with the smaller resource request.
+// By repeatedly choosing the pod that fulfills as much of the requirements as possible,
+// this method attempts to minimize the number of pods returned.
+func getPodsToPreemptByDistance(pods []*v1.Pod, requirements admissionRequirementList) ([]*v1.Pod, error) {
+	podsToEvict := []*v1.Pod{}
+	// evict pods by shortest distance from remaining requirements, updating requirements every round.
+	for len(requirements) > 0 {
+		if len(pods) == 0 {
+			return nil, fmt.Errorf("no set of running pods found to reclaim resources: %v", requirements.toString())
+		}
+		// all distances are at most len(requirements), because the max distance for a single requirement is 1.
+		// Initializing bestDistance above that maximum guarantees the first pod becomes the initial candidate.
+		bestDistance := float64(len(requirements) + 1)
+		bestPodIndex := 0
+		// Find the pod with the smallest distance from requirements
+		// Or, in the case of two equidistant pods, find the pod with "smaller" resource requests.
+ for i, pod := range pods { + dist := requirements.distance(pod) + if dist < bestDistance || (bestDistance == dist && smallerResourceRequest(pod, pods[bestPodIndex])) { + bestDistance = dist + bestPodIndex = i + } + } + // subtract the pod from requirements, and transfer the pod from input-pods to pods-to-evicted + requirements = requirements.subtract(pods[bestPodIndex]) + podsToEvict = append(podsToEvict, pods[bestPodIndex]) + pods[bestPodIndex] = pods[len(pods)-1] + pods = pods[:len(pods)-1] + } + return podsToEvict, nil +} + +type admissionRequirement struct { + resourceName v1.ResourceName + quantity int64 +} + +type admissionRequirementList []*admissionRequirement + +// distance of the pods requests from the admissionRequirements. +// distance is measured by the fraction of the requirement satisfied by the pod, +// so that each requirement is weighted equally, regardless of absolute magnitude. +func (a admissionRequirementList) distance(pod *v1.Pod) float64 { + dist := float64(0) + for _, req := range a { + remainingRequest := float64(req.quantity - v1.GetResourceRequest(pod, req.resourceName)) + if remainingRequest < 0 { + remainingRequest = 0 + } + dist += math.Pow(remainingRequest/float64(req.quantity), 2) + } + return dist +} + +// returns a new admissionRequirementList containing remaining requirements if the provided pod +// were to be preempted +func (a admissionRequirementList) subtract(pods ...*v1.Pod) admissionRequirementList { + newList := []*admissionRequirement{} + for _, req := range a { + newQuantity := req.quantity + for _, pod := range pods { + newQuantity -= v1.GetResourceRequest(pod, req.resourceName) + } + if newQuantity > 0 { + newList = append(newList, &admissionRequirement{ + resourceName: req.resourceName, + quantity: newQuantity, + }) + } + } + return newList +} + +func (a admissionRequirementList) toString() string { + s := "[" + for _, req := range a { + s += fmt.Sprintf("(res: %v, q: %d), ", req.resourceName, req.quantity) + } + return s + "]" +} + +// returns lists containing non-critical besteffort, burstable, and guaranteed pods +func sortPodsByQOS(pods []*v1.Pod) (bestEffort, burstable, guaranteed []*v1.Pod) { + for _, pod := range pods { + if !kubetypes.IsCriticalPod(pod) { + switch qos.GetPodQOS(pod) { + case v1.PodQOSBestEffort: + bestEffort = append(bestEffort, pod) + case v1.PodQOSBurstable: + burstable = append(burstable, pod) + case v1.PodQOSGuaranteed: + guaranteed = append(guaranteed, pod) + default: + } + } + } + return +} + +// returns true if pod1 has a smaller request than pod2 +func smallerResourceRequest(pod1 *v1.Pod, pod2 *v1.Pod) bool { + priorityList := []v1.ResourceName{ + v1.ResourceNvidiaGPU, + v1.ResourceMemory, + v1.ResourceCPU, + } + for _, res := range priorityList { + req1 := v1.GetResourceRequest(pod1, res) + req2 := v1.GetResourceRequest(pod2, res) + if req1 < req2 { + return true + } else if req1 > req2 { + return false + } + } + return true +} diff --git a/pkg/kubelet/preemption/preemption_test.go b/pkg/kubelet/preemption/preemption_test.go new file mode 100644 index 0000000000..a84a734983 --- /dev/null +++ b/pkg/kubelet/preemption/preemption_test.go @@ -0,0 +1,480 @@ +/* +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package preemption + +import ( + "fmt" + "testing" + + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/tools/record" + kubeapi "k8s.io/kubernetes/pkg/api" + "k8s.io/kubernetes/pkg/api/v1" + kubetypes "k8s.io/kubernetes/pkg/kubelet/types" +) + +const ( + critical = "critical" + bestEffort = "bestEffort" + burstable = "burstable" + highRequestBurstable = "high-request-burstable" + guaranteed = "guaranteed" + highRequestGuaranteed = "high-request-guaranteed" + tinyBurstable = "tiny" + maxPods = 110 +) + +type fakePodKiller struct { + killedPods []*v1.Pod +} + +func newFakePodKiller() *fakePodKiller { + return &fakePodKiller{killedPods: []*v1.Pod{}} +} + +func (f *fakePodKiller) clear() { + f.killedPods = []*v1.Pod{} +} + +func (f *fakePodKiller) getKilledPods() []*v1.Pod { + return f.killedPods +} + +func (f *fakePodKiller) killPodNow(pod *v1.Pod, status v1.PodStatus, gracePeriodOverride *int64) error { + f.killedPods = append(f.killedPods, pod) + return nil +} + +type fakePodProvider struct { + pods []*v1.Pod +} + +func newFakePodProvider() *fakePodProvider { + return &fakePodProvider{pods: []*v1.Pod{}} +} + +func (f *fakePodProvider) setPods(pods []*v1.Pod) { + f.pods = pods +} + +func (f *fakePodProvider) getPods() []*v1.Pod { + return f.pods +} + +func getTestCriticalPodAdmissionHandler(podProvider *fakePodProvider, podKiller *fakePodKiller) *CriticalPodAdmissionHandler { + return &CriticalPodAdmissionHandler{ + getPodsFunc: podProvider.getPods, + killPodFunc: podKiller.killPodNow, + recorder: &record.FakeRecorder{}, + } +} + +func TestEvictPodsToFreeRequests(t *testing.T) { + type testRun struct { + testName string + inputPods []*v1.Pod + insufficientResources admissionRequirementList + expectErr bool + expectedOutput []*v1.Pod + } + podProvider := newFakePodProvider() + podKiller := newFakePodKiller() + criticalPodAdmissionHandler := getTestCriticalPodAdmissionHandler(podProvider, podKiller) + allPods := getTestPods() + runs := []testRun{ + { + testName: "critical pods cannot be preempted", + inputPods: []*v1.Pod{allPods[critical]}, + insufficientResources: getAdmissionRequirementList(0, 0, 1), + expectErr: true, + expectedOutput: nil, + }, + { + testName: "best effort pods are not preempted when attempting to free resources", + inputPods: []*v1.Pod{allPods[bestEffort]}, + insufficientResources: getAdmissionRequirementList(0, 1, 0), + expectErr: true, + expectedOutput: nil, + }, + { + testName: "multiple pods evicted", + inputPods: []*v1.Pod{ + allPods[critical], allPods[bestEffort], allPods[burstable], allPods[highRequestBurstable], + allPods[guaranteed], allPods[highRequestGuaranteed]}, + insufficientResources: getAdmissionRequirementList(0, 550, 0), + expectErr: false, + expectedOutput: []*v1.Pod{allPods[highRequestBurstable], allPods[highRequestGuaranteed]}, + }, + } + for _, r := range runs { + podProvider.setPods(r.inputPods) + outErr := criticalPodAdmissionHandler.evictPodsToFreeRequests(r.insufficientResources) + outputPods := podKiller.getKilledPods() + if !r.expectErr && outErr != nil { + 
t.Errorf("evictPodsToFreeRequests returned an unexpected error during the %s test. Err: %v", r.testName, outErr) + } else if r.expectErr && outErr == nil { + t.Errorf("evictPodsToFreeRequests expected an error but returned a successful output=%v during the %s test.", outputPods, r.testName) + } else if !podListEqual(r.expectedOutput, outputPods) { + t.Errorf("evictPodsToFreeRequests expected %v but got %v during the %s test.", r.expectedOutput, outputPods, r.testName) + } + podKiller.clear() + } +} + +func BenchmarkGetPodsToPreempt(t *testing.B) { + allPods := getTestPods() + inputPods := []*v1.Pod{} + for i := 0; i < maxPods; i++ { + inputPods = append(inputPods, allPods[tinyBurstable]) + } + for n := 0; n < t.N; n++ { + getPodsToPreempt(inputPods, admissionRequirementList([]*admissionRequirement{ + { + resourceName: v1.ResourceCPU, + quantity: parseCPUToInt64("110m"), + }})) + } +} + +func TestGetPodsToPreempt(t *testing.T) { + type testRun struct { + testName string + inputPods []*v1.Pod + insufficientResources admissionRequirementList + expectErr bool + expectedOutput []*v1.Pod + } + allPods := getTestPods() + runs := []testRun{ + { + testName: "no requirements", + inputPods: []*v1.Pod{}, + insufficientResources: getAdmissionRequirementList(0, 0, 0), + expectErr: false, + expectedOutput: []*v1.Pod{}, + }, + { + testName: "no pods", + inputPods: []*v1.Pod{}, + insufficientResources: getAdmissionRequirementList(0, 0, 1), + expectErr: true, + expectedOutput: nil, + }, + { + testName: "equal pods and resources requirements", + inputPods: []*v1.Pod{allPods[burstable]}, + insufficientResources: getAdmissionRequirementList(100, 100, 1), + expectErr: false, + expectedOutput: []*v1.Pod{allPods[burstable]}, + }, + { + testName: "higer requirements than pod requests", + inputPods: []*v1.Pod{allPods[burstable]}, + insufficientResources: getAdmissionRequirementList(200, 200, 2), + expectErr: true, + expectedOutput: nil, + }, + { + testName: "choose between bestEffort and burstable", + inputPods: []*v1.Pod{allPods[burstable], allPods[bestEffort]}, + insufficientResources: getAdmissionRequirementList(0, 0, 1), + expectErr: false, + expectedOutput: []*v1.Pod{allPods[bestEffort]}, + }, + { + testName: "choose between burstable and guaranteed", + inputPods: []*v1.Pod{allPods[burstable], allPods[guaranteed]}, + insufficientResources: getAdmissionRequirementList(0, 0, 1), + expectErr: false, + expectedOutput: []*v1.Pod{allPods[burstable]}, + }, + { + testName: "choose lower request burstable if it meets requirements", + inputPods: []*v1.Pod{allPods[bestEffort], allPods[highRequestBurstable], allPods[burstable]}, + insufficientResources: getAdmissionRequirementList(100, 100, 0), + expectErr: false, + expectedOutput: []*v1.Pod{allPods[burstable]}, + }, + { + testName: "choose higher request burstable if lower does not meet requirements", + inputPods: []*v1.Pod{allPods[bestEffort], allPods[burstable], allPods[highRequestBurstable]}, + insufficientResources: getAdmissionRequirementList(150, 150, 0), + expectErr: false, + expectedOutput: []*v1.Pod{allPods[highRequestBurstable]}, + }, + { + testName: "multiple pods required", + inputPods: []*v1.Pod{allPods[bestEffort], allPods[burstable], allPods[highRequestBurstable], allPods[guaranteed], allPods[highRequestGuaranteed]}, + insufficientResources: getAdmissionRequirementList(350, 350, 0), + expectErr: false, + expectedOutput: []*v1.Pod{allPods[burstable], allPods[highRequestBurstable]}, + }, + { + testName: "evict guaranteed when we have to, and dont evict the 
extra burstable", + inputPods: []*v1.Pod{allPods[bestEffort], allPods[burstable], allPods[highRequestBurstable], allPods[guaranteed], allPods[highRequestGuaranteed]}, + insufficientResources: getAdmissionRequirementList(0, 550, 0), + expectErr: false, + expectedOutput: []*v1.Pod{allPods[highRequestBurstable], allPods[highRequestGuaranteed]}, + }, + } + for _, r := range runs { + outputPods, outErr := getPodsToPreempt(r.inputPods, r.insufficientResources) + if !r.expectErr && outErr != nil { + t.Errorf("getPodsToPreempt returned an unexpected error during the %s test. Err: %v", r.testName, outErr) + } else if r.expectErr && outErr == nil { + t.Errorf("getPodsToPreempt expected an error but returned a successful output=%v during the %s test.", outputPods, r.testName) + } else if !podListEqual(r.expectedOutput, outputPods) { + t.Errorf("getPodsToPreempt expected %v but got %v during the %s test.", r.expectedOutput, outputPods, r.testName) + } + } +} + +func TestAdmissionRequirementsDistance(t *testing.T) { + type testRun struct { + testName string + requirements admissionRequirementList + inputPod *v1.Pod + expectedOutput float64 + } + allPods := getTestPods() + runs := []testRun{ + { + testName: "no requirements", + requirements: getAdmissionRequirementList(0, 0, 0), + inputPod: allPods[burstable], + expectedOutput: 0, + }, + { + testName: "no requests, some requirements", + requirements: getAdmissionRequirementList(100, 100, 1), + inputPod: allPods[bestEffort], + expectedOutput: 2, + }, + { + testName: "equal requests and requirements", + requirements: getAdmissionRequirementList(100, 100, 1), + inputPod: allPods[burstable], + expectedOutput: 0, + }, + { + testName: "higher requests than requirements", + requirements: getAdmissionRequirementList(50, 50, 0), + inputPod: allPods[burstable], + expectedOutput: 0, + }, + } + for _, run := range runs { + output := run.requirements.distance(run.inputPod) + if output != run.expectedOutput { + t.Errorf("expected: %f, got: %f for %s test", run.expectedOutput, output, run.testName) + } + } +} + +func TestAdmissionRequirementsSubtract(t *testing.T) { + type testRun struct { + testName string + initial admissionRequirementList + inputPod *v1.Pod + expectedOutput admissionRequirementList + } + allPods := getTestPods() + runs := []testRun{ + { + testName: "subtract a pod from no requirements", + initial: getAdmissionRequirementList(0, 0, 0), + inputPod: allPods[burstable], + expectedOutput: getAdmissionRequirementList(0, 0, 0), + }, + { + testName: "subtract no requests from some requirements", + initial: getAdmissionRequirementList(100, 100, 1), + inputPod: allPods[bestEffort], + expectedOutput: getAdmissionRequirementList(100, 100, 0), + }, + { + testName: "equal requests and requirements", + initial: getAdmissionRequirementList(100, 100, 1), + inputPod: allPods[burstable], + expectedOutput: getAdmissionRequirementList(0, 0, 0), + }, + { + testName: "subtract higher requests than requirements", + initial: getAdmissionRequirementList(50, 50, 0), + inputPod: allPods[burstable], + expectedOutput: getAdmissionRequirementList(0, 0, 0), + }, + { + testName: "subtract lower requests than requirements", + initial: getAdmissionRequirementList(200, 200, 1), + inputPod: allPods[burstable], + expectedOutput: getAdmissionRequirementList(100, 100, 0), + }, + } + for _, run := range runs { + output := run.initial.subtract(run.inputPod) + if !admissionRequirementListEqual(output, run.expectedOutput) { + t.Errorf("expected: %s, got: %s for %s test", 
run.expectedOutput.toString(), output.toString(), run.testName) + } + } +} + +func getTestPods() map[string]*v1.Pod { + allPods := map[string]*v1.Pod{ + tinyBurstable: getPodWithResources(tinyBurstable, v1.ResourceRequirements{ + Requests: v1.ResourceList{ + "cpu": resource.MustParse("1m"), + "memory": resource.MustParse("1Mi"), + }, + }), + bestEffort: getPodWithResources(bestEffort, v1.ResourceRequirements{}), + critical: getPodWithResources(critical, v1.ResourceRequirements{ + Requests: v1.ResourceList{ + "cpu": resource.MustParse("100m"), + "memory": resource.MustParse("100Mi"), + }, + }), + burstable: getPodWithResources(burstable, v1.ResourceRequirements{ + Requests: v1.ResourceList{ + "cpu": resource.MustParse("100m"), + "memory": resource.MustParse("100Mi"), + }, + }), + guaranteed: getPodWithResources(guaranteed, v1.ResourceRequirements{ + Requests: v1.ResourceList{ + "cpu": resource.MustParse("100m"), + "memory": resource.MustParse("100Mi"), + }, + Limits: v1.ResourceList{ + "cpu": resource.MustParse("100m"), + "memory": resource.MustParse("100Mi"), + }, + }), + highRequestBurstable: getPodWithResources(highRequestBurstable, v1.ResourceRequirements{ + Requests: v1.ResourceList{ + "cpu": resource.MustParse("300m"), + "memory": resource.MustParse("300Mi"), + }, + }), + highRequestGuaranteed: getPodWithResources(highRequestGuaranteed, v1.ResourceRequirements{ + Requests: v1.ResourceList{ + "cpu": resource.MustParse("300m"), + "memory": resource.MustParse("300Mi"), + }, + Limits: v1.ResourceList{ + "cpu": resource.MustParse("300m"), + "memory": resource.MustParse("300Mi"), + }, + }), + } + allPods[critical].Namespace = kubeapi.NamespaceSystem + allPods[critical].Annotations[kubetypes.CriticalPodAnnotationKey] = "" + return allPods +} + +func getPodWithResources(name string, requests v1.ResourceRequirements) *v1.Pod { + return &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + GenerateName: name, + Annotations: map[string]string{}, + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: fmt.Sprintf("%s-container", name), + Resources: requests, + }, + }, + }, + } +} + +func parseCPUToInt64(res string) int64 { + r := resource.MustParse(res) + return (&r).MilliValue() +} + +func parseNonCpuResourceToInt64(res string) int64 { + r := resource.MustParse(res) + return (&r).Value() +} + +func getAdmissionRequirementList(cpu, memory, pods int) admissionRequirementList { + reqs := []*admissionRequirement{} + if cpu > 0 { + reqs = append(reqs, &admissionRequirement{ + resourceName: v1.ResourceCPU, + quantity: parseCPUToInt64(fmt.Sprintf("%dm", cpu)), + }) + } + if memory > 0 { + reqs = append(reqs, &admissionRequirement{ + resourceName: v1.ResourceMemory, + quantity: parseNonCpuResourceToInt64(fmt.Sprintf("%dMi", memory)), + }) + } + if pods > 0 { + reqs = append(reqs, &admissionRequirement{ + resourceName: v1.ResourcePods, + quantity: int64(pods), + }) + } + return admissionRequirementList(reqs) +} + +// this checks if the lists contents contain all of the same elements. +// this is not correct if there are duplicate pods in the list. 
+// for example: podListEqual([a, a, b], [a, b, b]) will return true +func admissionRequirementListEqual(list1 admissionRequirementList, list2 admissionRequirementList) bool { + if len(list1) != len(list2) { + return false + } + for _, a := range list1 { + contains := false + for _, b := range list2 { + if a.resourceName == b.resourceName && a.quantity == b.quantity { + contains = true + } + } + if !contains { + return false + } + } + return true +} + +// this checks if the lists contents contain all of the same elements. +// this is not correct if there are duplicate pods in the list. +// for example: podListEqual([a, a, b], [a, b, b]) will return true +func podListEqual(list1 []*v1.Pod, list2 []*v1.Pod) bool { + if len(list1) != len(list2) { + return false + } + for _, a := range list1 { + contains := false + for _, b := range list2 { + if a == b { + contains = true + } + } + if !contains { + return false + } + } + return true +} diff --git a/plugin/pkg/scheduler/algorithm/predicates/error.go b/plugin/pkg/scheduler/algorithm/predicates/error.go index 61bb249bb9..36ac5f1dea 100644 --- a/plugin/pkg/scheduler/algorithm/predicates/error.go +++ b/plugin/pkg/scheduler/algorithm/predicates/error.go @@ -70,6 +70,10 @@ func (e *InsufficientResourceError) GetReason() string { return fmt.Sprintf("Insufficient %v", e.ResourceName) } +func (e *InsufficientResourceError) GetInsufficientAmount() int64 { + return e.requested - (e.capacity - e.used) +} + type PredicateFailureError struct { PredicateName string } diff --git a/test/e2e/framework/pods.go b/test/e2e/framework/pods.go index 669f054908..81b222007e 100644 --- a/test/e2e/framework/pods.go +++ b/test/e2e/framework/pods.go @@ -72,16 +72,21 @@ func (c *PodClient) Create(pod *v1.Pod) *v1.Pod { return p } -// CreateSync creates a new pod according to the framework specifications, and wait for it to start. -func (c *PodClient) CreateSync(pod *v1.Pod) *v1.Pod { +// CreateSync creates a new pod according to the framework specifications in the given namespace, and waits for it to start. +func (c *PodClient) CreateSyncInNamespace(pod *v1.Pod, namespace string) *v1.Pod { p := c.Create(pod) - ExpectNoError(c.f.WaitForPodRunning(p.Name)) + ExpectNoError(WaitForPodNameRunningInNamespace(c.f.ClientSet, p.Name, namespace)) // Get the newest pod after it becomes running, some status may change after pod created, such as pod ip. p, err := c.Get(p.Name, metav1.GetOptions{}) ExpectNoError(err) return p } +// CreateSync creates a new pod according to the framework specifications, and wait for it to start. +func (c *PodClient) CreateSync(pod *v1.Pod) *v1.Pod { + return c.CreateSyncInNamespace(pod, c.f.Namespace.Name) +} + // CreateBatch create a batch of pods. All pods are created before waiting. func (c *PodClient) CreateBatch(pods []*v1.Pod) []*v1.Pod { ps := make([]*v1.Pod, len(pods)) @@ -124,11 +129,17 @@ func (c *PodClient) Update(name string, updateFn func(pod *v1.Pod)) { // DeleteSync deletes the pod and wait for the pod to disappear for `timeout`. If the pod doesn't // disappear before the timeout, it will fail the test. func (c *PodClient) DeleteSync(name string, options *metav1.DeleteOptions, timeout time.Duration) { + c.DeleteSyncInNamespace(name, c.f.Namespace.Name, options, timeout) +} + +// DeleteSyncInNamespace deletes the pod from the namespace and wait for the pod to disappear for `timeout`. If the pod doesn't +// disappear before the timeout, it will fail the test. 
+func (c *PodClient) DeleteSyncInNamespace(name string, namespace string, options *metav1.DeleteOptions, timeout time.Duration) { err := c.Delete(name, options) if err != nil && !errors.IsNotFound(err) { Failf("Failed to delete pod %q: %v", name, err) } - Expect(WaitForPodToDisappear(c.f.ClientSet, c.f.Namespace.Name, name, labels.Everything(), + Expect(WaitForPodToDisappear(c.f.ClientSet, namespace, name, labels.Everything(), 2*time.Second, timeout)).To(Succeed(), "wait for pod %q to disappear", name) } diff --git a/test/e2e_node/BUILD b/test/e2e_node/BUILD index 783ebc8a11..1efadf01ad 100644 --- a/test/e2e_node/BUILD +++ b/test/e2e_node/BUILD @@ -52,6 +52,7 @@ go_test( "apparmor_test.go", "cgroup_manager_test.go", "container_manager_test.go", + "critical_pod_test.go", "density_test.go", "disk_eviction_test.go", "dynamic_kubelet_configuration_test.go", @@ -76,7 +77,9 @@ go_test( "integration", ], deps = [ + "//pkg/api:go_default_library", "//pkg/api/v1:go_default_library", + "//pkg/apis/componentconfig:go_default_library", "//pkg/client/clientset_generated/clientset:go_default_library", "//pkg/kubelet:go_default_library", "//pkg/kubelet/api/v1alpha1/stats:go_default_library", @@ -85,6 +88,7 @@ go_test( "//pkg/kubelet/dockertools:go_default_library", "//pkg/kubelet/images:go_default_library", "//pkg/kubelet/metrics:go_default_library", + "//pkg/kubelet/types:go_default_library", "//pkg/metrics:go_default_library", "//pkg/security/apparmor:go_default_library", "//test/e2e/common:go_default_library", diff --git a/test/e2e_node/critical_pod_test.go b/test/e2e_node/critical_pod_test.go new file mode 100644 index 0000000000..8df6f4efe6 --- /dev/null +++ b/test/e2e_node/critical_pod_test.go @@ -0,0 +1,148 @@ +/* +Copyright 2016 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2e_node + +import ( + "fmt" + + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + kubeapi "k8s.io/kubernetes/pkg/api" + "k8s.io/kubernetes/pkg/api/v1" + "k8s.io/kubernetes/pkg/apis/componentconfig" + kubelettypes "k8s.io/kubernetes/pkg/kubelet/types" + "k8s.io/kubernetes/test/e2e/framework" + + . "github.com/onsi/ginkgo" + . 
"github.com/onsi/gomega" +) + +const ( + criticalPodName = "critical-pod" + guaranteedPodName = "guaranteed" + burstablePodName = "burstable" + bestEffortPodName = "best-effort" +) + +var _ = framework.KubeDescribe("CriticalPod [Serial] [Disruptive]", func() { + f := framework.NewDefaultFramework("critical-pod-test") + + Context("when we need to admit a critical pod", func() { + tempSetCurrentKubeletConfig(f, func(initialConfig *componentconfig.KubeletConfiguration) { + initialConfig.FeatureGates += ", ExperimentalCriticalPodAnnotation=true" + }) + + It("should be able to create and delete a critical pod", func() { + configEnabled, err := isKubeletConfigEnabled(f) + framework.ExpectNoError(err) + if !configEnabled { + framework.Skipf("unable to run test without dynamic kubelet config enabled.") + } + + // Define test pods + nonCriticalGuaranteed := getTestPod(false, guaranteedPodName, v1.ResourceRequirements{ + Requests: v1.ResourceList{ + "cpu": resource.MustParse("100m"), + "memory": resource.MustParse("100Mi"), + }, + Limits: v1.ResourceList{ + "cpu": resource.MustParse("100m"), + "memory": resource.MustParse("100Mi"), + }, + }) + nonCriticalBurstable := getTestPod(false, burstablePodName, v1.ResourceRequirements{ + Requests: v1.ResourceList{ + "cpu": resource.MustParse("100m"), + "memory": resource.MustParse("100Mi"), + }, + }) + nonCriticalBestEffort := getTestPod(false, bestEffortPodName, v1.ResourceRequirements{}) + criticalPod := getTestPod(true, criticalPodName, v1.ResourceRequirements{ + // request the entire resource capacity of the node, so that + // admitting this pod requires the other pod to be preempted + Requests: getNodeCPUAndMemoryCapacity(f), + }) + + // Create pods, starting with non-critical so that the critical preempts the other pods. + f.PodClient().CreateBatch([]*v1.Pod{nonCriticalBestEffort, nonCriticalBurstable, nonCriticalGuaranteed}) + f.PodClientNS(kubeapi.NamespaceSystem).CreateSyncInNamespace(criticalPod, kubeapi.NamespaceSystem) + + // Check that non-critical pods other than the besteffort have been evicted + updatedPodList, err := f.ClientSet.Core().Pods(f.Namespace.Name).List(metav1.ListOptions{}) + framework.ExpectNoError(err) + for _, p := range updatedPodList.Items { + if p.Name == nonCriticalBestEffort.Name { + Expect(p.Status.Phase).NotTo(Equal(v1.PodFailed), fmt.Sprintf("pod: %v should be preempted", p.Name)) + } else { + Expect(p.Status.Phase).To(Equal(v1.PodFailed), fmt.Sprintf("pod: %v should not be preempted", p.Name)) + } + } + }) + AfterEach(func() { + // Delete Pods + f.PodClient().DeleteSync(guaranteedPodName, &metav1.DeleteOptions{}, podDisappearTimeout) + f.PodClient().DeleteSync(burstablePodName, &metav1.DeleteOptions{}, podDisappearTimeout) + f.PodClient().DeleteSync(bestEffortPodName, &metav1.DeleteOptions{}, podDisappearTimeout) + f.PodClientNS(kubeapi.NamespaceSystem).DeleteSyncInNamespace(criticalPodName, kubeapi.NamespaceSystem, &metav1.DeleteOptions{}, podDisappearTimeout) + // Log Events + logPodEvents(f) + logNodeEvents(f) + + }) + }) +}) + +func getNodeCPUAndMemoryCapacity(f *framework.Framework) v1.ResourceList { + nodeList, err := f.ClientSet.Core().Nodes().List(metav1.ListOptions{}) + framework.ExpectNoError(err) + // Assuming that there is only one node, because this is a node e2e test. 
+ Expect(len(nodeList.Items)).To(Equal(1)) + capacity := nodeList.Items[0].Status.Capacity + return v1.ResourceList{ + v1.ResourceCPU: capacity[v1.ResourceCPU], + v1.ResourceMemory: capacity[v1.ResourceMemory], + } +} + +func getTestPod(critical bool, name string, resources v1.ResourceRequirements) *v1.Pod { + pod := &v1.Pod{ + TypeMeta: metav1.TypeMeta{ + Kind: "Pod", + APIVersion: "v1", + }, + ObjectMeta: metav1.ObjectMeta{Name: name}, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: "container", + Image: framework.GetPauseImageNameForHostArch(), + Resources: resources, + }, + }, + }, + } + if critical { + pod.ObjectMeta.Namespace = kubeapi.NamespaceSystem + pod.ObjectMeta.Annotations = map[string]string{ + kubelettypes.CriticalPodAnnotationKey: "", + } + Expect(kubelettypes.IsCriticalPod(pod)).To(BeTrue(), "pod should be a critical pod") + } else { + Expect(kubelettypes.IsCriticalPod(pod)).To(BeFalse(), "pod should not be a critical pod") + } + return pod +}
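
For reference, a minimal standalone sketch of the requirement arithmetic introduced in pkg/kubelet/preemption/preemption.go (the distance and subtract operations). The toy types, names, and numbers below are illustrative only and are not part of the patch; they simply mirror the behavior of admissionRequirementList for a single CPU requirement.

package main

import (
	"fmt"
	"math"
)

// requirement mirrors admissionRequirement: a resource name and the quantity still
// missing for the critical pod to be admitted.
type requirement struct {
	name     string
	quantity int64
}

// podInfo is a toy stand-in for a pod's resource requests.
type podInfo struct {
	name     string
	requests map[string]int64
}

// distance mirrors admissionRequirementList.distance: for each requirement, take the
// fraction left unsatisfied by the pod (clamped at 0), square it, and sum.
func distance(reqs []requirement, p podInfo) float64 {
	d := 0.0
	for _, r := range reqs {
		remaining := float64(r.quantity - p.requests[r.name])
		if remaining < 0 {
			remaining = 0
		}
		d += math.Pow(remaining/float64(r.quantity), 2)
	}
	return d
}

// subtract mirrors admissionRequirementList.subtract: the requirements still unmet
// after evicting the given pods.
func subtract(reqs []requirement, pods ...podInfo) []requirement {
	out := []requirement{}
	for _, r := range reqs {
		q := r.quantity
		for _, p := range pods {
			q -= p.requests[r.name]
		}
		if q > 0 {
			out = append(out, requirement{name: r.name, quantity: q})
		}
	}
	return out
}

func main() {
	// The critical pod is short 500m of CPU; two burstable pods are running.
	reqs := []requirement{{name: "cpu", quantity: 500}}
	small := podInfo{name: "small", requests: map[string]int64{"cpu": 100}}
	large := podInfo{name: "large", requests: map[string]int64{"cpu": 600}}

	fmt.Println(distance(reqs, small)) // 0.64: ((500-100)/500)^2
	fmt.Println(distance(reqs, large)) // 0: evicting "large" fully covers the shortfall
	fmt.Println(subtract(reqs, large)) // []: nothing left to reclaim after preempting "large"
}

Because getPodsToPreemptByDistance always picks the candidate with the smallest distance, "large" would be chosen here in a single round, which is exactly the fewest-pods-first behavior described in the CriticalPodAdmissionHandler comment.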
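
A second sketch, showing what a custom implementation of the lifecycle.AdmissionFailureHandler interface added by this patch could look like. The handler name and the log message are hypothetical and are not part of the patch; only the interface signature and the stub-like "decline to recover" return values come from pkg/kubelet/lifecycle.

package lifecycle

import (
	"github.com/golang/glog"

	"k8s.io/kubernetes/pkg/api/v1"
	"k8s.io/kubernetes/plugin/pkg/scheduler/algorithm"
)

// loggingAdmissionFailureHandler is a hypothetical handler used only to illustrate the
// AdmissionFailureHandler contract; it logs the rejection and keeps the original reasons.
type loggingAdmissionFailureHandler struct{}

var _ AdmissionFailureHandler = &loggingAdmissionFailureHandler{}

func (h *loggingAdmissionFailureHandler) HandleAdmissionFailure(pod *v1.Pod, failureReasons []algorithm.PredicateFailureReason) (bool, []algorithm.PredicateFailureReason, error) {
	glog.Infof("admission of pod %s/%s failed for %d reasons; not attempting recovery", pod.Namespace, pod.Name, len(failureReasons))
	// Returning (false, failureReasons, nil) leaves the rejection in place, like AdmissionFailureHandlerStub.
	// A recovering handler such as CriticalPodAdmissionHandler frees resources and returns (true, nil, nil) on success.
	return false, failureReasons, nil
}

Such a handler would be passed to NewPredicateAdmitHandler in place of the stub, the same wiring pattern the patch uses for CriticalPodAdmissionHandler in pkg/kubelet/kubelet.go and for AdmissionFailureHandlerStub in pkg/kubelet/kubelet_test.go.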