mirror of https://github.com/k3s-io/k3s
Merge pull request #40655 from vishh/flag-gate-critical-pod-annotation
Automatic merge from submit-queue

Optionally avoid evicting critical pods in kubelet

For #40573

```release-note
When the feature gate "ExperimentalCriticalPodAnnotation" is set, the kubelet will avoid evicting pods in the "kube-system" namespace that carry the special annotation `scheduler.alpha.kubernetes.io/critical-pod`.
This feature should be used in conjunction with the rescheduler to guarantee availability for critical system pods: https://kubernetes.io/docs/admin/rescheduler/
```
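To make the release note concrete, here is a minimal, illustrative Go sketch (not part of the diff) that marks a pod as critical and evaluates the same gate-plus-annotation condition the eviction manager uses in the hunks below. The `main` wrapper and the pod name are hypothetical; the import paths are the ones vendored in this tree, and the error returned by `Set` is ignored for brevity.

```go
package main

import (
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	utilfeature "k8s.io/apiserver/pkg/util/feature"
	"k8s.io/kubernetes/pkg/api/v1"
	"k8s.io/kubernetes/pkg/features"
	kubelettypes "k8s.io/kubernetes/pkg/kubelet/types"
)

func main() {
	// Roughly equivalent to starting the kubelet with
	// --feature-gates=ExperimentalCriticalPodAnnotation=true.
	utilfeature.DefaultFeatureGate.Set("ExperimentalCriticalPodAnnotation=true")

	// A kube-system pod carrying the critical-pod annotation.
	pod := &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "kube-proxy",  // hypothetical pod, for illustration only
			Namespace: "kube-system", // critical pods must live in kube-system
			Annotations: map[string]string{
				kubelettypes.CriticalPodAnnotationKey: "", // scheduler.alpha.kubernetes.io/critical-pod
			},
		},
	}

	// The eviction manager skips a pod only when the gate is on AND the pod is critical.
	skip := utilfeature.DefaultFeatureGate.Enabled(features.ExperimentalCriticalPodAnnotation) &&
		kubelettypes.IsCriticalPod(pod)
	fmt.Println("skip eviction:", skip) // true
}
```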
commit
f20b4fc67f
```
@@ -123,7 +123,7 @@ fi
RUNTIME_CONFIG="${KUBE_RUNTIME_CONFIG:-}"

# Optional: set feature gates
FEATURE_GATES="${KUBE_FEATURE_GATES:-}"
FEATURE_GATES="${KUBE_FEATURE_GATES:-ExperimentalCriticalPodAnnotation=true}"

# Optional: Install cluster DNS.
ENABLE_CLUSTER_DNS="${KUBE_ENABLE_CLUSTER_DNS:-true}"
```
```
@@ -83,7 +83,7 @@ MASTER_IP_RANGE="${MASTER_IP_RANGE:-10.246.0.0/24}"
RUNTIME_CONFIG="${KUBE_RUNTIME_CONFIG:-}"

# Optional: set feature gates
FEATURE_GATES="${KUBE_FEATURE_GATES:-}"
FEATURE_GATES="${KUBE_FEATURE_GATES:-ExperimentalCriticalPodAnnotation=true}"

TERMINATED_POD_GC_THRESHOLD=${TERMINATED_POD_GC_THRESHOLD:-100}
```
```
@@ -38,9 +38,8 @@ kind: Pod
metadata:
  name: kube-proxy
  namespace: kube-system
  # This annotation lowers the possibility that kube-proxy gets evicted when the
  # node is under memory pressure, and prioritizes it for admission, even if
  # the node is under memory pressure.
  # This annotation ensures that kube-proxy does not get evicted if the node
  # supports critical pod annotation based priority scheme.
  # Note that kube-proxy runs as a static pod so this annotation does NOT have
  # any effect on rescheduler (default scheduler and rescheduler are not
  # involved in scheduling kube-proxy).
```
```
@@ -58,6 +58,14 @@ const (
    // contains a privileged container, or specific non-namespaced capabilities (MKNOD, SYS_MODULE,
    // SYS_TIME). This should only be enabled if user namespace remapping is enabled in the docker daemon.
    ExperimentalHostUserNamespaceDefaultingGate utilfeature.Feature = "ExperimentalHostUserNamespaceDefaulting"

    // owner: @vishh
    // alpha: v1.5
    //
    // Ensures guaranteed scheduling of pods marked with a special pod annotation `scheduler.alpha.kubernetes.io/critical-pod`
    // and also prevents them from being evicted from a node.
    // Note: This feature is not supported for `BestEffort` pods.
    ExperimentalCriticalPodAnnotation utilfeature.Feature = "ExperimentalCriticalPodAnnotation"
)

func init() {
```
```
@@ -73,6 +81,7 @@ var defaultKubernetesFeatureGates = map[utilfeature.Feature]utilfeature.FeatureS
    DynamicKubeletConfig:                        {Default: false, PreRelease: utilfeature.Alpha},
    DynamicVolumeProvisioning:                   {Default: true, PreRelease: utilfeature.Alpha},
    ExperimentalHostUserNamespaceDefaultingGate: {Default: false, PreRelease: utilfeature.Beta},
    ExperimentalCriticalPodAnnotation:           {Default: false, PreRelease: utilfeature.Alpha},

    // inherited features from generic apiserver, relisted here to get a conflict if it is changed
    // unintentionally on either side:
```
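For context, a brief sketch of how this alpha gate behaves once registered: it is off by default (`Default: false` above), so the critical-pod handling added elsewhere in this PR is inert unless the gate is explicitly enabled. The `main` wrapper is illustrative only and ignores the error returned by `Set`; the import paths are the vendored ones.

```go
package main

import (
	"fmt"

	utilfeature "k8s.io/apiserver/pkg/util/feature"
	"k8s.io/kubernetes/pkg/features"
)

func main() {
	// Alpha gates default to off.
	fmt.Println(utilfeature.DefaultFeatureGate.Enabled(features.ExperimentalCriticalPodAnnotation)) // false

	// Flip the gate at runtime, the same way the new eviction-manager test does.
	utilfeature.DefaultFeatureGate.Set("ExperimentalCriticalPodAnnotation=true")
	fmt.Println(utilfeature.DefaultFeatureGate.Enabled(features.ExperimentalCriticalPodAnnotation)) // true
}
```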
```
@@ -9,37 +9,16 @@ load(
    "go_test",
)

go_library(
    name = "go_default_library",
    srcs = [
        "doc.go",
        "eviction_manager.go",
        "helpers.go",
        "types.go",
cgo_genrule(
    name = "cgo_codegen",
    srcs = ["threshold_notifier_linux.go"],
    clinkopts = [
        "-lz",
        "-lm",
        "-lpthread",
        "-ldl",
    ],
    library = ":cgo_codegen",
    tags = ["automanaged"],
    deps = [
        "//pkg/api:go_default_library",
        "//pkg/api/v1:go_default_library",
        "//pkg/kubelet/api/v1alpha1/stats:go_default_library",
        "//pkg/kubelet/cm:go_default_library",
        "//pkg/kubelet/lifecycle:go_default_library",
        "//pkg/kubelet/pod:go_default_library",
        "//pkg/kubelet/qos:go_default_library",
        "//pkg/kubelet/server/stats:go_default_library",
        "//pkg/kubelet/types:go_default_library",
        "//pkg/kubelet/util/format:go_default_library",
        "//pkg/quota/evaluator/core:go_default_library",
        "//vendor:github.com/golang/glog",
        "//vendor:k8s.io/apimachinery/pkg/api/resource",
        "//vendor:k8s.io/apimachinery/pkg/apis/meta/v1",
        "//vendor:k8s.io/apimachinery/pkg/util/sets",
        "//vendor:k8s.io/apimachinery/pkg/util/wait",
        "//vendor:k8s.io/client-go/pkg/api/v1",
        "//vendor:k8s.io/client-go/tools/record",
        "//vendor:k8s.io/client-go/util/clock",
    ],
)

go_test(
```
```
@@ -60,22 +39,45 @@ go_test(
        "//vendor:k8s.io/apimachinery/pkg/api/resource",
        "//vendor:k8s.io/apimachinery/pkg/apis/meta/v1",
        "//vendor:k8s.io/apimachinery/pkg/types",
        "//vendor:k8s.io/apiserver/pkg/util/feature",
        "//vendor:k8s.io/client-go/pkg/api/v1",
        "//vendor:k8s.io/client-go/tools/record",
        "//vendor:k8s.io/client-go/util/clock",
    ],
)

cgo_genrule(
    name = "cgo_codegen",
    srcs = ["threshold_notifier_linux.go"],
    clinkopts = [
        "-lz",
        "-lm",
        "-lpthread",
        "-ldl",
go_library(
    name = "go_default_library",
    srcs = [
        "doc.go",
        "eviction_manager.go",
        "helpers.go",
        "types.go",
    ],
    library = ":cgo_codegen",
    tags = ["automanaged"],
    deps = [
        "//pkg/api:go_default_library",
        "//pkg/api/v1:go_default_library",
        "//pkg/features:go_default_library",
        "//pkg/kubelet/api/v1alpha1/stats:go_default_library",
        "//pkg/kubelet/cm:go_default_library",
        "//pkg/kubelet/lifecycle:go_default_library",
        "//pkg/kubelet/qos:go_default_library",
        "//pkg/kubelet/server/stats:go_default_library",
        "//pkg/kubelet/types:go_default_library",
        "//pkg/kubelet/util/format:go_default_library",
        "//pkg/quota/evaluator/core:go_default_library",
        "//vendor:github.com/golang/glog",
        "//vendor:k8s.io/apimachinery/pkg/api/resource",
        "//vendor:k8s.io/apimachinery/pkg/apis/meta/v1",
        "//vendor:k8s.io/apimachinery/pkg/util/sets",
        "//vendor:k8s.io/apimachinery/pkg/util/wait",
        "//vendor:k8s.io/apiserver/pkg/util/feature",
        "//vendor:k8s.io/client-go/pkg/api/v1",
        "//vendor:k8s.io/client-go/tools/record",
        "//vendor:k8s.io/client-go/util/clock",
    ],
)

filegroup(
```
```
@@ -25,16 +25,17 @@ import (
    "github.com/golang/glog"
    "k8s.io/apimachinery/pkg/api/resource"
    "k8s.io/apimachinery/pkg/util/wait"
    utilfeature "k8s.io/apiserver/pkg/util/feature"
    clientv1 "k8s.io/client-go/pkg/api/v1"
    "k8s.io/client-go/tools/record"
    "k8s.io/client-go/util/clock"
    "k8s.io/kubernetes/pkg/api/v1"
    "k8s.io/kubernetes/pkg/features"
    "k8s.io/kubernetes/pkg/kubelet/cm"
    "k8s.io/kubernetes/pkg/kubelet/lifecycle"
    kubepod "k8s.io/kubernetes/pkg/kubelet/pod"
    "k8s.io/kubernetes/pkg/kubelet/qos"
    "k8s.io/kubernetes/pkg/kubelet/server/stats"
    kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
    kubelettypes "k8s.io/kubernetes/pkg/kubelet/types"
    "k8s.io/kubernetes/pkg/kubelet/util/format"
)
```
```
@@ -111,7 +112,7 @@ func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAd
    // the node has memory pressure, admit if not best-effort
    if hasNodeCondition(m.nodeConditions, v1.NodeMemoryPressure) {
        notBestEffort := v1.PodQOSBestEffort != qos.GetPodQOS(attrs.Pod)
        if notBestEffort || kubetypes.IsCriticalPod(attrs.Pod) {
        if notBestEffort {
            return lifecycle.PodAdmitResult{Admit: true}
        }
    }
```
```
@@ -313,13 +314,10 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
    // we kill at most a single pod during each eviction interval
    for i := range activePods {
        pod := activePods[i]
        if kubepod.IsStaticPod(pod) {
            // The eviction manager doesn't evict static pods. To stop a static
            // pod, the admin needs to remove the manifest from kubelet's
            // --config directory.
            // TODO(39124): This is a short term fix, we can't assume static pods
            // are always well behaved.
            glog.Infof("eviction manager: NOT evicting static pod %v", pod.Name)
        // If the pod is marked as critical and support for critical pod annotations is enabled,
        // do not evict such pods. Once Kubelet supports preemptions, these pods can be safely evicted.
        if utilfeature.DefaultFeatureGate.Enabled(features.ExperimentalCriticalPodAnnotation) &&
            kubelettypes.IsCriticalPod(pod) {
            continue
        }
        status := v1.PodStatus{
```
```
@@ -22,13 +22,15 @@ import (

    "k8s.io/apimachinery/pkg/api/resource"
    "k8s.io/apimachinery/pkg/types"
    utilfeature "k8s.io/apiserver/pkg/util/feature"
    clientv1 "k8s.io/client-go/pkg/api/v1"
    "k8s.io/client-go/tools/record"
    "k8s.io/client-go/util/clock"
    kubeapi "k8s.io/kubernetes/pkg/api"
    "k8s.io/kubernetes/pkg/api/v1"
    statsapi "k8s.io/kubernetes/pkg/kubelet/api/v1alpha1/stats"
    "k8s.io/kubernetes/pkg/kubelet/lifecycle"
    kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
    kubelettypes "k8s.io/kubernetes/pkg/kubelet/types"
)

// mockPodKiller is used to testing which pod is killed
```
```
@@ -212,8 +214,6 @@ func TestMemoryPressure(t *testing.T) {
    // create a best effort pod to test admission
    bestEffortPodToAdmit, _ := podMaker("best-admit", newResourceList("", ""), newResourceList("", ""), "0Gi")
    burstablePodToAdmit, _ := podMaker("burst-admit", newResourceList("100m", "100Mi"), newResourceList("200m", "200Mi"), "0Gi")
    criticalBestEffortPodToAdmit, _ := podMaker("critical-best-admit", newResourceList("", ""), newResourceList("", ""), "0Gi")
    criticalBestEffortPodToAdmit.ObjectMeta.Annotations = map[string]string{kubetypes.CriticalPodAnnotationKey: ""}

    // synchronize
    manager.synchronize(diskInfoProvider, activePodsFunc)
```
```
@@ -224,8 +224,8 @@ func TestMemoryPressure(t *testing.T) {
    }

    // try to admit our pods (they should succeed)
    expected := []bool{true, true, true}
    for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit, criticalBestEffortPodToAdmit} {
    expected := []bool{true, true}
    for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit} {
        if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit {
            t.Errorf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit)
        }
```
```
@@ -300,10 +300,9 @@ func TestMemoryPressure(t *testing.T) {
        t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
    }

    // the best-effort pod without critical annotation should not admit,
    // burstable and critical pods should
    expected = []bool{false, true, true}
    for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit, criticalBestEffortPodToAdmit} {
    // the best-effort pod should not admit, burstable should
    expected = []bool{false, true}
    for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit} {
        if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit {
            t.Errorf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit)
        }
```
```
@@ -325,9 +324,9 @@ func TestMemoryPressure(t *testing.T) {
        t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod.Name)
    }

    // the best-effort pod should not admit, burstable and critical pods should
    expected = []bool{false, true, true}
    for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit, criticalBestEffortPodToAdmit} {
    // the best-effort pod should not admit, burstable should
    expected = []bool{false, true}
    for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit} {
        if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit {
            t.Errorf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit)
        }
```
```
@@ -350,8 +349,8 @@ func TestMemoryPressure(t *testing.T) {
    }

    // all pods should admit now
    expected = []bool{true, true, true}
    for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit, criticalBestEffortPodToAdmit} {
    expected = []bool{true, true}
    for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit} {
        if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit {
            t.Errorf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit)
        }
```
```
@@ -1089,3 +1088,135 @@ func TestInodePressureNodeFsInodes(t *testing.T) {
        t.Errorf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, true, result.Admit)
    }
}

// TestCriticalPodsAreNotEvicted
func TestCriticalPodsAreNotEvicted(t *testing.T) {
    podMaker := makePodWithMemoryStats
    summaryStatsMaker := makeMemoryStats
    podsToMake := []podToMake{
        {name: "critical", requests: newResourceList("100m", "1Gi"), limits: newResourceList("100m", "1Gi"), memoryWorkingSet: "800Mi"},
    }
    pods := []*v1.Pod{}
    podStats := map[*v1.Pod]statsapi.PodStats{}
    for _, podToMake := range podsToMake {
        pod, podStat := podMaker(podToMake.name, podToMake.requests, podToMake.limits, podToMake.memoryWorkingSet)
        pods = append(pods, pod)
        podStats[pod] = podStat
    }

    // Mark the pod as critical
    pods[0].Annotations = map[string]string{
        kubelettypes.CriticalPodAnnotationKey: "",
    }
    pods[0].Namespace = kubeapi.NamespaceSystem

    podToEvict := pods[0]
    activePodsFunc := func() []*v1.Pod {
        return pods
    }

    fakeClock := clock.NewFakeClock(time.Now())
    podKiller := &mockPodKiller{}
    diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false}
    imageGC := &mockImageGC{freed: int64(0), err: nil}
    nodeRef := &clientv1.ObjectReference{
        Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: "",
    }

    config := Config{
        MaxPodGracePeriodSeconds: 5,
        PressureTransitionPeriod: time.Minute * 5,
        Thresholds: []Threshold{
            {
                Signal: SignalMemoryAvailable,
                Operator: OpLessThan,
                Value: ThresholdValue{
                    Quantity: quantityMustParse("1Gi"),
                },
            },
            {
                Signal: SignalMemoryAvailable,
                Operator: OpLessThan,
                Value: ThresholdValue{
                    Quantity: quantityMustParse("2Gi"),
                },
                GracePeriod: time.Minute * 2,
            },
        },
    }
    summaryProvider := &fakeSummaryProvider{result: summaryStatsMaker("2Gi", podStats)}
    manager := &managerImpl{
        clock: fakeClock,
        killPodFunc: podKiller.killPodNow,
        imageGC: imageGC,
        config: config,
        recorder: &record.FakeRecorder{},
        summaryProvider: summaryProvider,
        nodeRef: nodeRef,
        nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
        thresholdsFirstObservedAt: thresholdsObservedAt{},
    }

    // Enable critical pod annotation feature gate
    utilfeature.DefaultFeatureGate.Set("ExperimentalCriticalPodAnnotation=True")
    // induce soft threshold
    fakeClock.Step(1 * time.Minute)
    summaryProvider.result = summaryStatsMaker("1500Mi", podStats)
    manager.synchronize(diskInfoProvider, activePodsFunc)

    // we should have memory pressure
    if !manager.IsUnderMemoryPressure() {
        t.Errorf("Manager should report memory pressure since soft threshold was met")
    }

    // verify no pod was yet killed because there has not yet been enough time passed.
    if podKiller.pod != nil {
        t.Errorf("Manager should not have killed a pod yet, but killed: %v", podKiller.pod.Name)
    }

    // step forward in time pass the grace period
    fakeClock.Step(3 * time.Minute)
    summaryProvider.result = summaryStatsMaker("1500Mi", podStats)
    manager.synchronize(diskInfoProvider, activePodsFunc)

    // we should have memory pressure
    if !manager.IsUnderMemoryPressure() {
        t.Errorf("Manager should report memory pressure since soft threshold was met")
    }

    // verify the right pod was killed with the right grace period.
    if podKiller.pod == podToEvict {
        t.Errorf("Manager chose to kill critical pod: %v, but should have ignored it", podKiller.pod.Name)
    }
    // reset state
    podKiller.pod = nil
    podKiller.gracePeriodOverride = nil

    // remove memory pressure
    fakeClock.Step(20 * time.Minute)
    summaryProvider.result = summaryStatsMaker("3Gi", podStats)
    manager.synchronize(diskInfoProvider, activePodsFunc)

    // we should not have memory pressure
    if manager.IsUnderMemoryPressure() {
        t.Errorf("Manager should not report memory pressure")
    }

    // Disable critical pod annotation feature gate
    utilfeature.DefaultFeatureGate.Set("ExperimentalCriticalPodAnnotation=False")

    // induce memory pressure!
    fakeClock.Step(1 * time.Minute)
    summaryProvider.result = summaryStatsMaker("500Mi", podStats)
    manager.synchronize(diskInfoProvider, activePodsFunc)

    // we should have memory pressure
    if !manager.IsUnderMemoryPressure() {
        t.Errorf("Manager should report memory pressure")
    }

    // check the right pod was killed
    if podKiller.pod != podToEvict {
        t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name)
    }
}
```
```
@@ -1898,21 +1898,8 @@ func (kl *Kubelet) handleMirrorPod(mirrorPod *v1.Pod, start time.Time) {
// a config source.
func (kl *Kubelet) HandlePodAdditions(pods []*v1.Pod) {
    start := kl.clock.Now()

    // Pass critical pods through admission check first.
    var criticalPods []*v1.Pod
    var nonCriticalPods []*v1.Pod
    for _, p := range pods {
        if kubetypes.IsCriticalPod(p) {
            criticalPods = append(criticalPods, p)
        } else {
            nonCriticalPods = append(nonCriticalPods, p)
        }
    }
    sort.Sort(sliceutils.PodsByCreationTime(criticalPods))
    sort.Sort(sliceutils.PodsByCreationTime(nonCriticalPods))

    for _, pod := range append(criticalPods, nonCriticalPods...) {
    sort.Sort(sliceutils.PodsByCreationTime(pods))
    for _, pod := range pods {
        existingPods := kl.podManager.GetPods()
        // Always add the pod to the pod manager. Kubelet relies on the pod
        // manager as the source of truth for the desired state. If a pod does
```
```
@@ -479,69 +479,6 @@ func TestHandlePortConflicts(t *testing.T) {
    require.Equal(t, v1.PodPending, status.Phase)
}

// Tests that we sort pods based on criticality.
func TestCriticalPrioritySorting(t *testing.T) {
    testKubelet := newTestKubelet(t, false /* controllerAttachDetachEnabled */)
    kl := testKubelet.kubelet
    nodes := []v1.Node{
        {ObjectMeta: metav1.ObjectMeta{Name: testKubeletHostname},
            Status: v1.NodeStatus{Capacity: v1.ResourceList{}, Allocatable: v1.ResourceList{
                v1.ResourceCPU: *resource.NewMilliQuantity(10, resource.DecimalSI),
                v1.ResourceMemory: *resource.NewQuantity(100, resource.BinarySI),
                v1.ResourcePods: *resource.NewQuantity(40, resource.DecimalSI),
            }}},
    }
    kl.nodeLister = testNodeLister{nodes: nodes}
    kl.nodeInfo = testNodeInfo{nodes: nodes}
    testKubelet.fakeCadvisor.On("MachineInfo").Return(&cadvisorapi.MachineInfo{}, nil)
    testKubelet.fakeCadvisor.On("ImagesFsInfo").Return(cadvisorapiv2.FsInfo{}, nil)
    testKubelet.fakeCadvisor.On("RootFsInfo").Return(cadvisorapiv2.FsInfo{}, nil)

    spec := v1.PodSpec{NodeName: string(kl.nodeName),
        Containers: []v1.Container{{Resources: v1.ResourceRequirements{
            Requests: v1.ResourceList{
                "memory": resource.MustParse("90"),
            },
        }}},
    }
    pods := []*v1.Pod{
        podWithUidNameNsSpec("000000000", "newpod", "foo", spec),
        podWithUidNameNsSpec("987654321", "oldpod", "foo", spec),
        podWithUidNameNsSpec("123456789", "middlepod", "foo", spec),
    }

    // Pods are not sorted by creation time.
    startTime := time.Now()
    pods[0].CreationTimestamp = metav1.NewTime(startTime.Add(10 * time.Second))
    pods[1].CreationTimestamp = metav1.NewTime(startTime)
    pods[2].CreationTimestamp = metav1.NewTime(startTime.Add(1 * time.Second))

    // Make the middle and new pod critical, the middle pod should win
    // even though it comes later in the list
    critical := map[string]string{kubetypes.CriticalPodAnnotationKey: ""}
    pods[0].Annotations = critical
    pods[1].Annotations = map[string]string{}
    pods[2].Annotations = critical

    // The non-critical pod should be rejected
    notfittingPods := []*v1.Pod{pods[0], pods[1]}
    fittingPod := pods[2]

    kl.HandlePodAdditions(pods)
    // Check pod status stored in the status map.
    // notfittingPod should be Failed
    for _, p := range notfittingPods {
        status, found := kl.statusManager.GetPodStatus(p.UID)
        require.True(t, found, "Status of pod %q is not found in the status map", p.UID)
        require.Equal(t, v1.PodFailed, status.Phase)
    }

    // fittingPod should be Pending
    status, found := kl.statusManager.GetPodStatus(fittingPod.UID)
    require.True(t, found, "Status of pod %q is not found in the status map", fittingPod.UID)
    require.Equal(t, v1.PodPending, status.Phase)
}

// Tests that we handle host name conflicts correctly by setting the failed status in status map.
func TestHandleHostNameConflicts(t *testing.T) {
    testKubelet := newTestKubelet(t, false /* controllerAttachDetachEnabled */)
```
```
@@ -8,6 +8,21 @@ load(
    "go_test",
)

go_test(
    name = "go_default_test",
    srcs = [
        "policy_test.go",
        "qos_test.go",
    ],
    library = ":go_default_library",
    tags = ["automanaged"],
    deps = [
        "//pkg/api/v1:go_default_library",
        "//vendor:k8s.io/apimachinery/pkg/api/resource",
        "//vendor:k8s.io/apimachinery/pkg/apis/meta/v1",
    ],
)

go_library(
    name = "go_default_library",
    srcs = [
```
```
@@ -19,28 +34,11 @@ go_library(
    deps = [
        "//pkg/api:go_default_library",
        "//pkg/api/v1:go_default_library",
        "//pkg/kubelet/types:go_default_library",
        "//vendor:k8s.io/apimachinery/pkg/api/resource",
        "//vendor:k8s.io/apimachinery/pkg/util/sets",
    ],
)

go_test(
    name = "go_default_test",
    srcs = [
        "policy_test.go",
        "qos_test.go",
    ],
    library = ":go_default_library",
    tags = ["automanaged"],
    deps = [
        "//pkg/api/v1:go_default_library",
        "//pkg/kubelet/types:go_default_library",
        "//vendor:k8s.io/apimachinery/pkg/api/resource",
        "//vendor:k8s.io/apimachinery/pkg/apis/meta/v1",
    ],
)

filegroup(
    name = "package-srcs",
    srcs = glob(["**"]),
```
```
@@ -16,20 +16,14 @@ limitations under the License.

package qos

import (
    "k8s.io/kubernetes/pkg/api/v1"
    kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
)
import "k8s.io/kubernetes/pkg/api/v1"

const (
    // PodInfraOOMAdj is very docker specific. For arbitrary runtime, it may not make
    // sense to set sandbox level oom score, e.g. a sandbox could only be a namespace
    // without a process.
    // TODO: Handle infra container oom score adj in a runtime agnostic way.
    // TODO: Should handle critical pod oom score adj with a proper preemption priority.
    // This is the workaround for https://github.com/kubernetes/kubernetes/issues/38322.
    PodInfraOOMAdj int = -998
    CriticalPodOOMAdj int = -998
    KubeletOOMScoreAdj int = -999
    DockerOOMScoreAdj int = -999
    KubeProxyOOMScoreAdj int = -999
```
```
@@ -44,10 +38,6 @@ const (
// and 1000. Containers with higher OOM scores are killed if the system runs out of memory.
// See https://lwn.net/Articles/391222/ for more information.
func GetContainerOOMScoreAdjust(pod *v1.Pod, container *v1.Container, memoryCapacity int64) int {
    if kubetypes.IsCriticalPod(pod) {
        return CriticalPodOOMAdj
    }

    switch GetPodQOS(pod) {
    case v1.PodQOSGuaranteed:
        // Guaranteed containers should be the last to get killed.
```
```
@@ -21,9 +21,7 @@ import (
    "testing"

    "k8s.io/apimachinery/pkg/api/resource"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/kubernetes/pkg/api/v1"
    kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
)

const (
```
```
@@ -137,25 +135,6 @@ var (
            },
        },
    }
    criticalPodWithNoLimit = v1.Pod{
        ObjectMeta: metav1.ObjectMeta{
            Annotations: map[string]string{
                kubetypes.CriticalPodAnnotationKey: "",
            },
        },
        Spec: v1.PodSpec{
            Containers: []v1.Container{
                {
                    Resources: v1.ResourceRequirements{
                        Requests: v1.ResourceList{
                            v1.ResourceName(v1.ResourceMemory): resource.MustParse(strconv.Itoa(standardMemoryAmount - 1)),
                            v1.ResourceName(v1.ResourceCPU): resource.MustParse("5m"),
                        },
                    },
                },
            },
        },
    }
)

type oomTest struct {
```
```
@@ -209,12 +188,6 @@ func TestGetContainerOOMScoreAdjust(t *testing.T) {
            lowOOMScoreAdj: 2,
            highOOMScoreAdj: 2,
        },
        {
            pod: &criticalPodWithNoLimit,
            memoryCapacity: standardMemoryAmount,
            lowOOMScoreAdj: -998,
            highOOMScoreAdj: -998,
        },
    }
    for _, test := range oomTests {
        oomScoreAdj := GetContainerOOMScoreAdjust(test.pod, &test.pod.Spec.Containers[0], test.memoryCapacity)
```
```
@@ -19,6 +19,7 @@ go_library(
    ],
    tags = ["automanaged"],
    deps = [
        "//pkg/api:go_default_library",
        "//pkg/api/v1:go_default_library",
        "//vendor:k8s.io/apimachinery/pkg/apis/meta/v1",
    ],
```
```
@@ -20,22 +20,17 @@ import (
    "fmt"

    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    kubeapi "k8s.io/kubernetes/pkg/api"
    "k8s.io/kubernetes/pkg/api/v1"
)

const ConfigSourceAnnotationKey = "kubernetes.io/config.source"
const ConfigMirrorAnnotationKey = "kubernetes.io/config.mirror"
const ConfigFirstSeenAnnotationKey = "kubernetes.io/config.seen"
const ConfigHashAnnotationKey = "kubernetes.io/config.hash"

// This key needs to sync with the key used by the rescheduler, which currently
// lives in contrib. Its presence indicates 2 things, as far as the kubelet is
// concerned:
// 1. Resource related admission checks will prioritize the admission of
//    pods bearing the key, over pods without the key, regardless of QoS.
// 2. The OOM score of pods bearing the key will be <= pods without
//    the key (where the <= part is determied by QoS).
const CriticalPodAnnotationKey = "scheduler.alpha.kubernetes.io/critical-pod"
const (
    ConfigSourceAnnotationKey = "kubernetes.io/config.source"
    ConfigMirrorAnnotationKey = "kubernetes.io/config.mirror"
    ConfigFirstSeenAnnotationKey = "kubernetes.io/config.seen"
    ConfigHashAnnotationKey = "kubernetes.io/config.hash"
    CriticalPodAnnotationKey = "scheduler.alpha.kubernetes.io/critical-pod"
)

// PodOperation defines what changes will be made on a pod configuration.
type PodOperation int
```
```
@@ -146,6 +141,13 @@ func (sp SyncPodType) String() string {
// key. Both the rescheduler and the kubelet use this key to make admission
// and scheduling decisions.
func IsCriticalPod(pod *v1.Pod) bool {
    _, ok := pod.Annotations[CriticalPodAnnotationKey]
    return ok
    // Critical pods are restricted to "kube-system" namespace as of now.
    if pod.Namespace != kubeapi.NamespaceSystem {
        return false
    }
    val, ok := pod.Annotations[CriticalPodAnnotationKey]
    if ok && val == "" {
        return true
    }
    return false
}
```
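Worth noting about the reworked IsCriticalPod above: the annotation alone is no longer sufficient, the pod must also live in kube-system and the annotation value must be empty. A small illustrative check, assuming the vendored package paths in this tree (pod names are hypothetical):

```go
package main

import (
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/kubernetes/pkg/api/v1"
	kubelettypes "k8s.io/kubernetes/pkg/kubelet/types"
)

func main() {
	annotated := map[string]string{kubelettypes.CriticalPodAnnotationKey: ""}

	inKubeSystem := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "a", Namespace: "kube-system", Annotations: annotated}}
	inDefault := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "b", Namespace: "default", Annotations: annotated}}

	fmt.Println(kubelettypes.IsCriticalPod(inKubeSystem)) // true
	fmt.Println(kubelettypes.IsCriticalPod(inDefault))    // false: only kube-system pods can be critical
}
```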