mirror of https://github.com/k3s-io/k3s
Merge pull request #40655 from vishh/flag-gate-critical-pod-annotation
Automatic merge from submit-queue

Optionally avoid evicting critical pods in kubelet

For #40573

```release-note
When the feature gate "ExperimentalCriticalPodAnnotation" is set, the kubelet will avoid evicting pods in the "kube-system" namespace that carry the special annotation `scheduler.alpha.kubernetes.io/critical-pod`.
This feature should be used in conjunction with the rescheduler to guarantee availability for critical system pods: https://kubernetes.io/docs/admin/rescheduler/
```
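To make the release note concrete, here is a minimal, illustrative Go sketch (not part of the diff) that marks a pod as critical and evaluates the same gate-plus-annotation condition the eviction manager uses in the hunks below. The `main` wrapper and the pod name are hypothetical; the import paths are the ones vendored in this tree, and the error returned by `Set` is ignored for brevity.

```go
package main

import (
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	utilfeature "k8s.io/apiserver/pkg/util/feature"
	"k8s.io/kubernetes/pkg/api/v1"
	"k8s.io/kubernetes/pkg/features"
	kubelettypes "k8s.io/kubernetes/pkg/kubelet/types"
)

func main() {
	// Roughly equivalent to starting the kubelet with
	// --feature-gates=ExperimentalCriticalPodAnnotation=true.
	utilfeature.DefaultFeatureGate.Set("ExperimentalCriticalPodAnnotation=true")

	// A kube-system pod carrying the critical-pod annotation.
	pod := &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "kube-proxy",  // hypothetical pod, for illustration only
			Namespace: "kube-system", // critical pods must live in kube-system
			Annotations: map[string]string{
				kubelettypes.CriticalPodAnnotationKey: "", // scheduler.alpha.kubernetes.io/critical-pod
			},
		},
	}

	// The eviction manager skips a pod only when the gate is on AND the pod is critical.
	skip := utilfeature.DefaultFeatureGate.Enabled(features.ExperimentalCriticalPodAnnotation) &&
		kubelettypes.IsCriticalPod(pod)
	fmt.Println("skip eviction:", skip) // true
}
```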
commit
f20b4fc67f
```
@@ -123,7 +123,7 @@ fi
RUNTIME_CONFIG="${KUBE_RUNTIME_CONFIG:-}"

# Optional: set feature gates
FEATURE_GATES="${KUBE_FEATURE_GATES:-}"
FEATURE_GATES="${KUBE_FEATURE_GATES:-ExperimentalCriticalPodAnnotation=true}"

# Optional: Install cluster DNS.
ENABLE_CLUSTER_DNS="${KUBE_ENABLE_CLUSTER_DNS:-true}"
```
```
@@ -83,7 +83,7 @@ MASTER_IP_RANGE="${MASTER_IP_RANGE:-10.246.0.0/24}"
RUNTIME_CONFIG="${KUBE_RUNTIME_CONFIG:-}"

# Optional: set feature gates
FEATURE_GATES="${KUBE_FEATURE_GATES:-}"
FEATURE_GATES="${KUBE_FEATURE_GATES:-ExperimentalCriticalPodAnnotation=true}"

TERMINATED_POD_GC_THRESHOLD=${TERMINATED_POD_GC_THRESHOLD:-100}
```
```
@@ -38,9 +38,8 @@ kind: Pod
metadata:
  name: kube-proxy
  namespace: kube-system
  # This annotation lowers the possibility that kube-proxy gets evicted when the
  # node is under memory pressure, and prioritizes it for admission, even if
  # the node is under memory pressure.
  # This annotation ensures that kube-proxy does not get evicted if the node
  # supports critical pod annotation based priority scheme.
  # Note that kube-proxy runs as a static pod so this annotation does NOT have
  # any effect on rescheduler (default scheduler and rescheduler are not
  # involved in scheduling kube-proxy).
```
```
@@ -58,6 +58,14 @@ const (
    // contains a privileged container, or specific non-namespaced capabilities (MKNOD, SYS_MODULE,
    // SYS_TIME). This should only be enabled if user namespace remapping is enabled in the docker daemon.
    ExperimentalHostUserNamespaceDefaultingGate utilfeature.Feature = "ExperimentalHostUserNamespaceDefaulting"

    // owner: @vishh
    // alpha: v1.5
    //
    // Ensures guaranteed scheduling of pods marked with a special pod annotation `scheduler.alpha.kubernetes.io/critical-pod`
    // and also prevents them from being evicted from a node.
    // Note: This feature is not supported for `BestEffort` pods.
    ExperimentalCriticalPodAnnotation utilfeature.Feature = "ExperimentalCriticalPodAnnotation"
)

func init() {
```
```
@@ -73,6 +81,7 @@ var defaultKubernetesFeatureGates = map[utilfeature.Feature]utilfeature.FeatureS
    DynamicKubeletConfig:                        {Default: false, PreRelease: utilfeature.Alpha},
    DynamicVolumeProvisioning:                   {Default: true, PreRelease: utilfeature.Alpha},
    ExperimentalHostUserNamespaceDefaultingGate: {Default: false, PreRelease: utilfeature.Beta},
    ExperimentalCriticalPodAnnotation:           {Default: false, PreRelease: utilfeature.Alpha},

    // inherited features from generic apiserver, relisted here to get a conflict if it is changed
    // unintentionally on either side:
```
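For context, a brief sketch of how this alpha gate behaves once registered: it is off by default (`Default: false` above), so the critical-pod handling added elsewhere in this PR is inert unless the gate is explicitly enabled. The `main` wrapper is illustrative only and ignores the error returned by `Set`; the import paths are the vendored ones.

```go
package main

import (
	"fmt"

	utilfeature "k8s.io/apiserver/pkg/util/feature"
	"k8s.io/kubernetes/pkg/features"
)

func main() {
	// Alpha gates default to off.
	fmt.Println(utilfeature.DefaultFeatureGate.Enabled(features.ExperimentalCriticalPodAnnotation)) // false

	// Flip the gate at runtime, the same way the new eviction-manager test does.
	utilfeature.DefaultFeatureGate.Set("ExperimentalCriticalPodAnnotation=true")
	fmt.Println(utilfeature.DefaultFeatureGate.Enabled(features.ExperimentalCriticalPodAnnotation)) // true
}
```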
```
@@ -9,37 +9,16 @@ load(
    "go_test",
)

go_library(
    name = "go_default_library",
    srcs = [
        "doc.go",
        "eviction_manager.go",
        "helpers.go",
        "types.go",
cgo_genrule(
    name = "cgo_codegen",
    srcs = ["threshold_notifier_linux.go"],
    clinkopts = [
        "-lz",
        "-lm",
        "-lpthread",
        "-ldl",
    ],
    library = ":cgo_codegen",
    tags = ["automanaged"],
    deps = [
        "//pkg/api:go_default_library",
        "//pkg/api/v1:go_default_library",
        "//pkg/kubelet/api/v1alpha1/stats:go_default_library",
        "//pkg/kubelet/cm:go_default_library",
        "//pkg/kubelet/lifecycle:go_default_library",
        "//pkg/kubelet/pod:go_default_library",
        "//pkg/kubelet/qos:go_default_library",
        "//pkg/kubelet/server/stats:go_default_library",
        "//pkg/kubelet/types:go_default_library",
        "//pkg/kubelet/util/format:go_default_library",
        "//pkg/quota/evaluator/core:go_default_library",
        "//vendor:github.com/golang/glog",
        "//vendor:k8s.io/apimachinery/pkg/api/resource",
        "//vendor:k8s.io/apimachinery/pkg/apis/meta/v1",
        "//vendor:k8s.io/apimachinery/pkg/util/sets",
        "//vendor:k8s.io/apimachinery/pkg/util/wait",
        "//vendor:k8s.io/client-go/pkg/api/v1",
        "//vendor:k8s.io/client-go/tools/record",
        "//vendor:k8s.io/client-go/util/clock",
    ],
)

go_test(
```
```
@@ -60,22 +39,45 @@ go_test(
        "//vendor:k8s.io/apimachinery/pkg/api/resource",
        "//vendor:k8s.io/apimachinery/pkg/apis/meta/v1",
        "//vendor:k8s.io/apimachinery/pkg/types",
        "//vendor:k8s.io/apiserver/pkg/util/feature",
        "//vendor:k8s.io/client-go/pkg/api/v1",
        "//vendor:k8s.io/client-go/tools/record",
        "//vendor:k8s.io/client-go/util/clock",
    ],
)

cgo_genrule(
    name = "cgo_codegen",
    srcs = ["threshold_notifier_linux.go"],
    clinkopts = [
        "-lz",
        "-lm",
        "-lpthread",
        "-ldl",
go_library(
    name = "go_default_library",
    srcs = [
        "doc.go",
        "eviction_manager.go",
        "helpers.go",
        "types.go",
    ],
    library = ":cgo_codegen",
    tags = ["automanaged"],
    deps = [
        "//pkg/api:go_default_library",
        "//pkg/api/v1:go_default_library",
        "//pkg/features:go_default_library",
        "//pkg/kubelet/api/v1alpha1/stats:go_default_library",
        "//pkg/kubelet/cm:go_default_library",
        "//pkg/kubelet/lifecycle:go_default_library",
        "//pkg/kubelet/qos:go_default_library",
        "//pkg/kubelet/server/stats:go_default_library",
        "//pkg/kubelet/types:go_default_library",
        "//pkg/kubelet/util/format:go_default_library",
        "//pkg/quota/evaluator/core:go_default_library",
        "//vendor:github.com/golang/glog",
        "//vendor:k8s.io/apimachinery/pkg/api/resource",
        "//vendor:k8s.io/apimachinery/pkg/apis/meta/v1",
        "//vendor:k8s.io/apimachinery/pkg/util/sets",
        "//vendor:k8s.io/apimachinery/pkg/util/wait",
        "//vendor:k8s.io/apiserver/pkg/util/feature",
        "//vendor:k8s.io/client-go/pkg/api/v1",
        "//vendor:k8s.io/client-go/tools/record",
        "//vendor:k8s.io/client-go/util/clock",
    ],
)

filegroup(
```
```
@@ -25,16 +25,17 @@ import (
    "github.com/golang/glog"
    "k8s.io/apimachinery/pkg/api/resource"
    "k8s.io/apimachinery/pkg/util/wait"
    utilfeature "k8s.io/apiserver/pkg/util/feature"
    clientv1 "k8s.io/client-go/pkg/api/v1"
    "k8s.io/client-go/tools/record"
    "k8s.io/client-go/util/clock"
    "k8s.io/kubernetes/pkg/api/v1"
    "k8s.io/kubernetes/pkg/features"
    "k8s.io/kubernetes/pkg/kubelet/cm"
    "k8s.io/kubernetes/pkg/kubelet/lifecycle"
    kubepod "k8s.io/kubernetes/pkg/kubelet/pod"
    "k8s.io/kubernetes/pkg/kubelet/qos"
    "k8s.io/kubernetes/pkg/kubelet/server/stats"
    kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
    kubelettypes "k8s.io/kubernetes/pkg/kubelet/types"
    "k8s.io/kubernetes/pkg/kubelet/util/format"
)
```
```
@@ -111,7 +112,7 @@ func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAd
    // the node has memory pressure, admit if not best-effort
    if hasNodeCondition(m.nodeConditions, v1.NodeMemoryPressure) {
        notBestEffort := v1.PodQOSBestEffort != qos.GetPodQOS(attrs.Pod)
        if notBestEffort || kubetypes.IsCriticalPod(attrs.Pod) {
        if notBestEffort {
            return lifecycle.PodAdmitResult{Admit: true}
        }
    }
```
```
@@ -313,13 +314,10 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
    // we kill at most a single pod during each eviction interval
    for i := range activePods {
        pod := activePods[i]
        if kubepod.IsStaticPod(pod) {
            // The eviction manager doesn't evict static pods. To stop a static
            // pod, the admin needs to remove the manifest from kubelet's
            // --config directory.
            // TODO(39124): This is a short term fix, we can't assume static pods
            // are always well behaved.
            glog.Infof("eviction manager: NOT evicting static pod %v", pod.Name)
        // If the pod is marked as critical and support for critical pod annotations is enabled,
        // do not evict such pods. Once Kubelet supports preemptions, these pods can be safely evicted.
        if utilfeature.DefaultFeatureGate.Enabled(features.ExperimentalCriticalPodAnnotation) &&
            kubelettypes.IsCriticalPod(pod) {
            continue
        }
        status := v1.PodStatus{
```
```
@@ -22,13 +22,15 @@ import (

    "k8s.io/apimachinery/pkg/api/resource"
    "k8s.io/apimachinery/pkg/types"
    utilfeature "k8s.io/apiserver/pkg/util/feature"
    clientv1 "k8s.io/client-go/pkg/api/v1"
    "k8s.io/client-go/tools/record"
    "k8s.io/client-go/util/clock"
    kubeapi "k8s.io/kubernetes/pkg/api"
    "k8s.io/kubernetes/pkg/api/v1"
    statsapi "k8s.io/kubernetes/pkg/kubelet/api/v1alpha1/stats"
    "k8s.io/kubernetes/pkg/kubelet/lifecycle"
    kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
    kubelettypes "k8s.io/kubernetes/pkg/kubelet/types"
)

// mockPodKiller is used to testing which pod is killed
```
```
@@ -212,8 +214,6 @@ func TestMemoryPressure(t *testing.T) {
    // create a best effort pod to test admission
    bestEffortPodToAdmit, _ := podMaker("best-admit", newResourceList("", ""), newResourceList("", ""), "0Gi")
    burstablePodToAdmit, _ := podMaker("burst-admit", newResourceList("100m", "100Mi"), newResourceList("200m", "200Mi"), "0Gi")
    criticalBestEffortPodToAdmit, _ := podMaker("critical-best-admit", newResourceList("", ""), newResourceList("", ""), "0Gi")
    criticalBestEffortPodToAdmit.ObjectMeta.Annotations = map[string]string{kubetypes.CriticalPodAnnotationKey: ""}

    // synchronize
    manager.synchronize(diskInfoProvider, activePodsFunc)
```
```
@@ -224,8 +224,8 @@ func TestMemoryPressure(t *testing.T) {
    }

    // try to admit our pods (they should succeed)
    expected := []bool{true, true, true}
    for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit, criticalBestEffortPodToAdmit} {
    expected := []bool{true, true}
    for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit} {
        if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit {
            t.Errorf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit)
        }
```
```
@@ -300,10 +300,9 @@ func TestMemoryPressure(t *testing.T) {
        t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
    }

    // the best-effort pod without critical annotation should not admit,
    // burstable and critical pods should
    expected = []bool{false, true, true}
    for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit, criticalBestEffortPodToAdmit} {
    // the best-effort pod should not admit, burstable should
    expected = []bool{false, true}
    for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit} {
        if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit {
            t.Errorf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit)
        }
```
```
@@ -325,9 +324,9 @@ func TestMemoryPressure(t *testing.T) {
        t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod.Name)
    }

    // the best-effort pod should not admit, burstable and critical pods should
    expected = []bool{false, true, true}
    for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit, criticalBestEffortPodToAdmit} {
    // the best-effort pod should not admit, burstable should
    expected = []bool{false, true}
    for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit} {
        if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit {
            t.Errorf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit)
        }
```
```
@@ -350,8 +349,8 @@ func TestMemoryPressure(t *testing.T) {
    }

    // all pods should admit now
    expected = []bool{true, true, true}
    for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit, criticalBestEffortPodToAdmit} {
    expected = []bool{true, true}
    for i, pod := range []*v1.Pod{bestEffortPodToAdmit, burstablePodToAdmit} {
        if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit {
            t.Errorf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit)
        }
```
```
@@ -1089,3 +1088,135 @@ func TestInodePressureNodeFsInodes(t *testing.T) {
        t.Errorf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, true, result.Admit)
    }
}

// TestCriticalPodsAreNotEvicted
func TestCriticalPodsAreNotEvicted(t *testing.T) {
    podMaker := makePodWithMemoryStats
    summaryStatsMaker := makeMemoryStats
    podsToMake := []podToMake{
        {name: "critical", requests: newResourceList("100m", "1Gi"), limits: newResourceList("100m", "1Gi"), memoryWorkingSet: "800Mi"},
    }
    pods := []*v1.Pod{}
    podStats := map[*v1.Pod]statsapi.PodStats{}
    for _, podToMake := range podsToMake {
        pod, podStat := podMaker(podToMake.name, podToMake.requests, podToMake.limits, podToMake.memoryWorkingSet)
        pods = append(pods, pod)
        podStats[pod] = podStat
    }

    // Mark the pod as critical
    pods[0].Annotations = map[string]string{
        kubelettypes.CriticalPodAnnotationKey: "",
    }
    pods[0].Namespace = kubeapi.NamespaceSystem

    podToEvict := pods[0]
    activePodsFunc := func() []*v1.Pod {
        return pods
    }

    fakeClock := clock.NewFakeClock(time.Now())
    podKiller := &mockPodKiller{}
    diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false}
    imageGC := &mockImageGC{freed: int64(0), err: nil}
    nodeRef := &clientv1.ObjectReference{
        Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: "",
    }

    config := Config{
        MaxPodGracePeriodSeconds: 5,
        PressureTransitionPeriod: time.Minute * 5,
        Thresholds: []Threshold{
            {
                Signal: SignalMemoryAvailable,
                Operator: OpLessThan,
                Value: ThresholdValue{
                    Quantity: quantityMustParse("1Gi"),
                },
            },
            {
                Signal: SignalMemoryAvailable,
                Operator: OpLessThan,
                Value: ThresholdValue{
                    Quantity: quantityMustParse("2Gi"),
                },
                GracePeriod: time.Minute * 2,
            },
        },
    }
    summaryProvider := &fakeSummaryProvider{result: summaryStatsMaker("2Gi", podStats)}
    manager := &managerImpl{
        clock: fakeClock,
        killPodFunc: podKiller.killPodNow,
        imageGC: imageGC,
        config: config,
        recorder: &record.FakeRecorder{},
        summaryProvider: summaryProvider,
        nodeRef: nodeRef,
        nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
        thresholdsFirstObservedAt: thresholdsObservedAt{},
    }

    // Enable critical pod annotation feature gate
    utilfeature.DefaultFeatureGate.Set("ExperimentalCriticalPodAnnotation=True")
    // induce soft threshold
    fakeClock.Step(1 * time.Minute)
    summaryProvider.result = summaryStatsMaker("1500Mi", podStats)
    manager.synchronize(diskInfoProvider, activePodsFunc)

    // we should have memory pressure
    if !manager.IsUnderMemoryPressure() {
        t.Errorf("Manager should report memory pressure since soft threshold was met")
    }

    // verify no pod was yet killed because there has not yet been enough time passed.
    if podKiller.pod != nil {
        t.Errorf("Manager should not have killed a pod yet, but killed: %v", podKiller.pod.Name)
    }

    // step forward in time pass the grace period
    fakeClock.Step(3 * time.Minute)
    summaryProvider.result = summaryStatsMaker("1500Mi", podStats)
    manager.synchronize(diskInfoProvider, activePodsFunc)

    // we should have memory pressure
    if !manager.IsUnderMemoryPressure() {
        t.Errorf("Manager should report memory pressure since soft threshold was met")
    }

    // verify the right pod was killed with the right grace period.
    if podKiller.pod == podToEvict {
        t.Errorf("Manager chose to kill critical pod: %v, but should have ignored it", podKiller.pod.Name)
    }
    // reset state
    podKiller.pod = nil
    podKiller.gracePeriodOverride = nil

    // remove memory pressure
    fakeClock.Step(20 * time.Minute)
    summaryProvider.result = summaryStatsMaker("3Gi", podStats)
    manager.synchronize(diskInfoProvider, activePodsFunc)

    // we should not have memory pressure
    if manager.IsUnderMemoryPressure() {
        t.Errorf("Manager should not report memory pressure")
    }

    // Disable critical pod annotation feature gate
    utilfeature.DefaultFeatureGate.Set("ExperimentalCriticalPodAnnotation=False")

    // induce memory pressure!
    fakeClock.Step(1 * time.Minute)
    summaryProvider.result = summaryStatsMaker("500Mi", podStats)
    manager.synchronize(diskInfoProvider, activePodsFunc)

    // we should have memory pressure
    if !manager.IsUnderMemoryPressure() {
        t.Errorf("Manager should report memory pressure")
    }

    // check the right pod was killed
    if podKiller.pod != podToEvict {
        t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name)
    }
}
```
```
@@ -1898,21 +1898,8 @@ func (kl *Kubelet) handleMirrorPod(mirrorPod *v1.Pod, start time.Time) {
// a config source.
func (kl *Kubelet) HandlePodAdditions(pods []*v1.Pod) {
    start := kl.clock.Now()

    // Pass critical pods through admission check first.
    var criticalPods []*v1.Pod
    var nonCriticalPods []*v1.Pod
    for _, p := range pods {
        if kubetypes.IsCriticalPod(p) {
            criticalPods = append(criticalPods, p)
        } else {
            nonCriticalPods = append(nonCriticalPods, p)
        }
    }
    sort.Sort(sliceutils.PodsByCreationTime(criticalPods))
    sort.Sort(sliceutils.PodsByCreationTime(nonCriticalPods))

    for _, pod := range append(criticalPods, nonCriticalPods...) {
    sort.Sort(sliceutils.PodsByCreationTime(pods))
    for _, pod := range pods {
        existingPods := kl.podManager.GetPods()
        // Always add the pod to the pod manager. Kubelet relies on the pod
        // manager as the source of truth for the desired state. If a pod does
```
```
@@ -479,69 +479,6 @@ func TestHandlePortConflicts(t *testing.T) {
    require.Equal(t, v1.PodPending, status.Phase)
}

// Tests that we sort pods based on criticality.
func TestCriticalPrioritySorting(t *testing.T) {
    testKubelet := newTestKubelet(t, false /* controllerAttachDetachEnabled */)
    kl := testKubelet.kubelet
    nodes := []v1.Node{
        {ObjectMeta: metav1.ObjectMeta{Name: testKubeletHostname},
            Status: v1.NodeStatus{Capacity: v1.ResourceList{}, Allocatable: v1.ResourceList{
                v1.ResourceCPU: *resource.NewMilliQuantity(10, resource.DecimalSI),
                v1.ResourceMemory: *resource.NewQuantity(100, resource.BinarySI),
                v1.ResourcePods: *resource.NewQuantity(40, resource.DecimalSI),
            }}},
    }
    kl.nodeLister = testNodeLister{nodes: nodes}
    kl.nodeInfo = testNodeInfo{nodes: nodes}
    testKubelet.fakeCadvisor.On("MachineInfo").Return(&cadvisorapi.MachineInfo{}, nil)
    testKubelet.fakeCadvisor.On("ImagesFsInfo").Return(cadvisorapiv2.FsInfo{}, nil)
    testKubelet.fakeCadvisor.On("RootFsInfo").Return(cadvisorapiv2.FsInfo{}, nil)

    spec := v1.PodSpec{NodeName: string(kl.nodeName),
        Containers: []v1.Container{{Resources: v1.ResourceRequirements{
            Requests: v1.ResourceList{
                "memory": resource.MustParse("90"),
            },
        }}},
    }
    pods := []*v1.Pod{
        podWithUidNameNsSpec("000000000", "newpod", "foo", spec),
        podWithUidNameNsSpec("987654321", "oldpod", "foo", spec),
        podWithUidNameNsSpec("123456789", "middlepod", "foo", spec),
    }

    // Pods are not sorted by creation time.
    startTime := time.Now()
    pods[0].CreationTimestamp = metav1.NewTime(startTime.Add(10 * time.Second))
    pods[1].CreationTimestamp = metav1.NewTime(startTime)
    pods[2].CreationTimestamp = metav1.NewTime(startTime.Add(1 * time.Second))

    // Make the middle and new pod critical, the middle pod should win
    // even though it comes later in the list
    critical := map[string]string{kubetypes.CriticalPodAnnotationKey: ""}
    pods[0].Annotations = critical
    pods[1].Annotations = map[string]string{}
    pods[2].Annotations = critical

    // The non-critical pod should be rejected
    notfittingPods := []*v1.Pod{pods[0], pods[1]}
    fittingPod := pods[2]

    kl.HandlePodAdditions(pods)
    // Check pod status stored in the status map.
    // notfittingPod should be Failed
    for _, p := range notfittingPods {
        status, found := kl.statusManager.GetPodStatus(p.UID)
        require.True(t, found, "Status of pod %q is not found in the status map", p.UID)
        require.Equal(t, v1.PodFailed, status.Phase)
    }

    // fittingPod should be Pending
    status, found := kl.statusManager.GetPodStatus(fittingPod.UID)
    require.True(t, found, "Status of pod %q is not found in the status map", fittingPod.UID)
    require.Equal(t, v1.PodPending, status.Phase)
}

// Tests that we handle host name conflicts correctly by setting the failed status in status map.
func TestHandleHostNameConflicts(t *testing.T) {
    testKubelet := newTestKubelet(t, false /* controllerAttachDetachEnabled */)
```
```
@@ -8,6 +8,21 @@ load(
    "go_test",
)

go_test(
    name = "go_default_test",
    srcs = [
        "policy_test.go",
        "qos_test.go",
    ],
    library = ":go_default_library",
    tags = ["automanaged"],
    deps = [
        "//pkg/api/v1:go_default_library",
        "//vendor:k8s.io/apimachinery/pkg/api/resource",
        "//vendor:k8s.io/apimachinery/pkg/apis/meta/v1",
    ],
)

go_library(
    name = "go_default_library",
    srcs = [
```
```
@@ -19,28 +34,11 @@ go_library(
    deps = [
        "//pkg/api:go_default_library",
        "//pkg/api/v1:go_default_library",
        "//pkg/kubelet/types:go_default_library",
        "//vendor:k8s.io/apimachinery/pkg/api/resource",
        "//vendor:k8s.io/apimachinery/pkg/util/sets",
    ],
)

go_test(
    name = "go_default_test",
    srcs = [
        "policy_test.go",
        "qos_test.go",
    ],
    library = ":go_default_library",
    tags = ["automanaged"],
    deps = [
        "//pkg/api/v1:go_default_library",
        "//pkg/kubelet/types:go_default_library",
        "//vendor:k8s.io/apimachinery/pkg/api/resource",
        "//vendor:k8s.io/apimachinery/pkg/apis/meta/v1",
    ],
)

filegroup(
    name = "package-srcs",
    srcs = glob(["**"]),
```
```
@@ -16,20 +16,14 @@ limitations under the License.

package qos

import (
    "k8s.io/kubernetes/pkg/api/v1"
    kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
)
import "k8s.io/kubernetes/pkg/api/v1"

const (
    // PodInfraOOMAdj is very docker specific. For arbitrary runtime, it may not make
    // sense to set sandbox level oom score, e.g. a sandbox could only be a namespace
    // without a process.
    // TODO: Handle infra container oom score adj in a runtime agnostic way.
    // TODO: Should handle critical pod oom score adj with a proper preemption priority.
    // This is the workaround for https://github.com/kubernetes/kubernetes/issues/38322.
    PodInfraOOMAdj int = -998
    CriticalPodOOMAdj int = -998
    KubeletOOMScoreAdj int = -999
    DockerOOMScoreAdj int = -999
    KubeProxyOOMScoreAdj int = -999
```
```
@@ -44,10 +38,6 @@ const (
// and 1000. Containers with higher OOM scores are killed if the system runs out of memory.
// See https://lwn.net/Articles/391222/ for more information.
func GetContainerOOMScoreAdjust(pod *v1.Pod, container *v1.Container, memoryCapacity int64) int {
    if kubetypes.IsCriticalPod(pod) {
        return CriticalPodOOMAdj
    }

    switch GetPodQOS(pod) {
    case v1.PodQOSGuaranteed:
        // Guaranteed containers should be the last to get killed.
```
```
@@ -21,9 +21,7 @@ import (
    "testing"

    "k8s.io/apimachinery/pkg/api/resource"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/kubernetes/pkg/api/v1"
    kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
)

const (
```
```
@@ -137,25 +135,6 @@ var (
            },
        },
    }
    criticalPodWithNoLimit = v1.Pod{
        ObjectMeta: metav1.ObjectMeta{
            Annotations: map[string]string{
                kubetypes.CriticalPodAnnotationKey: "",
            },
        },
        Spec: v1.PodSpec{
            Containers: []v1.Container{
                {
                    Resources: v1.ResourceRequirements{
                        Requests: v1.ResourceList{
                            v1.ResourceName(v1.ResourceMemory): resource.MustParse(strconv.Itoa(standardMemoryAmount - 1)),
                            v1.ResourceName(v1.ResourceCPU): resource.MustParse("5m"),
                        },
                    },
                },
            },
        },
    }
)

type oomTest struct {
```
```
@@ -209,12 +188,6 @@ func TestGetContainerOOMScoreAdjust(t *testing.T) {
            lowOOMScoreAdj: 2,
            highOOMScoreAdj: 2,
        },
        {
            pod: &criticalPodWithNoLimit,
            memoryCapacity: standardMemoryAmount,
            lowOOMScoreAdj: -998,
            highOOMScoreAdj: -998,
        },
    }
    for _, test := range oomTests {
        oomScoreAdj := GetContainerOOMScoreAdjust(test.pod, &test.pod.Spec.Containers[0], test.memoryCapacity)
```
```
@@ -19,6 +19,7 @@ go_library(
    ],
    tags = ["automanaged"],
    deps = [
        "//pkg/api:go_default_library",
        "//pkg/api/v1:go_default_library",
        "//vendor:k8s.io/apimachinery/pkg/apis/meta/v1",
    ],
```
```
@@ -20,22 +20,17 @@ import (
    "fmt"

    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    kubeapi "k8s.io/kubernetes/pkg/api"
    "k8s.io/kubernetes/pkg/api/v1"
)

const ConfigSourceAnnotationKey = "kubernetes.io/config.source"
const ConfigMirrorAnnotationKey = "kubernetes.io/config.mirror"
const ConfigFirstSeenAnnotationKey = "kubernetes.io/config.seen"
const ConfigHashAnnotationKey = "kubernetes.io/config.hash"

// This key needs to sync with the key used by the rescheduler, which currently
// lives in contrib. Its presence indicates 2 things, as far as the kubelet is
// concerned:
// 1. Resource related admission checks will prioritize the admission of
//    pods bearing the key, over pods without the key, regardless of QoS.
// 2. The OOM score of pods bearing the key will be <= pods without
//    the key (where the <= part is determied by QoS).
const CriticalPodAnnotationKey = "scheduler.alpha.kubernetes.io/critical-pod"
const (
    ConfigSourceAnnotationKey = "kubernetes.io/config.source"
    ConfigMirrorAnnotationKey = "kubernetes.io/config.mirror"
    ConfigFirstSeenAnnotationKey = "kubernetes.io/config.seen"
    ConfigHashAnnotationKey = "kubernetes.io/config.hash"
    CriticalPodAnnotationKey = "scheduler.alpha.kubernetes.io/critical-pod"
)

// PodOperation defines what changes will be made on a pod configuration.
type PodOperation int
```
```
@@ -146,6 +141,13 @@ func (sp SyncPodType) String() string {
// key. Both the rescheduler and the kubelet use this key to make admission
// and scheduling decisions.
func IsCriticalPod(pod *v1.Pod) bool {
    _, ok := pod.Annotations[CriticalPodAnnotationKey]
    return ok
    // Critical pods are restricted to "kube-system" namespace as of now.
    if pod.Namespace != kubeapi.NamespaceSystem {
        return false
    }
    val, ok := pod.Annotations[CriticalPodAnnotationKey]
    if ok && val == "" {
        return true
    }
    return false
}
```
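Worth noting about the reworked IsCriticalPod above: the annotation alone is no longer sufficient, the pod must also live in kube-system and the annotation value must be empty. A small illustrative check, assuming the vendored package paths in this tree (pod names are hypothetical):

```go
package main

import (
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/kubernetes/pkg/api/v1"
	kubelettypes "k8s.io/kubernetes/pkg/kubelet/types"
)

func main() {
	annotated := map[string]string{kubelettypes.CriticalPodAnnotationKey: ""}

	inKubeSystem := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "a", Namespace: "kube-system", Annotations: annotated}}
	inDefault := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "b", Namespace: "default", Annotations: annotated}}

	fmt.Println(kubelettypes.IsCriticalPod(inKubeSystem)) // true
	fmt.Println(kubelettypes.IsCriticalPod(inDefault))    // false: only kube-system pods can be critical
}
```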