Critical pods priorityClass addition

pull/6/head
ravisantoshgudimetla 2018-02-12 14:04:19 -05:00
parent fdeaa8c67a
commit 68c20ad770
6 changed files with 139 additions and 29 deletions

View File

@ -22,6 +22,7 @@ import (
"k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
kubeapi "k8s.io/kubernetes/pkg/apis/core"
schedulerapi "k8s.io/kubernetes/pkg/scheduler/api"
)
const (
@ -139,15 +140,16 @@ func (sp SyncPodType) String() string {
}
}
// IsCriticalPod returns true if the pod bears the critical pod annotation
// key. Both the rescheduler and the kubelet use this key to make admission
// and scheduling decisions.
// IsCriticalPod returns true if the pod bears the critical pod annotation key or if pod's priority is greater than
// or equal to SystemCriticalPriority. Both the rescheduler(deprecated in 1.10) and the kubelet use this function
// to make admission and scheduling decisions.
func IsCriticalPod(pod *v1.Pod) bool {
return IsCritical(pod.Namespace, pod.Annotations)
return IsCritical(pod.Namespace, pod.Annotations) || (pod.Spec.Priority != nil && IsCriticalPodBasedOnPriority(pod.Namespace, *pod.Spec.Priority))
}
// IsCritical returns true if parameters bear the critical pod annotation
// key. The DaemonSetController use this key directly to make scheduling decisions.
// TODO: @ravig - Deprecated. Remove this when we move to resolving critical pods based on priorityClassName.
func IsCritical(ns string, annotations map[string]string) bool {
// Critical pods are restricted to "kube-system" namespace as of now.
if ns != kubeapi.NamespaceSystem {
@ -159,3 +161,15 @@ func IsCritical(ns string, annotations map[string]string) bool {
}
return false
}
// IsCriticalPodBasedOnPriority checks if the given pod is a critical pod based on priority resolved from pod Spec.
func IsCriticalPodBasedOnPriority(ns string, priority int32) bool {
// Critical pods are restricted to "kube-system" namespace as of now.
if ns != kubeapi.NamespaceSystem {
return false
}
if priority >= schedulerapi.SystemCriticalPriority {
return true
}
return false
}

View File

@ -158,7 +158,7 @@ func TestIsCriticalPod(t *testing.T) {
{
pod: v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "pod3",
Name: "pod4",
Namespace: "kube-system",
Annotations: map[string]string{
"scheduler.alpha.kubernetes.io/critical-pod": "",

View File

@ -36,6 +36,14 @@ const (
MaxPriority = 10
// MaxWeight defines the max weight value.
MaxWeight = MaxInt / MaxPriority
// HighestUserDefinablePriority is the highest priority for user defined priority classes. Priority values larger than 1 billion are reserved for Kubernetes system use.
HighestUserDefinablePriority = int32(1000000000)
// SystemCriticalPriority is the beginning of the range of priority values for critical system components.
SystemCriticalPriority = 2 * HighestUserDefinablePriority
// NOTE: In order to avoid conflict of names with user-defined priority classes, all the names must
// start with scheduling.SystemPriorityClassPrefix which is by default "system-".
SystemClusterCritical = "system-cluster-critical"
SystemNodeCritical = "system-node-critical"
)
// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
@ -231,6 +239,12 @@ type HostPriority struct {
// HostPriorityList declares a []HostPriority type.
type HostPriorityList []HostPriority
// SystemPriorityClasses defines special priority classes which are used by system critical pods that should not be preempted by workload pods.
var SystemPriorityClasses = map[string]int32{
SystemClusterCritical: SystemCriticalPriority,
SystemNodeCritical: SystemCriticalPriority + 1000,
}
func (h HostPriorityList) Len() int {
return len(h)
}

View File

@ -31,26 +31,15 @@ import (
schedulinglisters "k8s.io/kubernetes/pkg/client/listers/scheduling/internalversion"
"k8s.io/kubernetes/pkg/features"
kubeapiserveradmission "k8s.io/kubernetes/pkg/kubeapiserver/admission"
kubelettypes "k8s.io/kubernetes/pkg/kubelet/types"
schedulerapi "k8s.io/kubernetes/pkg/scheduler/api"
)
const (
// PluginName indicates name of admission plugin.
PluginName = "Priority"
// HighestUserDefinablePriority is the highest priority for user defined priority classes. Priority values larger than 1 billion are reserved for Kubernetes system use.
HighestUserDefinablePriority = 1000000000
// SystemCriticalPriority is the beginning of the range of priority values for critical system components.
SystemCriticalPriority = 2 * HighestUserDefinablePriority
)
// SystemPriorityClasses defines special priority classes which are used by system critical pods that should not be preempted by workload pods.
// NOTE: In order to avoid conflict of names with user-defined priority classes, all the names must
// start with scheduling.SystemPriorityClassPrefix which is by default "system-".
var SystemPriorityClasses = map[string]int32{
"system-cluster-critical": SystemCriticalPriority,
"system-node-critical": SystemCriticalPriority + 1000,
}
// Register registers a plugin
func Register(plugins *admission.Plugins) {
plugins.Register(PluginName, func(config io.Reader) (admission.Interface, error) {
@ -166,6 +155,13 @@ func (p *PriorityPlugin) admitPod(a admission.Attributes) error {
}
if utilfeature.DefaultFeatureGate.Enabled(features.PodPriority) {
var priority int32
// TODO: @ravig - This is for backwards compatibility to ensure that critical pods with annotations just work fine.
// Remove when no longer needed.
if len(pod.Spec.PriorityClassName) == 0 &&
utilfeature.DefaultFeatureGate.Enabled(features.ExperimentalCriticalPodAnnotation) &&
kubelettypes.IsCritical(a.GetNamespace(), pod.Annotations) {
pod.Spec.PriorityClassName = schedulerapi.SystemClusterCritical
}
if len(pod.Spec.PriorityClassName) == 0 {
var err error
priority, err = p.getDefaultPriority()
@ -174,7 +170,7 @@ func (p *PriorityPlugin) admitPod(a admission.Attributes) error {
}
} else {
// First try to resolve by system priority classes.
priority, ok = SystemPriorityClasses[pod.Spec.PriorityClassName]
priority, ok = schedulerapi.SystemPriorityClasses[pod.Spec.PriorityClassName]
if !ok {
// Now that we didn't find any system priority, try resolving by user defined priority classes.
pc, err := p.lister.Get(pod.Spec.PriorityClassName)
@ -202,10 +198,10 @@ func (p *PriorityPlugin) validatePriorityClass(a admission.Attributes) error {
if !ok {
return errors.NewBadRequest("resource was marked with kind PriorityClass but was unable to be converted")
}
if pc.Value > HighestUserDefinablePriority {
return admission.NewForbidden(a, fmt.Errorf("maximum allowed value of a user defined priority is %v", HighestUserDefinablePriority))
if pc.Value > schedulerapi.HighestUserDefinablePriority {
return admission.NewForbidden(a, fmt.Errorf("maximum allowed value of a user defined priority is %v", schedulerapi.HighestUserDefinablePriority))
}
if _, ok := SystemPriorityClasses[pc.Name]; ok {
if _, ok := schedulerapi.SystemPriorityClasses[pc.Name]; ok {
return admission.NewForbidden(a, fmt.Errorf("the name of the priority class is a reserved name for system use only: %v", pc.Name))
}
// If the new PriorityClass tries to be the default priority, make sure that no other priority class is marked as default.

View File

@ -30,6 +30,7 @@ import (
informers "k8s.io/kubernetes/pkg/client/informers/informers_generated/internalversion"
"k8s.io/kubernetes/pkg/controller"
"k8s.io/kubernetes/pkg/features"
schedulerapi "k8s.io/kubernetes/pkg/scheduler/api"
)
func addPriorityClasses(ctrl *PriorityPlugin, priorityClasses []*scheduling.PriorityClass) {
@ -82,7 +83,7 @@ func TestPriorityClassAdmission(t *testing.T) {
ObjectMeta: metav1.ObjectMeta{
Name: "toohighclass",
},
Value: HighestUserDefinablePriority + 1,
Value: schedulerapi.HighestUserDefinablePriority + 1,
Description: "Just a test priority class",
}
@ -91,9 +92,9 @@ func TestPriorityClassAdmission(t *testing.T) {
Kind: "PriorityClass",
},
ObjectMeta: metav1.ObjectMeta{
Name: "system-cluster-critical",
Name: schedulerapi.SystemClusterCritical,
},
Value: HighestUserDefinablePriority + 1,
Value: schedulerapi.HighestUserDefinablePriority + 1,
Description: "Name conflicts with system priority class names",
}
@ -313,7 +314,7 @@ func TestPodAdmission(t *testing.T) {
Name: containerName,
},
},
PriorityClassName: "system-cluster-critical",
PriorityClassName: schedulerapi.SystemClusterCritical,
},
},
// pod[5]: mirror Pod with a system priority class name
@ -349,9 +350,27 @@ func TestPodAdmission(t *testing.T) {
Priority: &intPriority,
},
},
// pod[7]: Pod with a critical priority annotation. This needs to be automatically assigned
// system-cluster-critical
{
ObjectMeta: metav1.ObjectMeta{
Name: "pod-w-system-priority",
Namespace: "kube-system",
Annotations: map[string]string{"scheduler.alpha.kubernetes.io/critical-pod": ""},
},
Spec: api.PodSpec{
Containers: []api.Container{
{
Name: containerName,
},
},
},
},
}
// Enable PodPriority feature gate.
utilfeature.DefaultFeatureGate.Set(fmt.Sprintf("%s=true", features.PodPriority))
// Enable ExperimentalCriticalPodAnnotation feature gate.
utilfeature.DefaultFeatureGate.Set(fmt.Sprintf("%s=true", features.ExperimentalCriticalPodAnnotation))
tests := []struct {
name string
existingClasses []*scheduling.PriorityClass
@ -394,7 +413,7 @@ func TestPodAdmission(t *testing.T) {
"pod with a system priority class",
[]*scheduling.PriorityClass{},
*pods[4],
SystemCriticalPriority,
schedulerapi.SystemCriticalPriority,
false,
},
{
@ -415,7 +434,7 @@ func TestPodAdmission(t *testing.T) {
"mirror pod with system priority class",
[]*scheduling.PriorityClass{},
*pods[5],
SystemCriticalPriority,
schedulerapi.SystemCriticalPriority,
false,
},
{
@ -425,6 +444,13 @@ func TestPodAdmission(t *testing.T) {
0,
true,
},
{
"pod with critical pod annotation",
[]*scheduling.PriorityClass{},
*pods[7],
schedulerapi.SystemCriticalPriority,
false,
},
}
for _, test := range tests {

View File

@ -26,6 +26,7 @@ import (
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
clientset "k8s.io/client-go/kubernetes"
schedulerapi "k8s.io/kubernetes/pkg/scheduler/api"
"k8s.io/kubernetes/test/e2e/framework"
. "github.com/onsi/ginkgo"
@ -43,7 +44,6 @@ var _ = SIGDescribe("SchedulerPreemption [Serial] [Feature:PodPreemption]", func
lowPriorityClassName := f.BaseName + "-low-priority"
mediumPriorityClassName := f.BaseName + "-medium-priority"
highPriorityClassName := f.BaseName + "-high-priority"
AfterEach(func() {
})
@ -126,6 +126,66 @@ var _ = SIGDescribe("SchedulerPreemption [Serial] [Feature:PodPreemption]", func
}
})
// This test verifies that when a critical pod is created and no node with
// enough resources is found, scheduler preempts a lower priority pod to schedule
// this critical pod.
It("validates lower priority pod preemption by critical pod", func() {
var podRes v1.ResourceList
// Create one pod per node that uses a lot of the node's resources.
By("Create pods that use 60% of node resources.")
pods := make([]*v1.Pod, len(nodeList.Items))
for i, node := range nodeList.Items {
cpuAllocatable, found := node.Status.Allocatable["cpu"]
Expect(found).To(Equal(true))
milliCPU := cpuAllocatable.MilliValue() * 40 / 100
memAllocatable, found := node.Status.Allocatable["memory"]
Expect(found).To(Equal(true))
memory := memAllocatable.Value() * 60 / 100
podRes = v1.ResourceList{}
podRes[v1.ResourceCPU] = *resource.NewMilliQuantity(int64(milliCPU), resource.DecimalSI)
podRes[v1.ResourceMemory] = *resource.NewQuantity(int64(memory), resource.BinarySI)
// make the first pod low priority and the rest medium priority.
priorityName := mediumPriorityClassName
if i == 0 {
priorityName = lowPriorityClassName
}
pods[i] = createPausePod(f, pausePodConfig{
Name: fmt.Sprintf("pod%d-%v", i, priorityName),
PriorityClassName: priorityName,
Resources: &v1.ResourceRequirements{
Requests: podRes,
},
})
framework.Logf("Created pod: %v", pods[i].Name)
}
By("Wait for pods to be scheduled.")
for _, pod := range pods {
framework.ExpectNoError(framework.WaitForPodRunningInNamespace(cs, pod))
}
By("Run a critical pod that use 60% of a node resources.")
// Create a critical pod and make sure it is scheduled.
runPausePod(f, pausePodConfig{
Name: "critical-pod",
PriorityClassName: schedulerapi.SystemClusterCritical,
Resources: &v1.ResourceRequirements{
Requests: podRes,
},
})
// Make sure that the lowest priority pod is deleted.
preemptedPod, err := cs.CoreV1().Pods(pods[0].Namespace).Get(pods[0].Name, metav1.GetOptions{})
podDeleted := (err != nil && errors.IsNotFound(err)) ||
(err == nil && preemptedPod.DeletionTimestamp != nil)
Expect(podDeleted).To(BeTrue())
// Other pods (mid priority ones) should be present.
for i := 1; i < len(pods); i++ {
livePod, err := cs.CoreV1().Pods(pods[i].Namespace).Get(pods[i].Name, metav1.GetOptions{})
framework.ExpectNoError(err)
Expect(livePod.DeletionTimestamp).To(BeNil())
}
})
// This test verifies that when a high priority pod is pending and its
// scheduling violates a medium priority pod anti-affinity, the medium priority
// pod is preempted to allow the higher priority pod schedule.