mirror of https://github.com/k3s-io/k3s
Merge pull request #65593 from bsalamat/priority_admission
Automatic merge from submit-queue. If you want to cherry-pick this change to another branch, please follow the instructions <a href="https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md">here</a>. Limit usage of system critical priority classes to the system namespace **What this PR does / why we need it**: Changes Priority admission controller to limit usage of system critical priority classes to the system namespace. This change is needed to mitigate the risk of creating many pods at system critical priority levels that could cause preemption of system critical components. **Which issue(s) this PR fixes** *(optional, in `fixes #<issue number>(, fixes #<issue_number>, ...)` format, will close the issue(s) when PR gets merged)*: Fixes # ref/ #65557 **Special notes for your reviewer**: **Release note**: ```release-note Limit the usage of system-node-critical and system-cluster-critical priority classes to kube-system namespace. ``` /sig schedulingpull/8/head
commit
7496c64b46
|
@ -38,6 +38,7 @@ go_library(
|
|||
"//pkg/kubeapiserver/admission:go_default_library",
|
||||
"//pkg/kubelet/types:go_default_library",
|
||||
"//staging/src/k8s.io/apimachinery/pkg/api/errors:go_default_library",
|
||||
"//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
|
||||
"//staging/src/k8s.io/apimachinery/pkg/labels:go_default_library",
|
||||
"//staging/src/k8s.io/apiserver/pkg/admission:go_default_library",
|
||||
"//staging/src/k8s.io/apiserver/pkg/util/feature:go_default_library",
|
||||
|
|
|
@ -21,6 +21,7 @@ import (
|
|||
"io"
|
||||
|
||||
"k8s.io/apimachinery/pkg/api/errors"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/apimachinery/pkg/labels"
|
||||
"k8s.io/apiserver/pkg/admission"
|
||||
utilfeature "k8s.io/apiserver/pkg/util/feature"
|
||||
|
@ -134,6 +135,20 @@ func (p *priorityPlugin) Validate(a admission.Attributes) error {
|
|||
}
|
||||
}
|
||||
|
||||
// priorityClassPermittedInNamespace returns true if we allow the given priority class name in the
|
||||
// given namespace. It currently checks that system priorities are created only in the system namespace.
|
||||
func priorityClassPermittedInNamespace(priorityClassName string, namespace string) bool {
|
||||
// Only allow system priorities in the system namespace. This is to prevent abuse or incorrect
|
||||
// usage of these priorities. Pods created at these priorities could preempt system critical
|
||||
// components.
|
||||
for _, spc := range scheduling.SystemPriorityClasses() {
|
||||
if spc.Name == priorityClassName && namespace != metav1.NamespaceSystem {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// admitPod makes sure a new pod does not set spec.Priority field. It also makes sure that the PriorityClassName exists if it is provided and resolves the pod priority from the PriorityClassName.
|
||||
func (p *priorityPlugin) admitPod(a admission.Attributes) error {
|
||||
operation := a.GetOperation()
|
||||
|
@ -162,6 +177,11 @@ func (p *priorityPlugin) admitPod(a admission.Attributes) error {
|
|||
return fmt.Errorf("failed to get default priority class: %v", err)
|
||||
}
|
||||
} else {
|
||||
pcName := pod.Spec.PriorityClassName
|
||||
if !priorityClassPermittedInNamespace(pcName, a.GetNamespace()) {
|
||||
return admission.NewForbidden(a, fmt.Errorf("pods with %v priorityClass is not permitted in %v namespace", pcName, a.GetNamespace()))
|
||||
}
|
||||
|
||||
// Try resolving the priority class name.
|
||||
pc, err := p.lister.Get(pod.Spec.PriorityClassName)
|
||||
if err != nil {
|
||||
|
|
|
@ -314,7 +314,7 @@ func TestPodAdmission(t *testing.T) {
|
|||
{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "pod-w-system-priority",
|
||||
Namespace: "namespace",
|
||||
Namespace: metav1.NamespaceSystem,
|
||||
},
|
||||
Spec: api.PodSpec{
|
||||
Containers: []api.Container{
|
||||
|
@ -329,7 +329,7 @@ func TestPodAdmission(t *testing.T) {
|
|||
{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "mirror-pod-w-system-priority",
|
||||
Namespace: "namespace",
|
||||
Namespace: metav1.NamespaceSystem,
|
||||
Annotations: map[string]string{api.MirrorPodAnnotationKey: ""},
|
||||
},
|
||||
Spec: api.PodSpec{
|
||||
|
@ -374,6 +374,21 @@ func TestPodAdmission(t *testing.T) {
|
|||
},
|
||||
},
|
||||
},
|
||||
// pod[8]: Pod with a system priority class name in non-system namespace
|
||||
{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "pod-w-system-priority-in-nonsystem-namespace",
|
||||
Namespace: "non-system-namespace",
|
||||
},
|
||||
Spec: api.PodSpec{
|
||||
Containers: []api.Container{
|
||||
{
|
||||
Name: containerName,
|
||||
},
|
||||
},
|
||||
PriorityClassName: scheduling.SystemClusterCritical,
|
||||
},
|
||||
},
|
||||
}
|
||||
// Enable PodPriority feature gate.
|
||||
utilfeature.DefaultFeatureGate.Set(fmt.Sprintf("%s=true", features.PodPriority))
|
||||
|
@ -459,6 +474,13 @@ func TestPodAdmission(t *testing.T) {
|
|||
scheduling.SystemCriticalPriority,
|
||||
false,
|
||||
},
|
||||
{
|
||||
"pod with system critical priority in non-system namespace",
|
||||
[]*scheduling.PriorityClass{systemClusterCritical},
|
||||
*pods[8],
|
||||
scheduling.SystemCriticalPriority,
|
||||
true,
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
|
@ -485,8 +507,7 @@ func TestPodAdmission(t *testing.T) {
|
|||
if !test.expectError {
|
||||
if err != nil {
|
||||
t.Errorf("Test %q: unexpected error received: %v", test.name, err)
|
||||
}
|
||||
if *test.pod.Spec.Priority != test.expectedPriority {
|
||||
} else if *test.pod.Spec.Priority != test.expectedPriority {
|
||||
t.Errorf("Test %q: expected priority is %d, but got %d.", test.name, test.expectedPriority, *test.pod.Spec.Priority)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -47,6 +47,7 @@ var masterNodes sets.String
|
|||
|
||||
type pausePodConfig struct {
|
||||
Name string
|
||||
Namespace string
|
||||
Affinity *v1.Affinity
|
||||
Annotations, Labels, NodeSelector map[string]string
|
||||
Resources *v1.ResourceRequirements
|
||||
|
@ -602,9 +603,11 @@ var _ = SIGDescribe("SchedulerPredicates [Serial]", func() {
|
|||
})
|
||||
|
||||
func initPausePod(f *framework.Framework, conf pausePodConfig) *v1.Pod {
|
||||
var gracePeriod = int64(1)
|
||||
pod := &v1.Pod{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: conf.Name,
|
||||
Namespace: conf.Namespace,
|
||||
Labels: conf.Labels,
|
||||
Annotations: conf.Annotations,
|
||||
OwnerReferences: conf.OwnerReferences,
|
||||
|
@ -619,9 +622,10 @@ func initPausePod(f *framework.Framework, conf pausePodConfig) *v1.Pod {
|
|||
Ports: conf.Ports,
|
||||
},
|
||||
},
|
||||
Tolerations: conf.Tolerations,
|
||||
NodeName: conf.NodeName,
|
||||
PriorityClassName: conf.PriorityClassName,
|
||||
Tolerations: conf.Tolerations,
|
||||
NodeName: conf.NodeName,
|
||||
PriorityClassName: conf.PriorityClassName,
|
||||
TerminationGracePeriodSeconds: &gracePeriod,
|
||||
},
|
||||
}
|
||||
if conf.Resources != nil {
|
||||
|
@ -631,7 +635,11 @@ func initPausePod(f *framework.Framework, conf pausePodConfig) *v1.Pod {
|
|||
}
|
||||
|
||||
func createPausePod(f *framework.Framework, conf pausePodConfig) *v1.Pod {
|
||||
pod, err := f.ClientSet.CoreV1().Pods(f.Namespace.Name).Create(initPausePod(f, conf))
|
||||
namespace := conf.Namespace
|
||||
if len(namespace) == 0 {
|
||||
namespace = f.Namespace.Name
|
||||
}
|
||||
pod, err := f.ClientSet.CoreV1().Pods(namespace).Create(initPausePod(f, conf))
|
||||
framework.ExpectNoError(err)
|
||||
return pod
|
||||
}
|
||||
|
@ -639,7 +647,7 @@ func createPausePod(f *framework.Framework, conf pausePodConfig) *v1.Pod {
|
|||
func runPausePod(f *framework.Framework, conf pausePodConfig) *v1.Pod {
|
||||
pod := createPausePod(f, conf)
|
||||
framework.ExpectNoError(framework.WaitForPodRunningInNamespace(f.ClientSet, pod))
|
||||
pod, err := f.ClientSet.CoreV1().Pods(f.Namespace.Name).Get(conf.Name, metav1.GetOptions{})
|
||||
pod, err := f.ClientSet.CoreV1().Pods(pod.Namespace).Get(conf.Name, metav1.GetOptions{})
|
||||
framework.ExpectNoError(err)
|
||||
return pod
|
||||
}
|
||||
|
|
|
@ -167,6 +167,7 @@ var _ = SIGDescribe("SchedulerPreemption [Serial] [Feature:PodPreemption]", func
|
|||
// Create a critical pod and make sure it is scheduled.
|
||||
runPausePod(f, pausePodConfig{
|
||||
Name: "critical-pod",
|
||||
Namespace: metav1.NamespaceSystem,
|
||||
PriorityClassName: scheduling.SystemClusterCritical,
|
||||
Resources: &v1.ResourceRequirements{
|
||||
Requests: podRes,
|
||||
|
@ -183,6 +184,9 @@ var _ = SIGDescribe("SchedulerPreemption [Serial] [Feature:PodPreemption]", func
|
|||
framework.ExpectNoError(err)
|
||||
Expect(livePod.DeletionTimestamp).To(BeNil())
|
||||
}
|
||||
// Clean-up the critical pod
|
||||
err = f.ClientSet.CoreV1().Pods(metav1.NamespaceSystem).Delete("critical-pod", metav1.NewDeleteOptions(0))
|
||||
framework.ExpectNoError(err)
|
||||
})
|
||||
|
||||
// This test verifies that when a high priority pod is pending and its
|
||||
|
@ -334,10 +338,14 @@ var _ = SIGDescribe("PodPriorityResolution [Serial] [Feature:PodPreemption]", fu
|
|||
for i, spc := range systemPriorityClasses {
|
||||
pod := createPausePod(f, pausePodConfig{
|
||||
Name: fmt.Sprintf("pod%d-%v", i, spc),
|
||||
Namespace: metav1.NamespaceSystem,
|
||||
PriorityClassName: spc,
|
||||
})
|
||||
Expect(pod.Spec.Priority).NotTo(BeNil())
|
||||
framework.Logf("Created pod: %v", pod.Name)
|
||||
// Clean-up the pod.
|
||||
err := f.ClientSet.CoreV1().Pods(pod.Namespace).Delete(pod.Name, metav1.NewDeleteOptions(0))
|
||||
framework.ExpectNoError(err)
|
||||
}
|
||||
})
|
||||
})
|
||||
|
|
|
@ -60,7 +60,6 @@ spec:
|
|||
annotations:
|
||||
scheduler.alpha.kubernetes.io/critical-pod: ''
|
||||
spec:
|
||||
priorityClassName: system-cluster-critical
|
||||
tolerations:
|
||||
- key: "CriticalAddonsOnly"
|
||||
operator: "Exists"
|
||||
|
|
Loading…
Reference in New Issue