Auto-create system critical prioity classes at API server startup

pull/8/head
Bobby (Babak) Salamat 2018-02-22 17:38:17 -08:00
parent 79257fe611
commit ebda9584f9
9 changed files with 180 additions and 63 deletions

View File

@ -23,9 +23,17 @@ const (
// that do not specify any priority class and there is no priority class
// marked as default.
DefaultPriorityWhenNoDefaultClassExists = 0
// HighestUserDefinablePriority is the highest priority for user defined priority classes. Priority values larger than 1 billion are reserved for Kubernetes system use.
HighestUserDefinablePriority = int32(1000000000)
// SystemCriticalPriority is the beginning of the range of priority values for critical system components.
SystemCriticalPriority = 2 * HighestUserDefinablePriority
// SystemPriorityClassPrefix is the prefix reserved for system priority class names. Other priority
// classes are not allowed to start with this prefix.
SystemPriorityClassPrefix = "system-"
// NOTE: In order to avoid conflict of names with user-defined priority classes, all the names must
// start with SystemPriorityClassPrefix.
SystemClusterCritical = SystemPriorityClassPrefix + "cluster-critical"
SystemNodeCritical = SystemPriorityClassPrefix + "node-critical"
)
// +genclient

View File

@ -17,8 +17,6 @@ limitations under the License.
package validation
import (
"strings"
"k8s.io/apimachinery/pkg/util/validation/field"
apivalidation "k8s.io/kubernetes/pkg/apis/core/validation"
"k8s.io/kubernetes/pkg/apis/scheduling"
@ -26,12 +24,7 @@ import (
// ValidatePriorityClassName checks whether the given priority class name is valid.
func ValidatePriorityClassName(name string, prefix bool) []string {
var allErrs []string
if strings.HasPrefix(name, scheduling.SystemPriorityClassPrefix) {
allErrs = append(allErrs, "priority class names with '"+scheduling.SystemPriorityClassPrefix+"' prefix are reserved for system use only")
}
allErrs = append(allErrs, apivalidation.NameIsDNSSubdomain(name, prefix)...)
return allErrs
return apivalidation.NameIsDNSSubdomain(name, prefix)
}
// ValidatePriorityClass tests whether required fields in the PriorityClass are

View File

@ -53,10 +53,6 @@ func TestValidatePriorityClass(t *testing.T) {
ObjectMeta: metav1.ObjectMeta{Name: "tier&1", Namespace: ""},
Value: 100,
},
"invalid system name": {
ObjectMeta: metav1.ObjectMeta{Name: scheduling.SystemPriorityClassPrefix + "test"},
Value: 100,
},
}
for k, v := range errorCases {

View File

@ -22,7 +22,7 @@ import (
"k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
kubeapi "k8s.io/kubernetes/pkg/apis/core"
schedulerapi "k8s.io/kubernetes/pkg/scheduler/api"
"k8s.io/kubernetes/pkg/apis/scheduling"
)
const (
@ -168,7 +168,7 @@ func IsCriticalPodBasedOnPriority(ns string, priority int32) bool {
if ns != kubeapi.NamespaceSystem {
return false
}
if priority >= schedulerapi.SystemCriticalPriority {
if priority >= scheduling.SystemCriticalPriority {
return true
}
return false

View File

@ -17,6 +17,15 @@ limitations under the License.
package rest
import (
"fmt"
"time"
"github.com/golang/glog"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/apiserver/pkg/registry/generic"
"k8s.io/apiserver/pkg/registry/rest"
genericapiserver "k8s.io/apiserver/pkg/server"
@ -24,11 +33,16 @@ import (
"k8s.io/kubernetes/pkg/api/legacyscheme"
"k8s.io/kubernetes/pkg/apis/scheduling"
schedulingapiv1alpha1 "k8s.io/kubernetes/pkg/apis/scheduling/v1alpha1"
schedulingclient "k8s.io/kubernetes/pkg/client/clientset_generated/internalclientset/typed/scheduling/internalversion"
priorityclassstore "k8s.io/kubernetes/pkg/registry/scheduling/priorityclass/storage"
)
const PostStartHookName = "scheduling/bootstrap-system-priority-classes"
type RESTStorageProvider struct{}
var _ genericapiserver.PostStartHookProvider = RESTStorageProvider{}
func (p RESTStorageProvider) NewRESTStorage(apiResourceConfigSource serverstorage.APIResourceConfigSource, restOptionsGetter generic.RESTOptionsGetter) (genericapiserver.APIGroupInfo, bool) {
apiGroupInfo := genericapiserver.NewDefaultAPIGroupInfo(scheduling.GroupName, legacyscheme.Registry, legacyscheme.Scheme, legacyscheme.ParameterCodec, legacyscheme.Codecs)
@ -49,6 +63,64 @@ func (p RESTStorageProvider) v1alpha1Storage(apiResourceConfigSource serverstora
return storage
}
func (p RESTStorageProvider) PostStartHook() (string, genericapiserver.PostStartHookFunc, error) {
return PostStartHookName, AddSystemPriorityClasses(), nil
}
func AddSystemPriorityClasses() genericapiserver.PostStartHookFunc {
priorityClasses := []*scheduling.PriorityClass{
{
ObjectMeta: metav1.ObjectMeta{
Name: scheduling.SystemNodeCritical,
},
Value: scheduling.SystemCriticalPriority + 1000,
Description: "Used for system critical pods that must not be moved from their current node.",
},
{
ObjectMeta: metav1.ObjectMeta{
Name: scheduling.SystemClusterCritical,
},
Value: scheduling.SystemCriticalPriority,
Description: "Used for system critical pods that must run in the cluster, but can be moved to another node if necessary.",
},
}
return func(hookContext genericapiserver.PostStartHookContext) error {
// Adding system priority classes is important. If they fail to add, many critical system
// components may fail and cluster may break.
err := wait.Poll(1*time.Second, 30*time.Second, func() (done bool, err error) {
schedClientSet, err := schedulingclient.NewForConfig(hookContext.LoopbackClientConfig)
if err != nil {
utilruntime.HandleError(fmt.Errorf("unable to initialize client: %v", err))
return false, nil
}
for _, pc := range priorityClasses {
_, err := schedClientSet.PriorityClasses().Get(pc.Name, metav1.GetOptions{})
if err != nil {
if apierrors.IsNotFound(err) {
_, err := schedClientSet.PriorityClasses().Create(pc)
if err != nil {
return false, err
} else {
glog.Infof("created PriorityClass %s with value %v", pc.Name, pc.Value)
}
} else {
// Unable to get the priority class for reasons other than "not found".
return false, err
}
}
}
glog.Infof("all system priority classes are created successfully.")
return true, nil
})
// if we're never able to make it through initialization, kill the API server.
if err != nil {
return fmt.Errorf("unable to add default system priority classes: %v", err)
}
return nil
}
}
func (p RESTStorageProvider) GroupName() string {
return scheduling.GroupName
}

View File

@ -36,14 +36,6 @@ const (
MaxPriority = 10
// MaxWeight defines the max weight value.
MaxWeight = MaxInt / MaxPriority
// HighestUserDefinablePriority is the highest priority for user defined priority classes. Priority values larger than 1 billion are reserved for Kubernetes system use.
HighestUserDefinablePriority = int32(1000000000)
// SystemCriticalPriority is the beginning of the range of priority values for critical system components.
SystemCriticalPriority = 2 * HighestUserDefinablePriority
// NOTE: In order to avoid conflict of names with user-defined priority classes, all the names must
// start with scheduling.SystemPriorityClassPrefix which is by default "system-".
SystemClusterCritical = "system-cluster-critical"
SystemNodeCritical = "system-node-critical"
)
// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
@ -259,12 +251,6 @@ type HostPriority struct {
// HostPriorityList declares a []HostPriority type.
type HostPriorityList []HostPriority
// SystemPriorityClasses defines special priority classes which are used by system critical pods that should not be preempted by workload pods.
var SystemPriorityClasses = map[string]int32{
SystemClusterCritical: SystemCriticalPriority,
SystemNodeCritical: SystemCriticalPriority + 1000,
}
func (h HostPriorityList) Len() int {
return len(h)
}

View File

@ -19,10 +19,12 @@ package priority
import (
"fmt"
"io"
"strings"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apiserver/pkg/admission"
"k8s.io/apiserver/pkg/authentication/user"
utilfeature "k8s.io/apiserver/pkg/util/feature"
api "k8s.io/kubernetes/pkg/apis/core"
"k8s.io/kubernetes/pkg/apis/scheduling"
@ -32,7 +34,6 @@ import (
"k8s.io/kubernetes/pkg/features"
kubeapiserveradmission "k8s.io/kubernetes/pkg/kubeapiserver/admission"
kubelettypes "k8s.io/kubernetes/pkg/kubelet/types"
schedulerapi "k8s.io/kubernetes/pkg/scheduler/api"
)
const (
@ -154,7 +155,7 @@ func (p *priorityPlugin) admitPod(a admission.Attributes) error {
if len(pod.Spec.PriorityClassName) == 0 &&
utilfeature.DefaultFeatureGate.Enabled(features.ExperimentalCriticalPodAnnotation) &&
kubelettypes.IsCritical(a.GetNamespace(), pod.Annotations) {
pod.Spec.PriorityClassName = schedulerapi.SystemClusterCritical
pod.Spec.PriorityClassName = scheduling.SystemClusterCritical
}
if len(pod.Spec.PriorityClassName) == 0 {
var err error
@ -163,22 +164,17 @@ func (p *priorityPlugin) admitPod(a admission.Attributes) error {
return fmt.Errorf("failed to get default priority class: %v", err)
}
} else {
// First try to resolve by system priority classes.
priority, ok = schedulerapi.SystemPriorityClasses[pod.Spec.PriorityClassName]
if !ok {
// Now that we didn't find any system priority, try resolving by user defined priority classes.
pc, err := p.lister.Get(pod.Spec.PriorityClassName)
if err != nil {
if errors.IsNotFound(err) {
return admission.NewForbidden(a, fmt.Errorf("no PriorityClass with name %v was found", pod.Spec.PriorityClassName))
}
return fmt.Errorf("failed to get PriorityClass with name %s: %v", pod.Spec.PriorityClassName, err)
// Try resolving the priority class name.
pc, err := p.lister.Get(pod.Spec.PriorityClassName)
if err != nil {
if errors.IsNotFound(err) {
return admission.NewForbidden(a, fmt.Errorf("no PriorityClass with name %v was found", pod.Spec.PriorityClassName))
}
priority = pc.Value
return fmt.Errorf("failed to get PriorityClass with name %s: %v", pod.Spec.PriorityClassName, err)
}
priority = pc.Value
}
pod.Spec.Priority = &priority
}
@ -192,11 +188,15 @@ func (p *priorityPlugin) validatePriorityClass(a admission.Attributes) error {
if !ok {
return errors.NewBadRequest("resource was marked with kind PriorityClass but was unable to be converted")
}
if pc.Value > schedulerapi.HighestUserDefinablePriority {
return admission.NewForbidden(a, fmt.Errorf("maximum allowed value of a user defined priority is %v", schedulerapi.HighestUserDefinablePriority))
}
if _, ok := schedulerapi.SystemPriorityClasses[pc.Name]; ok {
return admission.NewForbidden(a, fmt.Errorf("the name of the priority class is a reserved name for system use only: %v", pc.Name))
// API server adds system critical priority classes at bootstrapping. We should
// not enforce restrictions on adding system level priority classes for API server.
if userInfo := a.GetUserInfo(); userInfo == nil || userInfo.GetName() != user.APIServerUser {
if pc.Value > scheduling.HighestUserDefinablePriority {
return admission.NewForbidden(a, fmt.Errorf("maximum allowed value of a user defined priority is %v", scheduling.HighestUserDefinablePriority))
}
if strings.HasPrefix(pc.Name, scheduling.SystemPriorityClassPrefix) {
return admission.NewForbidden(a, fmt.Errorf("priority class names with '"+scheduling.SystemPriorityClassPrefix+"' prefix are reserved for system use only"))
}
}
// If the new PriorityClass tries to be the default priority, make sure that no other priority class is marked as default.
if pc.GlobalDefault {

View File

@ -24,13 +24,13 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apiserver/pkg/admission"
"k8s.io/apiserver/pkg/authentication/user"
utilfeature "k8s.io/apiserver/pkg/util/feature"
api "k8s.io/kubernetes/pkg/apis/core"
"k8s.io/kubernetes/pkg/apis/scheduling"
informers "k8s.io/kubernetes/pkg/client/informers/informers_generated/internalversion"
"k8s.io/kubernetes/pkg/controller"
"k8s.io/kubernetes/pkg/features"
schedulerapi "k8s.io/kubernetes/pkg/scheduler/api"
)
func addPriorityClasses(ctrl *priorityPlugin, priorityClasses []*scheduling.PriorityClass) {
@ -75,6 +75,17 @@ var nondefaultClass1 = &scheduling.PriorityClass{
Description: "Just a test priority class",
}
var systemClusterCritical = &scheduling.PriorityClass{
TypeMeta: metav1.TypeMeta{
Kind: "PriorityClass",
},
ObjectMeta: metav1.ObjectMeta{
Name: scheduling.SystemClusterCritical,
},
Value: scheduling.SystemCriticalPriority,
GlobalDefault: true,
}
func TestPriorityClassAdmission(t *testing.T) {
var tooHighPriorityClass = &scheduling.PriorityClass{
TypeMeta: metav1.TypeMeta{
@ -83,7 +94,7 @@ func TestPriorityClassAdmission(t *testing.T) {
ObjectMeta: metav1.ObjectMeta{
Name: "toohighclass",
},
Value: schedulerapi.HighestUserDefinablePriority + 1,
Value: scheduling.HighestUserDefinablePriority + 1,
Description: "Just a test priority class",
}
@ -92,42 +103,56 @@ func TestPriorityClassAdmission(t *testing.T) {
Kind: "PriorityClass",
},
ObjectMeta: metav1.ObjectMeta{
Name: schedulerapi.SystemClusterCritical,
Name: scheduling.SystemPriorityClassPrefix + "test",
},
Value: schedulerapi.HighestUserDefinablePriority + 1,
Description: "Name conflicts with system priority class names",
Value: scheduling.HighestUserDefinablePriority + 1,
Description: "Name has system critical prefix",
}
tests := []struct {
name string
existingClasses []*scheduling.PriorityClass
newClass *scheduling.PriorityClass
userInfo user.Info
expectError bool
}{
{
"one default class",
[]*scheduling.PriorityClass{},
defaultClass1,
nil,
false,
},
{
"more than one default classes",
[]*scheduling.PriorityClass{defaultClass1},
defaultClass2,
nil,
true,
},
{
"too high PriorityClass value",
[]*scheduling.PriorityClass{},
tooHighPriorityClass,
nil,
true,
},
{
"system name conflict",
[]*scheduling.PriorityClass{},
systemClass,
nil,
true,
},
{
"system name allowed for API server",
[]*scheduling.PriorityClass{},
systemClass,
&user.DefaultInfo{
Name: user.APIServerUser,
},
false,
},
}
for _, test := range tests {
@ -146,7 +171,7 @@ func TestPriorityClassAdmission(t *testing.T) {
scheduling.Resource("priorityclasses").WithVersion("version"),
"",
admission.Create,
nil,
test.userInfo,
)
err := ctrl.Validate(attrs)
glog.Infof("Got %v", err)
@ -322,7 +347,7 @@ func TestPodAdmission(t *testing.T) {
Name: containerName,
},
},
PriorityClassName: schedulerapi.SystemClusterCritical,
PriorityClassName: scheduling.SystemClusterCritical,
},
},
// pod[5]: mirror Pod with a system priority class name
@ -419,9 +444,9 @@ func TestPodAdmission(t *testing.T) {
},
{
"pod with a system priority class",
[]*scheduling.PriorityClass{},
[]*scheduling.PriorityClass{systemClusterCritical},
*pods[4],
schedulerapi.SystemCriticalPriority,
scheduling.SystemCriticalPriority,
false,
},
{
@ -440,9 +465,9 @@ func TestPodAdmission(t *testing.T) {
},
{
"mirror pod with system priority class",
[]*scheduling.PriorityClass{},
[]*scheduling.PriorityClass{systemClusterCritical},
*pods[5],
schedulerapi.SystemCriticalPriority,
scheduling.SystemCriticalPriority,
false,
},
{
@ -454,9 +479,9 @@ func TestPodAdmission(t *testing.T) {
},
{
"pod with critical pod annotation",
[]*scheduling.PriorityClass{},
[]*scheduling.PriorityClass{systemClusterCritical},
*pods[7],
schedulerapi.SystemCriticalPriority,
scheduling.SystemCriticalPriority,
false,
},
}

View File

@ -26,7 +26,7 @@ import (
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
clientset "k8s.io/client-go/kubernetes"
schedulerapi "k8s.io/kubernetes/pkg/scheduler/api"
"k8s.io/kubernetes/pkg/apis/scheduling"
"k8s.io/kubernetes/test/e2e/framework"
. "github.com/onsi/ginkgo"
@ -168,7 +168,7 @@ var _ = SIGDescribe("SchedulerPreemption [Serial] [Feature:PodPreemption]", func
// Create a critical pod and make sure it is scheduled.
runPausePod(f, pausePodConfig{
Name: "critical-pod",
PriorityClassName: schedulerapi.SystemClusterCritical,
PriorityClassName: scheduling.SystemClusterCritical,
Resources: &v1.ResourceRequirements{
Requests: podRes,
},
@ -311,3 +311,40 @@ var _ = SIGDescribe("SchedulerPreemption [Serial] [Feature:PodPreemption]", func
}
})
})
var _ = SIGDescribe("PodPriorityResolution [Serial] [Feature:PodPreemption]", func() {
var cs clientset.Interface
var ns string
f := framework.NewDefaultFramework("sched-pod-priority")
BeforeEach(func() {
cs = f.ClientSet
ns = f.Namespace.Name
err := framework.CheckTestingNSDeletedExcept(cs, ns)
framework.ExpectNoError(err)
})
// This test verifies that when a higher priority pod is created and no node with
// enough resources is found, scheduler preempts a lower priority pod to schedule
// the high priority pod.
It("validates critical system priorities are created and resolved", func() {
var podRes v1.ResourceList
// Create pods that use system critical priorities and
By("Create pods that use critical system priorities.")
systemPriorityClasses := []string{
scheduling.SystemNodeCritical, scheduling.SystemClusterCritical,
}
for i, spc := range systemPriorityClasses {
pod := createPausePod(f, pausePodConfig{
Name: fmt.Sprintf("pod%d-%v", i, spc),
PriorityClassName: spc,
Resources: &v1.ResourceRequirements{
Requests: podRes,
},
})
Expect(pod.Spec.Priority).NotTo(BeNil())
framework.Logf("Created pod: %v", pod.Name)
}
})
})