diff --git a/pkg/api/helper/helpers.go b/pkg/api/helper/helpers.go
index e181d4a249..636d1dda0e 100644
--- a/pkg/api/helper/helpers.go
+++ b/pkg/api/helper/helpers.go
@@ -31,6 +31,30 @@ import (
 	"k8s.io/kubernetes/pkg/api"
 )
 
+// IsHugePageResourceName returns true if the resource name has the huge page
+// resource prefix. Only the prefix is checked; the size suffix is not parsed.
+func IsHugePageResourceName(name api.ResourceName) bool {
+	return strings.HasPrefix(string(name), api.ResourceHugePagesPrefix)
+}
+
+// HugePageResourceName returns a ResourceName with the canonical hugepage
+// prefix prepended for the specified page size. The page size is converted
+// to its canonical representation.
+func HugePageResourceName(pageSize resource.Quantity) api.ResourceName {
+	return api.ResourceName(api.ResourceHugePagesPrefix + pageSize.String())
+}
+
+// HugePageSizeFromResourceName returns the page size for the specified huge page
+// resource name. If the specified input is not a valid huge page resource name
+// an error is returned.
+func HugePageSizeFromResourceName(name api.ResourceName) (resource.Quantity, error) {
+	if !IsHugePageResourceName(name) {
+		return resource.Quantity{}, fmt.Errorf("resource name: %s is not valid hugepage name", name)
+	}
+	pageSize := strings.TrimPrefix(string(name), api.ResourceHugePagesPrefix)
+	return resource.ParseQuantity(pageSize)
+}
+
 // NonConvertibleFields iterates over the provided map and filters out all but
 // any keys with the "non-convertible.kubernetes.io" prefix.
 func NonConvertibleFields(annotations map[string]string) map[string]string {
@@ -113,7 +137,7 @@ var standardContainerResources = sets.NewString(
 // IsStandardContainerResourceName returns true if the container can make a resource request
 // for the specified resource
 func IsStandardContainerResourceName(str string) bool {
-	return standardContainerResources.Has(str)
+	return standardContainerResources.Has(str) || IsHugePageResourceName(api.ResourceName(str))
 }
 
 // IsExtendedResourceName returns true if the resource name is not in the
@@ -153,6 +177,7 @@ var overcommitBlacklist = sets.NewString(string(api.ResourceNvidiaGPU))
 // namespace and not blacklisted.
 func IsOvercommitAllowed(name api.ResourceName) bool {
 	return IsDefaultNamespaceResource(name) &&
+		!IsHugePageResourceName(name) &&
 		!overcommitBlacklist.Has(string(name))
 }
 
@@ -220,7 +245,7 @@ var standardResources = sets.NewString(
 
 // IsStandardResourceName returns true if the resource is known to the system
 func IsStandardResourceName(str string) bool {
-	return standardResources.Has(str)
+	return standardResources.Has(str) || IsHugePageResourceName(api.ResourceName(str))
 }
 
 var integerResources = sets.NewString(
diff --git a/pkg/api/helper/helpers_test.go b/pkg/api/helper/helpers_test.go
index f14f50d638..7631b8bba0 100644
--- a/pkg/api/helper/helpers_test.go
+++ b/pkg/api/helper/helpers_test.go
@@ -58,10 +58,28 @@ func TestIsStandardResource(t *testing.T) {
 		{"disk", false},
 		{"blah", false},
 		{"x.y.z", false},
+		{"hugepages-2Mi", true},
 	}
 	for i, tc := range testCases {
 		if IsStandardResourceName(tc.input) != tc.output {
-			t.Errorf("case[%d], expected: %t, got: %t", i, tc.output, !tc.output)
+			t.Errorf("case[%d], input: %s, expected: %t, got: %t", i, tc.input, tc.output, !tc.output)
+		}
+	}
+}
+
+func TestIsStandardContainerResource(t *testing.T) {
+	testCases := []struct {
+		input  string
+		output bool
+	}{
+		{"cpu", true},
+		{"memory", true},
+		{"disk", false},
+		{"hugepages-2Mi", true},
+	}
+	for i, tc := range testCases {
+		if IsStandardContainerResourceName(tc.input) != tc.output {
+			t.Errorf("case[%d], input: %s, expected: %t, got: %t", i, tc.input, tc.output, !tc.output)
 		}
 	}
 }
@@ -353,3 +371,120 @@ func TestGetNodeAffinityFromAnnotations(t *testing.T) {
 		}
 	}
 }
+
+func TestIsHugePageResourceName(t *testing.T) {
+	testCases := []struct {
+		name   api.ResourceName
+		result bool
+	}{
+		{
+			name:   api.ResourceName("hugepages-2Mi"),
+			result: true,
+		},
+		{
+			name:   api.ResourceName("hugepages-1Gi"),
+			result: true,
+		},
+		{
+			name:   api.ResourceName("cpu"),
+			result: false,
+		},
+		{
+			name:   api.ResourceName("memory"),
+			result: false,
+		},
+	}
+	for _, testCase := range testCases {
+		if testCase.result != IsHugePageResourceName(testCase.name) {
+			t.Errorf("resource: %v expected result: %v", testCase.name, testCase.result)
+		}
+	}
+}
+
+func TestHugePageResourceName(t *testing.T) {
+	testCases := []struct {
+		pageSize resource.Quantity
+		name     api.ResourceName
+	}{
+		{
+			pageSize: resource.MustParse("2Mi"),
+			name:     api.ResourceName("hugepages-2Mi"),
+		},
+		{
+			pageSize: resource.MustParse("1Gi"),
+			name:     api.ResourceName("hugepages-1Gi"),
+		},
+		{
+			// verify we do not regress our canonical representation
+			pageSize: *resource.NewQuantity(int64(2097152), resource.BinarySI),
+			name:     api.ResourceName("hugepages-2Mi"),
+		},
+	}
+	for _, testCase := range testCases {
+		if result := HugePageResourceName(testCase.pageSize); result != testCase.name {
+			t.Errorf("pageSize: %v, expected: %v, but got: %v", testCase.pageSize.String(), testCase.name, result.String())
+		}
+	}
+}
+
+func TestHugePageSizeFromResourceName(t *testing.T) {
+	testCases := []struct {
+		name      api.ResourceName
+		expectErr bool
+		pageSize  resource.Quantity
+	}{
+		{
+			name:      api.ResourceName("hugepages-2Mi"),
+			pageSize:  resource.MustParse("2Mi"),
+			expectErr: false,
+		},
+		{
+			name:      api.ResourceName("hugepages-1Gi"),
+			pageSize:  resource.MustParse("1Gi"),
+			expectErr: false,
+		},
+		{
+			name:      api.ResourceName("hugepages-bad"),
+			expectErr: true,
+		},
+	}
+	for _, testCase := range testCases {
+		value, err := HugePageSizeFromResourceName(testCase.name)
+		if testCase.expectErr && err == nil {
+			t.Errorf("Expected an error for %v", testCase.name)
+		} else if !testCase.expectErr && err != nil {
+			t.Errorf("Unexpected error for %v, got %v", testCase.name, err)
+		} else if testCase.pageSize.Value() != value.Value() {
+			t.Errorf("Unexpected pageSize for resource %v got %v", testCase.name, value.String())
+		}
+	}
+}
+
+func TestIsOvercommitAllowed(t *testing.T) {
+	testCases := []struct {
+		name    api.ResourceName
+		allowed bool
+	}{
+		{
+			name:    api.ResourceCPU,
+			allowed: true,
+		},
+		{
+			name:    api.ResourceMemory,
+			allowed: true,
+		},
+		{
+			name:    api.ResourceNvidiaGPU,
+			allowed: false,
+		},
+		{
+			name:    HugePageResourceName(resource.MustParse("2Mi")),
+			allowed: false,
+		},
+	}
+	for _, testCase := range testCases {
+		if testCase.allowed != IsOvercommitAllowed(testCase.name) {
+			t.Errorf("Unexpected result for %v", testCase.name)
+		}
+	}
+}
diff --git a/pkg/api/types.go b/pkg/api/types.go
index 089b20aa8d..c6e57efe47 100644
--- a/pkg/api/types.go
+++ b/pkg/api/types.go
@@ -3339,6 +3339,8 @@ const (
 	ResourceOpaqueIntPrefix = "pod.alpha.kubernetes.io/opaque-int-resource-"
 	// Default namespace prefix.
 	ResourceDefaultNamespacePrefix = "kubernetes.io/"
+	// Name prefix for huge page resources (alpha).
+	ResourceHugePagesPrefix = "hugepages-"
 )
 
 // ResourceList is a set of (resource name, quantity) pairs.
diff --git a/pkg/api/v1/helper/BUILD b/pkg/api/v1/helper/BUILD
index 6377e7e182..f7e8849de7 100644
--- a/pkg/api/v1/helper/BUILD
+++ b/pkg/api/v1/helper/BUILD
@@ -24,6 +24,7 @@ go_library(
     deps = [
         "//pkg/api/helper:go_default_library",
         "//vendor/k8s.io/api/core/v1:go_default_library",
+        "//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library",
         "//vendor/k8s.io/apimachinery/pkg/labels:go_default_library",
         "//vendor/k8s.io/apimachinery/pkg/selection:go_default_library",
         "//vendor/k8s.io/apimachinery/pkg/util/sets:go_default_library",
diff --git a/pkg/api/v1/helper/helpers.go b/pkg/api/v1/helper/helpers.go
index 7a05c6da99..0e6a48d816 100644
--- a/pkg/api/v1/helper/helpers.go
+++ b/pkg/api/v1/helper/helpers.go
@@ -22,6 +22,7 @@ import (
 	"strings"
 
 	"k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
 	"k8s.io/apimachinery/pkg/labels"
 	"k8s.io/apimachinery/pkg/selection"
 	"k8s.io/apimachinery/pkg/util/sets"
@@ -41,6 +42,30 @@ func IsExtendedResourceName(name v1.ResourceName) bool {
 func IsDefaultNamespaceResource(name v1.ResourceName) bool {
 	return !strings.Contains(string(name), "/") ||
 		strings.Contains(string(name), v1.ResourceDefaultNamespacePrefix)
+}
+
+// IsHugePageResourceName returns true if the resource name has the huge page
+// resource prefix. Only the prefix is checked; the size suffix is not parsed.
+func IsHugePageResourceName(name v1.ResourceName) bool {
+	return strings.HasPrefix(string(name), v1.ResourceHugePagesPrefix)
+}
+
+// HugePageResourceName returns a ResourceName with the canonical hugepage
+// prefix prepended for the specified page size. The page size is converted
+// to its canonical representation.
+func HugePageResourceName(pageSize resource.Quantity) v1.ResourceName {
+	return v1.ResourceName(v1.ResourceHugePagesPrefix + pageSize.String())
+}
+
+// HugePageSizeFromResourceName returns the page size for the specified huge page
+// resource name. If the specified input is not a valid huge page resource name
+// an error is returned.
+func HugePageSizeFromResourceName(name v1.ResourceName) (resource.Quantity, error) {
+	if !IsHugePageResourceName(name) {
+		return resource.Quantity{}, fmt.Errorf("resource name: %s is not valid hugepage name", name)
+	}
+	pageSize := strings.TrimPrefix(string(name), v1.ResourceHugePagesPrefix)
+	return resource.ParseQuantity(pageSize)
 }
 
 // IsOpaqueIntResourceName returns true if the resource name has the opaque
diff --git a/pkg/api/v1/helper/qos/BUILD b/pkg/api/v1/helper/qos/BUILD
index e41f01c36b..65eaa353ab 100644
--- a/pkg/api/v1/helper/qos/BUILD
+++ b/pkg/api/v1/helper/qos/BUILD
@@ -24,6 +24,7 @@ go_library(
     name = "go_default_library",
     srcs = ["qos.go"],
     deps = [
+        "//pkg/api/v1/helper:go_default_library",
         "//vendor/k8s.io/api/core/v1:go_default_library",
         "//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library",
         "//vendor/k8s.io/apimachinery/pkg/util/sets:go_default_library",
diff --git a/pkg/api/v1/helper/qos/qos.go b/pkg/api/v1/helper/qos/qos.go
index c4cb17d224..aceee06880 100644
--- a/pkg/api/v1/helper/qos/qos.go
+++ b/pkg/api/v1/helper/qos/qos.go
@@ -20,6 +20,7 @@ import (
 	"k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
 	"k8s.io/apimachinery/pkg/util/sets"
+	v1helper "k8s.io/kubernetes/pkg/api/v1/helper"
 )
 
 // QOSList is a set of (resource name, QoS class) pairs.
@@ -27,6 +28,10 @@ type QOSList map[v1.ResourceName]v1.PodQOSClass
 
 var supportedQoSComputeResources = sets.NewString(string(v1.ResourceCPU), string(v1.ResourceMemory))
 
+func isSupportedQoSComputeResource(name v1.ResourceName) bool {
+	return supportedQoSComputeResources.Has(string(name)) || v1helper.IsHugePageResourceName(name)
+}
+
 // GetPodQOS returns the QoS class of a pod.
 // A pod is besteffort if none of its containers have specified any requests or limits.
 // A pod is guaranteed only when requests and limits are specified for all the containers and they are equal.
@@ -39,7 +44,7 @@ func GetPodQOS(pod *v1.Pod) v1.PodQOSClass {
 	for _, container := range pod.Spec.Containers {
 		// process requests
 		for name, quantity := range container.Resources.Requests {
-			if !supportedQoSComputeResources.Has(string(name)) {
+			if !isSupportedQoSComputeResource(name) {
 				continue
 			}
 			if quantity.Cmp(zeroQuantity) == 1 {
@@ -55,7 +60,7 @@ func GetPodQOS(pod *v1.Pod) v1.PodQOSClass {
 		// process limits
 		qosLimitsFound := sets.NewString()
 		for name, quantity := range container.Resources.Limits {
-			if !supportedQoSComputeResources.Has(string(name)) {
+			if !isSupportedQoSComputeResource(name) {
 				continue
 			}
 			if quantity.Cmp(zeroQuantity) == 1 {
diff --git a/pkg/api/v1/helper/qos/qos_test.go b/pkg/api/v1/helper/qos/qos_test.go
index 78cf94da3e..59c8bfad92 100644
--- a/pkg/api/v1/helper/qos/qos_test.go
+++ b/pkg/api/v1/helper/qos/qos_test.go
@@ -130,6 +130,12 @@ func TestGetPodQOS(t *testing.T) {
 			}),
 			expected: v1.PodQOSBurstable,
 		},
+		{
+			pod: newPod("burstable-hugepages", []v1.Container{
+				newContainer("burstable", getResourceList("0", "0"), addResource("hugepages-2Mi", "1Gi", getResourceList("0", "0"))),
+			}),
+			expected: v1.PodQOSBurstable,
+		},
 	}
 	for id, testCase := range testCases {
 		if actual := GetPodQOS(testCase.pod); testCase.expected != actual {
diff --git a/pkg/api/validation/validation.go b/pkg/api/validation/validation.go
index c484f12443..a14fc6841f 100644
--- a/pkg/api/validation/validation.go
+++ b/pkg/api/validation/validation.go
@@ -2402,6 +2402,31 @@ func ValidateTolerations(tolerations []api.Toleration, fldPath *field.Path) fiel
 	return allErrors
 }
 
+// toResourceNames returns the resource names that are keys of the given list.
+func toResourceNames(resources api.ResourceList) []api.ResourceName {
+	result := make([]api.ResourceName, 0, len(resources))
+	for resourceName := range resources {
+		result = append(result, resourceName)
+	}
+	return result
+}
+
+// toSet converts the given resource names into a set of their string forms.
+func toSet(resourceNames []api.ResourceName) sets.String {
+	result := sets.NewString()
+	for _, resourceName := range resourceNames {
+		result.Insert(string(resourceName))
+	}
+	return result
+}
+
+// toContainerResourcesSet returns the set of resource names the container requests or limits.
+func toContainerResourcesSet(ctr *api.Container) sets.String {
+	resourceNames := toResourceNames(ctr.Resources.Requests)
+	resourceNames = append(resourceNames, toResourceNames(ctr.Resources.Limits)...)
+	return toSet(resourceNames)
+}
+
 // validateContainersOnlyForPod does additional validation for containers on a pod versus a pod template
 // it only does additive validation of fields not covered in validateContainers
 func validateContainersOnlyForPod(containers []api.Container, fldPath *field.Path) field.ErrorList {
@@ -2429,6 +2454,21 @@ func ValidatePod(pod *api.Pod) field.ErrorList {
 	allErrs = append(allErrs, validateContainersOnlyForPod(pod.Spec.Containers, specPath.Child("containers"))...)
 	allErrs = append(allErrs, validateContainersOnlyForPod(pod.Spec.InitContainers, specPath.Child("initContainers"))...)
 
+	if utilfeature.DefaultFeatureGate.Enabled(features.HugePages) {
+		hugePageResources := sets.NewString()
+		for i := range pod.Spec.Containers {
+			resourceSet := toContainerResourcesSet(&pod.Spec.Containers[i])
+			for resourceStr := range resourceSet {
+				if helper.IsHugePageResourceName(api.ResourceName(resourceStr)) {
+					hugePageResources.Insert(resourceStr)
+				}
+			}
+		}
+		if len(hugePageResources) > 1 {
+			allErrs = append(allErrs, field.Invalid(specPath, hugePageResources.List(), "must use a single hugepage size in a pod spec"))
+		}
+	}
+
 	return allErrs
 }
 
@@ -3925,6 +3965,9 @@ func ValidateResourceRequirements(requirements *api.ResourceRequirements, fldPat
 		if resourceName == api.ResourceEphemeralStorage && !utilfeature.DefaultFeatureGate.Enabled(features.LocalStorageCapacityIsolation) {
 			allErrs = append(allErrs, field.Forbidden(limPath, "ResourceEphemeralStorage field disabled by feature-gate for ResourceRequirements"))
 		}
+		if helper.IsHugePageResourceName(resourceName) && !utilfeature.DefaultFeatureGate.Enabled(features.HugePages) {
+			allErrs = append(allErrs, field.Forbidden(limPath, fmt.Sprintf("%s field disabled by feature-gate for ResourceRequirements", resourceName)))
+		}
 	}
 	for resourceName, quantity := range requirements.Requests {
 		fldPath := reqPath.Key(string(resourceName))
diff --git a/pkg/api/validation/validation_test.go b/pkg/api/validation/validation_test.go
index 26b5aa061e..d631286ec0 100644
--- a/pkg/api/validation/validation_test.go
+++ b/pkg/api/validation/validation_test.go
@@ -2759,6 +2759,104 @@ func TestValidateVolumes(t *testing.T) {
 	}
 }
 
+func TestAlphaHugePagesIsolation(t *testing.T) {
+	successCases := []api.Pod{
+		{ // Basic fields.
+			ObjectMeta: metav1.ObjectMeta{Name: "123", Namespace: "ns"},
+			Spec: api.PodSpec{
+				Containers: []api.Container{
+					{
+						Name: "ctr", Image: "image", ImagePullPolicy: "IfNotPresent", TerminationMessagePolicy: "File",
+						Resources: api.ResourceRequirements{
+							Requests: api.ResourceList{
+								api.ResourceName("hugepages-2Mi"): resource.MustParse("1Gi"),
+							},
+							Limits: api.ResourceList{
+								api.ResourceName("hugepages-2Mi"): resource.MustParse("1Gi"),
+							},
+						},
+					},
+				},
+				RestartPolicy: api.RestartPolicyAlways,
+				DNSPolicy:     api.DNSClusterFirst,
+			},
+		},
+	}
+	failureCases := []api.Pod{
+		{ // Request and limit differ for the same hugepage size.
+			ObjectMeta: metav1.ObjectMeta{Name: "hugepages-shared", Namespace: "ns"},
+			Spec: api.PodSpec{
+				Containers: []api.Container{
+					{
+						Name: "ctr", Image: "image", ImagePullPolicy: "IfNotPresent", TerminationMessagePolicy: "File",
+						Resources: api.ResourceRequirements{
+							Requests: api.ResourceList{
+								api.ResourceName("hugepages-2Mi"): resource.MustParse("1Gi"),
+							},
+							Limits: api.ResourceList{
+								api.ResourceName("hugepages-2Mi"): resource.MustParse("2Gi"),
+							},
+						},
+					},
+				},
+				RestartPolicy: api.RestartPolicyAlways,
+				DNSPolicy:     api.DNSClusterFirst,
+			},
+		},
+		{ // More than one hugepage size used in a single pod.
+			ObjectMeta: metav1.ObjectMeta{Name: "hugepages-multiple", Namespace: "ns"},
+			Spec: api.PodSpec{
+				Containers: []api.Container{
+					{
+						Name: "ctr", Image: "image", ImagePullPolicy: "IfNotPresent", TerminationMessagePolicy: "File",
+						Resources: api.ResourceRequirements{
+							Requests: api.ResourceList{
+								api.ResourceName("hugepages-2Mi"): resource.MustParse("1Gi"),
+								api.ResourceName("hugepages-1Gi"): resource.MustParse("2Gi"),
+							},
+							Limits: api.ResourceList{
+								api.ResourceName("hugepages-2Mi"): resource.MustParse("1Gi"),
+								api.ResourceName("hugepages-1Gi"): resource.MustParse("2Gi"),
+							},
+						},
+					},
+				},
+				RestartPolicy: api.RestartPolicyAlways,
+				DNSPolicy:     api.DNSClusterFirst,
+			},
+		},
+	}
+	// Enable alpha feature HugePages
+	err := utilfeature.DefaultFeatureGate.Set("HugePages=true")
+	if err != nil {
+		t.Fatalf("Failed to enable feature gate for HugePages: %v", err)
+	}
+	for i := range successCases {
+		pod := &successCases[i]
+		if errs := ValidatePod(pod); len(errs) != 0 {
+			t.Errorf("Unexpected error for case[%d], err: %v", i, errs)
+		}
+	}
+	for i := range failureCases {
+		pod := &failureCases[i]
+		if errs := ValidatePod(pod); len(errs) == 0 {
+			t.Errorf("Expected error for case[%d], pod: %v", i, pod.Name)
+		}
+	}
+	// Disable alpha feature HugePages
+	err = utilfeature.DefaultFeatureGate.Set("HugePages=false")
+	if err != nil {
+		t.Fatalf("Failed to disable feature gate for HugePages: %v", err)
+	}
+	// With the feature gate disabled, every success case must now be rejected
+	for i := range successCases {
+		pod := &successCases[i]
+		if errs := ValidatePod(pod); len(errs) == 0 {
+			t.Errorf("Expected error for case[%d], pod: %v", i, pod.Name)
+		}
+	}
+}
+
 func TestAlphaLocalStorageCapacityIsolation(t *testing.T) {
 
 	testCases := []api.VolumeSource{
diff --git a/pkg/features/kube_features.go b/pkg/features/kube_features.go
index a83f5197d8..711588e80e 100644
--- a/pkg/features/kube_features.go
+++ b/pkg/features/kube_features.go
@@ -157,6 +157,12 @@ const (
 	//
 	// Alternative container-level CPU affinity policies.
 	CPUManager utilfeature.Feature = "CPUManager"
+
+	// owner: @derekwaynecarr
+	// alpha: v1.8
+	//
+	// Enable pods to consume pre-allocated huge pages of varying page sizes
+	HugePages utilfeature.Feature = "HugePages"
 )
 
 func init() {
@@ -180,6 +186,7 @@ var defaultKubernetesFeatureGates = map[utilfeature.Feature]utilfeature.FeatureS
 	RotateKubeletClientCertificate:              {Default: true, PreRelease: utilfeature.Beta},
 	PersistentLocalVolumes:                      {Default: false, PreRelease: utilfeature.Alpha},
 	LocalStorageCapacityIsolation:               {Default: false, PreRelease: utilfeature.Alpha},
+	HugePages:                                   {Default: false, PreRelease: utilfeature.Alpha},
 	DebugContainers:                             {Default: false, PreRelease: utilfeature.Alpha},
 	PodPriority:                                 {Default: false, PreRelease: utilfeature.Alpha},
 	EnableEquivalenceClassCache:                 {Default: false, PreRelease: utilfeature.Alpha},
diff --git a/staging/src/k8s.io/api/core/v1/types.go b/staging/src/k8s.io/api/core/v1/types.go
index 4520cc6d5b..391736885d 100644
--- a/staging/src/k8s.io/api/core/v1/types.go
+++ b/staging/src/k8s.io/api/core/v1/types.go
@@ -3759,6 +3759,8 @@ const (
 	ResourceOpaqueIntPrefix = "pod.alpha.kubernetes.io/opaque-int-resource-"
 	// Default namespace prefix.
 	ResourceDefaultNamespacePrefix = "kubernetes.io/"
+	// Name prefix for huge page resources (alpha).
+	ResourceHugePagesPrefix = "hugepages-"
 )
 
 // ResourceList is a set of (resource name, quantity) pairs.