mirror of https://github.com/k3s-io/k3s
Merge pull request #61498 from mindprince/delete-in-tree-gpu
Automatic merge from submit-queue (batch tested with PRs 61498, 62030). If you want to cherry-pick this change to another branch, please follow the instructions <a href="https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md">here</a>. Delete in-tree support for NVIDIA GPUs. This removes the alpha Accelerators feature gate which was deprecated in 1.10 (#57384). The alternative feature DevicePlugins went beta in 1.10 (#60170). Fixes #54012 ```release-note Support for "alpha.kubernetes.io/nvidia-gpu" resource which was deprecated in 1.10 is removed. Please use the resource exposed by DevicePlugins instead ("nvidia.com/gpu"). ```pull/8/head
commit
043204b1e5
|
@ -173,7 +173,6 @@ pkg/kubelet/dockershim/cm
|
||||||
pkg/kubelet/dockershim/libdocker
|
pkg/kubelet/dockershim/libdocker
|
||||||
pkg/kubelet/dockershim/testing
|
pkg/kubelet/dockershim/testing
|
||||||
pkg/kubelet/events
|
pkg/kubelet/events
|
||||||
pkg/kubelet/gpu
|
|
||||||
pkg/kubelet/images
|
pkg/kubelet/images
|
||||||
pkg/kubelet/kuberuntime
|
pkg/kubelet/kuberuntime
|
||||||
pkg/kubelet/leaky
|
pkg/kubelet/leaky
|
||||||
|
|
|
@ -172,14 +172,11 @@ func IsNativeResource(name core.ResourceName) bool {
|
||||||
strings.Contains(string(name), core.ResourceDefaultNamespacePrefix)
|
strings.Contains(string(name), core.ResourceDefaultNamespacePrefix)
|
||||||
}
|
}
|
||||||
|
|
||||||
var overcommitBlacklist = sets.NewString(string(core.ResourceNvidiaGPU))
|
|
||||||
|
|
||||||
// IsOvercommitAllowed returns true if the resource is in the default
|
// IsOvercommitAllowed returns true if the resource is in the default
|
||||||
// namespace and not blacklisted.
|
// namespace and is not hugepages.
|
||||||
func IsOvercommitAllowed(name core.ResourceName) bool {
|
func IsOvercommitAllowed(name core.ResourceName) bool {
|
||||||
return IsNativeResource(name) &&
|
return IsNativeResource(name) &&
|
||||||
!IsHugePageResourceName(name) &&
|
!IsHugePageResourceName(name)
|
||||||
!overcommitBlacklist.Has(string(name))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
var standardLimitRangeTypes = sets.NewString(
|
var standardLimitRangeTypes = sets.NewString(
|
||||||
|
|
|
@ -387,10 +387,6 @@ func TestIsOvercommitAllowed(t *testing.T) {
|
||||||
name: core.ResourceMemory,
|
name: core.ResourceMemory,
|
||||||
allowed: true,
|
allowed: true,
|
||||||
},
|
},
|
||||||
{
|
|
||||||
name: core.ResourceNvidiaGPU,
|
|
||||||
allowed: false,
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
name: HugePageResourceName(resource.MustParse("2Mi")),
|
name: HugePageResourceName(resource.MustParse("2Mi")),
|
||||||
allowed: false,
|
allowed: false,
|
||||||
|
|
|
@ -47,13 +47,6 @@ func (self *ResourceList) Pods() *resource.Quantity {
|
||||||
return &resource.Quantity{}
|
return &resource.Quantity{}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (self *ResourceList) NvidiaGPU() *resource.Quantity {
|
|
||||||
if val, ok := (*self)[ResourceNvidiaGPU]; ok {
|
|
||||||
return &val
|
|
||||||
}
|
|
||||||
return &resource.Quantity{}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (self *ResourceList) StorageEphemeral() *resource.Quantity {
|
func (self *ResourceList) StorageEphemeral() *resource.Quantity {
|
||||||
if val, ok := (*self)[ResourceEphemeralStorage]; ok {
|
if val, ok := (*self)[ResourceEphemeralStorage]; ok {
|
||||||
return &val
|
return &val
|
||||||
|
|
|
@ -3641,8 +3641,6 @@ const (
|
||||||
// Local ephemeral storage, in bytes. (500Gi = 500GiB = 500 * 1024 * 1024 * 1024)
|
// Local ephemeral storage, in bytes. (500Gi = 500GiB = 500 * 1024 * 1024 * 1024)
|
||||||
// The resource name for ResourceEphemeralStorage is alpha and it can change across releases.
|
// The resource name for ResourceEphemeralStorage is alpha and it can change across releases.
|
||||||
ResourceEphemeralStorage ResourceName = "ephemeral-storage"
|
ResourceEphemeralStorage ResourceName = "ephemeral-storage"
|
||||||
// NVIDIA GPU, in devices. Alpha, might change: although fractional and allowing values >1, only one whole device per node is assigned.
|
|
||||||
ResourceNvidiaGPU ResourceName = "alpha.kubernetes.io/nvidia-gpu"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
|
|
|
@ -29,7 +29,6 @@ go_library(
|
||||||
"//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library",
|
"//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library",
|
||||||
"//vendor/k8s.io/apimachinery/pkg/labels:go_default_library",
|
"//vendor/k8s.io/apimachinery/pkg/labels:go_default_library",
|
||||||
"//vendor/k8s.io/apimachinery/pkg/selection:go_default_library",
|
"//vendor/k8s.io/apimachinery/pkg/selection:go_default_library",
|
||||||
"//vendor/k8s.io/apimachinery/pkg/util/sets:go_default_library",
|
|
||||||
"//vendor/k8s.io/apimachinery/pkg/util/validation:go_default_library",
|
"//vendor/k8s.io/apimachinery/pkg/util/validation:go_default_library",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
|
@ -25,7 +25,6 @@ import (
|
||||||
"k8s.io/apimachinery/pkg/api/resource"
|
"k8s.io/apimachinery/pkg/api/resource"
|
||||||
"k8s.io/apimachinery/pkg/labels"
|
"k8s.io/apimachinery/pkg/labels"
|
||||||
"k8s.io/apimachinery/pkg/selection"
|
"k8s.io/apimachinery/pkg/selection"
|
||||||
"k8s.io/apimachinery/pkg/util/sets"
|
|
||||||
"k8s.io/apimachinery/pkg/util/validation"
|
"k8s.io/apimachinery/pkg/util/validation"
|
||||||
"k8s.io/kubernetes/pkg/apis/core/helper"
|
"k8s.io/kubernetes/pkg/apis/core/helper"
|
||||||
)
|
)
|
||||||
|
@ -85,14 +84,11 @@ func HugePageSizeFromResourceName(name v1.ResourceName) (resource.Quantity, erro
|
||||||
return resource.ParseQuantity(pageSize)
|
return resource.ParseQuantity(pageSize)
|
||||||
}
|
}
|
||||||
|
|
||||||
var overcommitBlacklist = sets.NewString(string(v1.ResourceNvidiaGPU))
|
|
||||||
|
|
||||||
// IsOvercommitAllowed returns true if the resource is in the default
|
// IsOvercommitAllowed returns true if the resource is in the default
|
||||||
// namespace and not blacklisted and is not hugepages.
|
// namespace and is not hugepages.
|
||||||
func IsOvercommitAllowed(name v1.ResourceName) bool {
|
func IsOvercommitAllowed(name v1.ResourceName) bool {
|
||||||
return IsNativeResource(name) &&
|
return IsNativeResource(name) &&
|
||||||
!IsHugePageResourceName(name) &&
|
!IsHugePageResourceName(name)
|
||||||
!overcommitBlacklist.Has(string(name))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Extended and Hugepages resources
|
// Extended and Hugepages resources
|
||||||
|
|
|
@ -125,10 +125,6 @@ func TestIsOvercommitAllowed(t *testing.T) {
|
||||||
resourceName: "kubernetes.io/resource-foo",
|
resourceName: "kubernetes.io/resource-foo",
|
||||||
expectVal: true,
|
expectVal: true,
|
||||||
},
|
},
|
||||||
{
|
|
||||||
resourceName: "alpha.kubernetes.io/nvidia-gpu",
|
|
||||||
expectVal: false,
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
resourceName: "hugepages-100m",
|
resourceName: "hugepages-100m",
|
||||||
expectVal: false,
|
expectVal: false,
|
||||||
|
|
|
@ -38,12 +38,6 @@ func TestGetPodQOS(t *testing.T) {
|
||||||
}),
|
}),
|
||||||
expected: v1.PodQOSGuaranteed,
|
expected: v1.PodQOSGuaranteed,
|
||||||
},
|
},
|
||||||
{
|
|
||||||
pod: newPod("guaranteed-with-gpu", []v1.Container{
|
|
||||||
newContainer("guaranteed", getResourceList("100m", "100Mi"), addResource("nvidia-gpu", "2", getResourceList("100m", "100Mi"))),
|
|
||||||
}),
|
|
||||||
expected: v1.PodQOSGuaranteed,
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
pod: newPod("guaranteed-guaranteed", []v1.Container{
|
pod: newPod("guaranteed-guaranteed", []v1.Container{
|
||||||
newContainer("guaranteed", getResourceList("100m", "100Mi"), getResourceList("100m", "100Mi")),
|
newContainer("guaranteed", getResourceList("100m", "100Mi"), getResourceList("100m", "100Mi")),
|
||||||
|
@ -51,13 +45,6 @@ func TestGetPodQOS(t *testing.T) {
|
||||||
}),
|
}),
|
||||||
expected: v1.PodQOSGuaranteed,
|
expected: v1.PodQOSGuaranteed,
|
||||||
},
|
},
|
||||||
{
|
|
||||||
pod: newPod("guaranteed-guaranteed-with-gpu", []v1.Container{
|
|
||||||
newContainer("guaranteed", getResourceList("100m", "100Mi"), addResource("nvidia-gpu", "2", getResourceList("100m", "100Mi"))),
|
|
||||||
newContainer("guaranteed", getResourceList("100m", "100Mi"), getResourceList("100m", "100Mi")),
|
|
||||||
}),
|
|
||||||
expected: v1.PodQOSGuaranteed,
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
pod: newPod("best-effort-best-effort", []v1.Container{
|
pod: newPod("best-effort-best-effort", []v1.Container{
|
||||||
newContainer("best-effort", getResourceList("", ""), getResourceList("", "")),
|
newContainer("best-effort", getResourceList("", ""), getResourceList("", "")),
|
||||||
|
@ -71,29 +58,16 @@ func TestGetPodQOS(t *testing.T) {
|
||||||
}),
|
}),
|
||||||
expected: v1.PodQOSBestEffort,
|
expected: v1.PodQOSBestEffort,
|
||||||
},
|
},
|
||||||
{
|
|
||||||
pod: newPod("best-effort-best-effort-with-gpu", []v1.Container{
|
|
||||||
newContainer("best-effort", getResourceList("", ""), addResource("nvidia-gpu", "2", getResourceList("", ""))),
|
|
||||||
newContainer("best-effort", getResourceList("", ""), getResourceList("", "")),
|
|
||||||
}),
|
|
||||||
expected: v1.PodQOSBestEffort,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
pod: newPod("best-effort-with-gpu", []v1.Container{
|
|
||||||
newContainer("best-effort", getResourceList("", ""), addResource("nvidia-gpu", "2", getResourceList("", ""))),
|
|
||||||
}),
|
|
||||||
expected: v1.PodQOSBestEffort,
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
pod: newPod("best-effort-burstable", []v1.Container{
|
pod: newPod("best-effort-burstable", []v1.Container{
|
||||||
newContainer("best-effort", getResourceList("", ""), addResource("nvidia-gpu", "2", getResourceList("", ""))),
|
newContainer("best-effort", getResourceList("", ""), getResourceList("", "")),
|
||||||
newContainer("burstable", getResourceList("1", ""), getResourceList("2", "")),
|
newContainer("burstable", getResourceList("1", ""), getResourceList("2", "")),
|
||||||
}),
|
}),
|
||||||
expected: v1.PodQOSBurstable,
|
expected: v1.PodQOSBurstable,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
pod: newPod("best-effort-guaranteed", []v1.Container{
|
pod: newPod("best-effort-guaranteed", []v1.Container{
|
||||||
newContainer("best-effort", getResourceList("", ""), addResource("nvidia-gpu", "2", getResourceList("", ""))),
|
newContainer("best-effort", getResourceList("", ""), getResourceList("", "")),
|
||||||
newContainer("guaranteed", getResourceList("10m", "100Mi"), getResourceList("10m", "100Mi")),
|
newContainer("guaranteed", getResourceList("10m", "100Mi"), getResourceList("10m", "100Mi")),
|
||||||
}),
|
}),
|
||||||
expected: v1.PodQOSBurstable,
|
expected: v1.PodQOSBurstable,
|
||||||
|
@ -132,7 +106,7 @@ func TestGetPodQOS(t *testing.T) {
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
pod: newPod("burstable-2", []v1.Container{
|
pod: newPod("burstable-2", []v1.Container{
|
||||||
newContainer("burstable", getResourceList("0", "0"), addResource("nvidia-gpu", "2", getResourceList("100m", "200Mi"))),
|
newContainer("burstable", getResourceList("0", "0"), getResourceList("100m", "200Mi")),
|
||||||
}),
|
}),
|
||||||
expected: v1.PodQOSBurstable,
|
expected: v1.PodQOSBurstable,
|
||||||
},
|
},
|
||||||
|
|
|
@ -61,8 +61,6 @@ func ValidateResourceRequirements(requirements *v1.ResourceRequirements, fldPath
|
||||||
} else if quantity.Cmp(limitQuantity) > 0 {
|
} else if quantity.Cmp(limitQuantity) > 0 {
|
||||||
allErrs = append(allErrs, field.Invalid(reqPath, quantity.String(), fmt.Sprintf("must be less than or equal to %s limit", resourceName)))
|
allErrs = append(allErrs, field.Invalid(reqPath, quantity.String(), fmt.Sprintf("must be less than or equal to %s limit", resourceName)))
|
||||||
}
|
}
|
||||||
} else if resourceName == v1.ResourceNvidiaGPU {
|
|
||||||
allErrs = append(allErrs, field.Invalid(reqPath, quantity.String(), fmt.Sprintf("must be equal to %s request", v1.ResourceNvidiaGPU)))
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -32,36 +32,15 @@ func TestValidateResourceRequirements(t *testing.T) {
|
||||||
requirements v1.ResourceRequirements
|
requirements v1.ResourceRequirements
|
||||||
}{
|
}{
|
||||||
{
|
{
|
||||||
Name: "GPU only setting Limits",
|
Name: "Resources with Requests equal to Limits",
|
||||||
requirements: v1.ResourceRequirements{
|
|
||||||
Limits: v1.ResourceList{
|
|
||||||
v1.ResourceName(v1.ResourceNvidiaGPU): resource.MustParse("10"),
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
{
|
|
||||||
Name: "GPU setting Limits equals Requests",
|
|
||||||
requirements: v1.ResourceRequirements{
|
|
||||||
Limits: v1.ResourceList{
|
|
||||||
v1.ResourceName(v1.ResourceNvidiaGPU): resource.MustParse("10"),
|
|
||||||
},
|
|
||||||
Requests: v1.ResourceList{
|
|
||||||
v1.ResourceName(v1.ResourceNvidiaGPU): resource.MustParse("10"),
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
{
|
|
||||||
Name: "Resources with GPU with Requests",
|
|
||||||
requirements: v1.ResourceRequirements{
|
requirements: v1.ResourceRequirements{
|
||||||
Requests: v1.ResourceList{
|
Requests: v1.ResourceList{
|
||||||
v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"),
|
v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"),
|
||||||
v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"),
|
v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"),
|
||||||
v1.ResourceName(v1.ResourceNvidiaGPU): resource.MustParse("1"),
|
|
||||||
},
|
},
|
||||||
Limits: v1.ResourceList{
|
Limits: v1.ResourceList{
|
||||||
v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"),
|
v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"),
|
||||||
v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"),
|
v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"),
|
||||||
v1.ResourceName(v1.ResourceNvidiaGPU): resource.MustParse("1"),
|
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
@ -111,36 +90,6 @@ func TestValidateResourceRequirements(t *testing.T) {
|
||||||
Name string
|
Name string
|
||||||
requirements v1.ResourceRequirements
|
requirements v1.ResourceRequirements
|
||||||
}{
|
}{
|
||||||
{
|
|
||||||
Name: "GPU only setting Requests",
|
|
||||||
requirements: v1.ResourceRequirements{
|
|
||||||
Requests: v1.ResourceList{
|
|
||||||
v1.ResourceName(v1.ResourceNvidiaGPU): resource.MustParse("10"),
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
{
|
|
||||||
Name: "GPU setting Limits less than Requests",
|
|
||||||
requirements: v1.ResourceRequirements{
|
|
||||||
Limits: v1.ResourceList{
|
|
||||||
v1.ResourceName(v1.ResourceNvidiaGPU): resource.MustParse("10"),
|
|
||||||
},
|
|
||||||
Requests: v1.ResourceList{
|
|
||||||
v1.ResourceName(v1.ResourceNvidiaGPU): resource.MustParse("11"),
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
{
|
|
||||||
Name: "GPU setting Limits larger than Requests",
|
|
||||||
requirements: v1.ResourceRequirements{
|
|
||||||
Limits: v1.ResourceList{
|
|
||||||
v1.ResourceName(v1.ResourceNvidiaGPU): resource.MustParse("10"),
|
|
||||||
},
|
|
||||||
Requests: v1.ResourceList{
|
|
||||||
v1.ResourceName(v1.ResourceNvidiaGPU): resource.MustParse("9"),
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
Name: "Resources with Requests Larger Than Limits",
|
Name: "Resources with Requests Larger Than Limits",
|
||||||
requirements: v1.ResourceRequirements{
|
requirements: v1.ResourceRequirements{
|
||||||
|
|
|
@ -5042,25 +5042,7 @@ func TestValidateContainers(t *testing.T) {
|
||||||
TerminationMessagePolicy: "File",
|
TerminationMessagePolicy: "File",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
Name: "resources-test-with-gpu-with-request",
|
Name: "resources-test-with-request-and-limit",
|
||||||
Image: "image",
|
|
||||||
Resources: core.ResourceRequirements{
|
|
||||||
Requests: core.ResourceList{
|
|
||||||
core.ResourceName(core.ResourceCPU): resource.MustParse("10"),
|
|
||||||
core.ResourceName(core.ResourceMemory): resource.MustParse("10G"),
|
|
||||||
core.ResourceName(core.ResourceNvidiaGPU): resource.MustParse("1"),
|
|
||||||
},
|
|
||||||
Limits: core.ResourceList{
|
|
||||||
core.ResourceName(core.ResourceCPU): resource.MustParse("10"),
|
|
||||||
core.ResourceName(core.ResourceMemory): resource.MustParse("10G"),
|
|
||||||
core.ResourceName(core.ResourceNvidiaGPU): resource.MustParse("1"),
|
|
||||||
},
|
|
||||||
},
|
|
||||||
ImagePullPolicy: "IfNotPresent",
|
|
||||||
TerminationMessagePolicy: "File",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
Name: "resources-test-with-gpu-without-request",
|
|
||||||
Image: "image",
|
Image: "image",
|
||||||
Resources: core.ResourceRequirements{
|
Resources: core.ResourceRequirements{
|
||||||
Requests: core.ResourceList{
|
Requests: core.ResourceList{
|
||||||
|
@ -5068,9 +5050,8 @@ func TestValidateContainers(t *testing.T) {
|
||||||
core.ResourceName(core.ResourceMemory): resource.MustParse("10G"),
|
core.ResourceName(core.ResourceMemory): resource.MustParse("10G"),
|
||||||
},
|
},
|
||||||
Limits: core.ResourceList{
|
Limits: core.ResourceList{
|
||||||
core.ResourceName(core.ResourceCPU): resource.MustParse("10"),
|
core.ResourceName(core.ResourceCPU): resource.MustParse("10"),
|
||||||
core.ResourceName(core.ResourceMemory): resource.MustParse("10G"),
|
core.ResourceName(core.ResourceMemory): resource.MustParse("10G"),
|
||||||
core.ResourceName(core.ResourceNvidiaGPU): resource.MustParse("1"),
|
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
ImagePullPolicy: "IfNotPresent",
|
ImagePullPolicy: "IfNotPresent",
|
||||||
|
@ -5359,41 +5340,6 @@ func TestValidateContainers(t *testing.T) {
|
||||||
TerminationMessagePolicy: "File",
|
TerminationMessagePolicy: "File",
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
"Resource GPU limit must match request": {
|
|
||||||
{
|
|
||||||
Name: "gpu-resource-request-limit",
|
|
||||||
Image: "image",
|
|
||||||
Resources: core.ResourceRequirements{
|
|
||||||
Requests: core.ResourceList{
|
|
||||||
core.ResourceName(core.ResourceCPU): resource.MustParse("10"),
|
|
||||||
core.ResourceName(core.ResourceMemory): resource.MustParse("10G"),
|
|
||||||
core.ResourceName(core.ResourceNvidiaGPU): resource.MustParse("0"),
|
|
||||||
},
|
|
||||||
Limits: core.ResourceList{
|
|
||||||
core.ResourceName(core.ResourceCPU): resource.MustParse("10"),
|
|
||||||
core.ResourceName(core.ResourceMemory): resource.MustParse("10G"),
|
|
||||||
core.ResourceName(core.ResourceNvidiaGPU): resource.MustParse("1"),
|
|
||||||
},
|
|
||||||
},
|
|
||||||
TerminationMessagePolicy: "File",
|
|
||||||
ImagePullPolicy: "IfNotPresent",
|
|
||||||
},
|
|
||||||
},
|
|
||||||
"Resource GPU invalid setting only request": {
|
|
||||||
{
|
|
||||||
Name: "gpu-resource-request-limit",
|
|
||||||
Image: "image",
|
|
||||||
Resources: core.ResourceRequirements{
|
|
||||||
Requests: core.ResourceList{
|
|
||||||
core.ResourceName(core.ResourceCPU): resource.MustParse("10"),
|
|
||||||
core.ResourceName(core.ResourceMemory): resource.MustParse("10G"),
|
|
||||||
core.ResourceName(core.ResourceNvidiaGPU): resource.MustParse("1"),
|
|
||||||
},
|
|
||||||
},
|
|
||||||
TerminationMessagePolicy: "File",
|
|
||||||
ImagePullPolicy: "IfNotPresent",
|
|
||||||
},
|
|
||||||
},
|
|
||||||
"Request limit simple invalid": {
|
"Request limit simple invalid": {
|
||||||
{
|
{
|
||||||
Name: "abc-123",
|
Name: "abc-123",
|
||||||
|
|
|
@ -53,16 +53,6 @@ const (
|
||||||
// Note: This feature is not supported for `BestEffort` pods.
|
// Note: This feature is not supported for `BestEffort` pods.
|
||||||
ExperimentalCriticalPodAnnotation utilfeature.Feature = "ExperimentalCriticalPodAnnotation"
|
ExperimentalCriticalPodAnnotation utilfeature.Feature = "ExperimentalCriticalPodAnnotation"
|
||||||
|
|
||||||
// owner: @vishh
|
|
||||||
// alpha: v1.6
|
|
||||||
//
|
|
||||||
// This is deprecated and will be removed in v1.11. Use DevicePlugins instead.
|
|
||||||
//
|
|
||||||
// Enables support for GPUs as a schedulable resource.
|
|
||||||
// Only Nvidia GPUs are supported as of v1.6.
|
|
||||||
// Works only with Docker Container Runtime.
|
|
||||||
Accelerators utilfeature.Feature = "Accelerators"
|
|
||||||
|
|
||||||
// owner: @jiayingz
|
// owner: @jiayingz
|
||||||
// beta: v1.10
|
// beta: v1.10
|
||||||
//
|
//
|
||||||
|
@ -296,7 +286,6 @@ var defaultKubernetesFeatureGates = map[utilfeature.Feature]utilfeature.FeatureS
|
||||||
DynamicKubeletConfig: {Default: false, PreRelease: utilfeature.Alpha},
|
DynamicKubeletConfig: {Default: false, PreRelease: utilfeature.Alpha},
|
||||||
ExperimentalHostUserNamespaceDefaultingGate: {Default: false, PreRelease: utilfeature.Beta},
|
ExperimentalHostUserNamespaceDefaultingGate: {Default: false, PreRelease: utilfeature.Beta},
|
||||||
ExperimentalCriticalPodAnnotation: {Default: false, PreRelease: utilfeature.Alpha},
|
ExperimentalCriticalPodAnnotation: {Default: false, PreRelease: utilfeature.Alpha},
|
||||||
Accelerators: {Default: false, PreRelease: utilfeature.Alpha},
|
|
||||||
DevicePlugins: {Default: true, PreRelease: utilfeature.Beta},
|
DevicePlugins: {Default: true, PreRelease: utilfeature.Beta},
|
||||||
TaintBasedEvictions: {Default: false, PreRelease: utilfeature.Alpha},
|
TaintBasedEvictions: {Default: false, PreRelease: utilfeature.Alpha},
|
||||||
RotateKubeletServerCertificate: {Default: false, PreRelease: utilfeature.Alpha},
|
RotateKubeletServerCertificate: {Default: false, PreRelease: utilfeature.Alpha},
|
||||||
|
|
|
@ -55,8 +55,6 @@ go_library(
|
||||||
"//pkg/kubelet/envvars:go_default_library",
|
"//pkg/kubelet/envvars:go_default_library",
|
||||||
"//pkg/kubelet/events:go_default_library",
|
"//pkg/kubelet/events:go_default_library",
|
||||||
"//pkg/kubelet/eviction:go_default_library",
|
"//pkg/kubelet/eviction:go_default_library",
|
||||||
"//pkg/kubelet/gpu:go_default_library",
|
|
||||||
"//pkg/kubelet/gpu/nvidia:go_default_library",
|
|
||||||
"//pkg/kubelet/images:go_default_library",
|
"//pkg/kubelet/images:go_default_library",
|
||||||
"//pkg/kubelet/kubeletconfig:go_default_library",
|
"//pkg/kubelet/kubeletconfig:go_default_library",
|
||||||
"//pkg/kubelet/kuberuntime:go_default_library",
|
"//pkg/kubelet/kuberuntime:go_default_library",
|
||||||
|
@ -179,7 +177,6 @@ go_test(
|
||||||
"//pkg/kubelet/container:go_default_library",
|
"//pkg/kubelet/container:go_default_library",
|
||||||
"//pkg/kubelet/container/testing:go_default_library",
|
"//pkg/kubelet/container/testing:go_default_library",
|
||||||
"//pkg/kubelet/eviction:go_default_library",
|
"//pkg/kubelet/eviction:go_default_library",
|
||||||
"//pkg/kubelet/gpu:go_default_library",
|
|
||||||
"//pkg/kubelet/images:go_default_library",
|
"//pkg/kubelet/images:go_default_library",
|
||||||
"//pkg/kubelet/lifecycle:go_default_library",
|
"//pkg/kubelet/lifecycle:go_default_library",
|
||||||
"//pkg/kubelet/logs:go_default_library",
|
"//pkg/kubelet/logs:go_default_library",
|
||||||
|
@ -264,7 +261,6 @@ filegroup(
|
||||||
"//pkg/kubelet/envvars:all-srcs",
|
"//pkg/kubelet/envvars:all-srcs",
|
||||||
"//pkg/kubelet/events:all-srcs",
|
"//pkg/kubelet/events:all-srcs",
|
||||||
"//pkg/kubelet/eviction:all-srcs",
|
"//pkg/kubelet/eviction:all-srcs",
|
||||||
"//pkg/kubelet/gpu:all-srcs",
|
|
||||||
"//pkg/kubelet/images:all-srcs",
|
"//pkg/kubelet/images:all-srcs",
|
||||||
"//pkg/kubelet/kubeletconfig:all-srcs",
|
"//pkg/kubelet/kubeletconfig:all-srcs",
|
||||||
"//pkg/kubelet/kuberuntime:all-srcs",
|
"//pkg/kubelet/kuberuntime:all-srcs",
|
||||||
|
|
|
@ -1,32 +0,0 @@
|
||||||
package(default_visibility = ["//visibility:public"])
|
|
||||||
|
|
||||||
load(
|
|
||||||
"@io_bazel_rules_go//go:def.bzl",
|
|
||||||
"go_library",
|
|
||||||
)
|
|
||||||
|
|
||||||
go_library(
|
|
||||||
name = "go_default_library",
|
|
||||||
srcs = [
|
|
||||||
"gpu_manager_stub.go",
|
|
||||||
"types.go",
|
|
||||||
],
|
|
||||||
importpath = "k8s.io/kubernetes/pkg/kubelet/gpu",
|
|
||||||
deps = ["//vendor/k8s.io/api/core/v1:go_default_library"],
|
|
||||||
)
|
|
||||||
|
|
||||||
filegroup(
|
|
||||||
name = "package-srcs",
|
|
||||||
srcs = glob(["**"]),
|
|
||||||
tags = ["automanaged"],
|
|
||||||
visibility = ["//visibility:private"],
|
|
||||||
)
|
|
||||||
|
|
||||||
filegroup(
|
|
||||||
name = "all-srcs",
|
|
||||||
srcs = [
|
|
||||||
":package-srcs",
|
|
||||||
"//pkg/kubelet/gpu/nvidia:all-srcs",
|
|
||||||
],
|
|
||||||
tags = ["automanaged"],
|
|
||||||
)
|
|
|
@ -1,12 +0,0 @@
|
||||||
approvers:
|
|
||||||
- dchen1107
|
|
||||||
- derekwaynecarr
|
|
||||||
- vishh
|
|
||||||
- yujuhong
|
|
||||||
reviewers:
|
|
||||||
- cmluciano
|
|
||||||
- jiayingz
|
|
||||||
- mindprince
|
|
||||||
- RenaudWasTaken
|
|
||||||
- vishh
|
|
||||||
- sig-node-reviewers
|
|
|
@ -1,41 +0,0 @@
|
||||||
/*
|
|
||||||
Copyright 2017 The Kubernetes Authors.
|
|
||||||
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
you may not use this file except in compliance with the License.
|
|
||||||
You may obtain a copy of the License at
|
|
||||||
|
|
||||||
http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
|
|
||||||
Unless required by applicable law or agreed to in writing, software
|
|
||||||
distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
See the License for the specific language governing permissions and
|
|
||||||
limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package gpu
|
|
||||||
|
|
||||||
import (
|
|
||||||
"fmt"
|
|
||||||
|
|
||||||
"k8s.io/api/core/v1"
|
|
||||||
)
|
|
||||||
|
|
||||||
type gpuManagerStub struct{}
|
|
||||||
|
|
||||||
func (gms *gpuManagerStub) Start() error {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (gms *gpuManagerStub) Capacity() v1.ResourceList {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (gms *gpuManagerStub) AllocateGPU(_ *v1.Pod, _ *v1.Container) ([]string, error) {
|
|
||||||
return nil, fmt.Errorf("GPUs are not supported")
|
|
||||||
}
|
|
||||||
|
|
||||||
func NewGPUManagerStub() GPUManager {
|
|
||||||
return &gpuManagerStub{}
|
|
||||||
}
|
|
|
@ -1,54 +0,0 @@
|
||||||
package(default_visibility = ["//visibility:public"])
|
|
||||||
|
|
||||||
load(
|
|
||||||
"@io_bazel_rules_go//go:def.bzl",
|
|
||||||
"go_library",
|
|
||||||
"go_test",
|
|
||||||
)
|
|
||||||
|
|
||||||
go_library(
|
|
||||||
name = "go_default_library",
|
|
||||||
srcs = [
|
|
||||||
"helpers.go",
|
|
||||||
"nvidia_gpu_manager.go",
|
|
||||||
],
|
|
||||||
importpath = "k8s.io/kubernetes/pkg/kubelet/gpu/nvidia",
|
|
||||||
deps = [
|
|
||||||
"//pkg/kubelet/dockershim:go_default_library",
|
|
||||||
"//pkg/kubelet/dockershim/libdocker:go_default_library",
|
|
||||||
"//pkg/kubelet/gpu:go_default_library",
|
|
||||||
"//vendor/github.com/golang/glog:go_default_library",
|
|
||||||
"//vendor/k8s.io/api/core/v1:go_default_library",
|
|
||||||
"//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library",
|
|
||||||
"//vendor/k8s.io/apimachinery/pkg/util/sets:go_default_library",
|
|
||||||
],
|
|
||||||
)
|
|
||||||
|
|
||||||
filegroup(
|
|
||||||
name = "package-srcs",
|
|
||||||
srcs = glob(["**"]),
|
|
||||||
tags = ["automanaged"],
|
|
||||||
visibility = ["//visibility:private"],
|
|
||||||
)
|
|
||||||
|
|
||||||
filegroup(
|
|
||||||
name = "all-srcs",
|
|
||||||
srcs = [":package-srcs"],
|
|
||||||
tags = ["automanaged"],
|
|
||||||
)
|
|
||||||
|
|
||||||
go_test(
|
|
||||||
name = "go_default_test",
|
|
||||||
srcs = ["nvidia_gpu_manager_test.go"],
|
|
||||||
embed = [":go_default_library"],
|
|
||||||
deps = [
|
|
||||||
"//pkg/kubelet/dockershim:go_default_library",
|
|
||||||
"//pkg/kubelet/dockershim/libdocker:go_default_library",
|
|
||||||
"//vendor/github.com/stretchr/testify/assert:go_default_library",
|
|
||||||
"//vendor/k8s.io/api/core/v1:go_default_library",
|
|
||||||
"//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library",
|
|
||||||
"//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
|
|
||||||
"//vendor/k8s.io/apimachinery/pkg/util/sets:go_default_library",
|
|
||||||
"//vendor/k8s.io/apimachinery/pkg/util/uuid:go_default_library",
|
|
||||||
],
|
|
||||||
)
|
|
|
@ -1,77 +0,0 @@
|
||||||
/*
|
|
||||||
Copyright 2017 The Kubernetes Authors.
|
|
||||||
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
you may not use this file except in compliance with the License.
|
|
||||||
You may obtain a copy of the License at
|
|
||||||
|
|
||||||
http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
|
|
||||||
Unless required by applicable law or agreed to in writing, software
|
|
||||||
distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
See the License for the specific language governing permissions and
|
|
||||||
limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package nvidia
|
|
||||||
|
|
||||||
import "k8s.io/apimachinery/pkg/util/sets"
|
|
||||||
|
|
||||||
type containerToGPU map[string]sets.String
|
|
||||||
|
|
||||||
// podGPUs represents a list of pod to GPU mappings.
|
|
||||||
type podGPUs struct {
|
|
||||||
podGPUMapping map[string]containerToGPU
|
|
||||||
}
|
|
||||||
|
|
||||||
func newPodGPUs() *podGPUs {
|
|
||||||
return &podGPUs{
|
|
||||||
podGPUMapping: make(map[string]containerToGPU),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
func (pgpu *podGPUs) pods() sets.String {
|
|
||||||
ret := sets.NewString()
|
|
||||||
for k := range pgpu.podGPUMapping {
|
|
||||||
ret.Insert(k)
|
|
||||||
}
|
|
||||||
return ret
|
|
||||||
}
|
|
||||||
|
|
||||||
func (pgpu *podGPUs) insert(podUID, contName string, device string) {
|
|
||||||
if _, exists := pgpu.podGPUMapping[podUID]; !exists {
|
|
||||||
pgpu.podGPUMapping[podUID] = make(containerToGPU)
|
|
||||||
}
|
|
||||||
if _, exists := pgpu.podGPUMapping[podUID][contName]; !exists {
|
|
||||||
pgpu.podGPUMapping[podUID][contName] = sets.NewString()
|
|
||||||
}
|
|
||||||
pgpu.podGPUMapping[podUID][contName].Insert(device)
|
|
||||||
}
|
|
||||||
|
|
||||||
func (pgpu *podGPUs) getGPUs(podUID, contName string) sets.String {
|
|
||||||
containers, exists := pgpu.podGPUMapping[podUID]
|
|
||||||
if !exists {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
devices, exists := containers[contName]
|
|
||||||
if !exists {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
return devices
|
|
||||||
}
|
|
||||||
|
|
||||||
func (pgpu *podGPUs) delete(pods []string) {
|
|
||||||
for _, uid := range pods {
|
|
||||||
delete(pgpu.podGPUMapping, uid)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (pgpu *podGPUs) devices() sets.String {
|
|
||||||
ret := sets.NewString()
|
|
||||||
for _, containerToGPU := range pgpu.podGPUMapping {
|
|
||||||
for _, deviceSet := range containerToGPU {
|
|
||||||
ret = ret.Union(deviceSet)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return ret
|
|
||||||
}
|
|
|
@ -1,280 +0,0 @@
|
||||||
/*
|
|
||||||
Copyright 2017 The Kubernetes Authors.
|
|
||||||
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
you may not use this file except in compliance with the License.
|
|
||||||
You may obtain a copy of the License at
|
|
||||||
|
|
||||||
http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
|
|
||||||
Unless required by applicable law or agreed to in writing, software
|
|
||||||
distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
See the License for the specific language governing permissions and
|
|
||||||
limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package nvidia
|
|
||||||
|
|
||||||
import (
|
|
||||||
"fmt"
|
|
||||||
"io/ioutil"
|
|
||||||
"os"
|
|
||||||
"path"
|
|
||||||
"regexp"
|
|
||||||
"strings"
|
|
||||||
"sync"
|
|
||||||
|
|
||||||
"github.com/golang/glog"
|
|
||||||
|
|
||||||
"k8s.io/api/core/v1"
|
|
||||||
"k8s.io/apimachinery/pkg/api/resource"
|
|
||||||
"k8s.io/apimachinery/pkg/util/sets"
|
|
||||||
"k8s.io/kubernetes/pkg/kubelet/dockershim"
|
|
||||||
"k8s.io/kubernetes/pkg/kubelet/dockershim/libdocker"
|
|
||||||
"k8s.io/kubernetes/pkg/kubelet/gpu"
|
|
||||||
)
|
|
||||||
|
|
||||||
// TODO: rework to use Nvidia's NVML, which is more complex, but also provides more fine-grained information and stats.
|
|
||||||
const (
|
|
||||||
// All NVIDIA GPUs cards should be mounted with nvidiactl and nvidia-uvm
|
|
||||||
// If the driver installed correctly, the 2 devices will be there.
|
|
||||||
nvidiaCtlDevice string = "/dev/nvidiactl"
|
|
||||||
nvidiaUVMDevice string = "/dev/nvidia-uvm"
|
|
||||||
// Optional device.
|
|
||||||
nvidiaUVMToolsDevice string = "/dev/nvidia-uvm-tools"
|
|
||||||
devDirectory = "/dev"
|
|
||||||
)
|
|
||||||
|
|
||||||
var (
|
|
||||||
nvidiaDeviceRE = regexp.MustCompile(`^nvidia[0-9]*$`)
|
|
||||||
nvidiaFullpathRE = regexp.MustCompile(`^/dev/nvidia[0-9]*$`)
|
|
||||||
)
|
|
||||||
|
|
||||||
type activePodsLister interface {
|
|
||||||
// Returns a list of active pods on the node.
|
|
||||||
GetActivePods() []*v1.Pod
|
|
||||||
}
|
|
||||||
|
|
||||||
// nvidiaGPUManager manages nvidia gpu devices.
|
|
||||||
type nvidiaGPUManager struct {
|
|
||||||
sync.Mutex
|
|
||||||
// All gpus available on the Node
|
|
||||||
allGPUs sets.String
|
|
||||||
allocated *podGPUs
|
|
||||||
defaultDevices []string
|
|
||||||
// The interface which could get GPU mapping from all the containers.
|
|
||||||
// TODO: Should make this independent of Docker in the future.
|
|
||||||
dockerClient libdocker.Interface
|
|
||||||
activePodsLister activePodsLister
|
|
||||||
}
|
|
||||||
|
|
||||||
// NewNvidiaGPUManager returns a GPUManager that manages local Nvidia GPUs.
|
|
||||||
// TODO: Migrate to use pod level cgroups and make it generic to all runtimes.
|
|
||||||
func NewNvidiaGPUManager(activePodsLister activePodsLister, config *dockershim.ClientConfig) (gpu.GPUManager, error) {
|
|
||||||
dockerClient := dockershim.NewDockerClientFromConfig(config)
|
|
||||||
if dockerClient == nil {
|
|
||||||
return nil, fmt.Errorf("invalid docker client configure specified")
|
|
||||||
}
|
|
||||||
|
|
||||||
return &nvidiaGPUManager{
|
|
||||||
allGPUs: sets.NewString(),
|
|
||||||
dockerClient: dockerClient,
|
|
||||||
activePodsLister: activePodsLister,
|
|
||||||
}, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// Initialize the GPU devices, so far only needed to discover the GPU paths.
|
|
||||||
func (ngm *nvidiaGPUManager) Start() error {
|
|
||||||
if ngm.dockerClient == nil {
|
|
||||||
return fmt.Errorf("Invalid docker client specified in GPU Manager")
|
|
||||||
}
|
|
||||||
ngm.Lock()
|
|
||||||
defer ngm.Unlock()
|
|
||||||
|
|
||||||
if _, err := os.Stat(nvidiaCtlDevice); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
if _, err := os.Stat(nvidiaUVMDevice); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
ngm.defaultDevices = []string{nvidiaCtlDevice, nvidiaUVMDevice}
|
|
||||||
_, err := os.Stat(nvidiaUVMToolsDevice)
|
|
||||||
if !os.IsNotExist(err) {
|
|
||||||
ngm.defaultDevices = append(ngm.defaultDevices, nvidiaUVMToolsDevice)
|
|
||||||
}
|
|
||||||
|
|
||||||
if err := ngm.discoverGPUs(); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
// We ignore errors when identifying allocated GPUs because it is possible that the runtime interfaces may be not be logically up.
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// Get how many GPU cards we have.
|
|
||||||
func (ngm *nvidiaGPUManager) Capacity() v1.ResourceList {
|
|
||||||
gpus := resource.NewQuantity(int64(len(ngm.allGPUs)), resource.DecimalSI)
|
|
||||||
return v1.ResourceList{
|
|
||||||
v1.ResourceNvidiaGPU: *gpus,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// AllocateGPUs returns `num` GPUs if available, error otherwise.
|
|
||||||
// Allocation is made thread safe using the following logic.
|
|
||||||
// A list of all GPUs allocated is maintained along with their respective Pod UIDs.
|
|
||||||
// It is expected that the list of active pods will not return any false positives.
|
|
||||||
// As part of initialization or allocation, the list of GPUs in use will be computed once.
|
|
||||||
// Whenever an allocation happens, the list of GPUs allocated is updated based on the list of currently active pods.
|
|
||||||
// GPUs allocated to terminated pods are freed up lazily as part of allocation.
|
|
||||||
// GPUs are allocated based on the internal list of allocatedGPUs.
|
|
||||||
// It is not safe to generate a list of GPUs in use by inspecting active containers because of the delay between GPU allocation and container creation.
|
|
||||||
// A GPU allocated to a container might be re-allocated to a subsequent container because the original container wasn't started quick enough.
|
|
||||||
// The current algorithm scans containers only once and then uses a list of active pods to track GPU usage.
|
|
||||||
// This is a sub-optimal solution and a better alternative would be that of using pod level cgroups instead.
|
|
||||||
// GPUs allocated to containers should be reflected in pod level device cgroups before completing allocations.
|
|
||||||
// The pod level cgroups will then serve as a checkpoint of GPUs in use.
|
|
||||||
func (ngm *nvidiaGPUManager) AllocateGPU(pod *v1.Pod, container *v1.Container) ([]string, error) {
|
|
||||||
gpusNeeded := container.Resources.Limits.NvidiaGPU().Value()
|
|
||||||
if gpusNeeded == 0 {
|
|
||||||
return []string{}, nil
|
|
||||||
}
|
|
||||||
ngm.Lock()
|
|
||||||
defer ngm.Unlock()
|
|
||||||
if ngm.allocated == nil {
|
|
||||||
// Initialization is not complete. Try now. Failures can no longer be tolerated.
|
|
||||||
ngm.allocated = ngm.gpusInUse()
|
|
||||||
} else {
|
|
||||||
// update internal list of GPUs in use prior to allocating new GPUs.
|
|
||||||
ngm.updateAllocatedGPUs()
|
|
||||||
}
|
|
||||||
// Check if GPUs have already been allocated. If so return them right away.
|
|
||||||
// This can happen if a container restarts for example.
|
|
||||||
if devices := ngm.allocated.getGPUs(string(pod.UID), container.Name); devices != nil {
|
|
||||||
glog.V(2).Infof("Found pre-allocated GPUs for container %q in Pod %q: %v", container.Name, pod.UID, devices.List())
|
|
||||||
return append(devices.List(), ngm.defaultDevices...), nil
|
|
||||||
}
|
|
||||||
// Get GPU devices in use.
|
|
||||||
devicesInUse := ngm.allocated.devices()
|
|
||||||
glog.V(5).Infof("gpus in use: %v", devicesInUse.List())
|
|
||||||
// Get a list of available GPUs.
|
|
||||||
available := ngm.allGPUs.Difference(devicesInUse)
|
|
||||||
glog.V(5).Infof("gpus available: %v", available.List())
|
|
||||||
if int64(available.Len()) < gpusNeeded {
|
|
||||||
return nil, fmt.Errorf("requested number of GPUs unavailable. Requested: %d, Available: %d", gpusNeeded, available.Len())
|
|
||||||
}
|
|
||||||
ret := available.UnsortedList()[:gpusNeeded]
|
|
||||||
for _, device := range ret {
|
|
||||||
// Update internal allocated GPU cache.
|
|
||||||
ngm.allocated.insert(string(pod.UID), container.Name, device)
|
|
||||||
}
|
|
||||||
// Add standard devices files that needs to be exposed.
|
|
||||||
ret = append(ret, ngm.defaultDevices...)
|
|
||||||
|
|
||||||
return ret, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// updateAllocatedGPUs updates the list of GPUs in use.
|
|
||||||
// It gets a list of active pods and then frees any GPUs that are bound to terminated pods.
|
|
||||||
// Returns error on failure.
|
|
||||||
func (ngm *nvidiaGPUManager) updateAllocatedGPUs() {
|
|
||||||
activePods := ngm.activePodsLister.GetActivePods()
|
|
||||||
activePodUids := sets.NewString()
|
|
||||||
for _, pod := range activePods {
|
|
||||||
activePodUids.Insert(string(pod.UID))
|
|
||||||
}
|
|
||||||
allocatedPodUids := ngm.allocated.pods()
|
|
||||||
podsToBeRemoved := allocatedPodUids.Difference(activePodUids)
|
|
||||||
glog.V(5).Infof("pods to be removed: %v", podsToBeRemoved.List())
|
|
||||||
ngm.allocated.delete(podsToBeRemoved.List())
|
|
||||||
}
|
|
||||||
|
|
||||||
// discoverGPUs identifies allGPUs NVIDIA GPU devices available on the local node by walking `/dev` directory.
|
|
||||||
// TODO: Without NVML support we only can check whether there has GPU devices, but
|
|
||||||
// could not give a health check or get more information like GPU cores, memory, or
|
|
||||||
// family name. Need to support NVML in the future. But we do not need NVML until
|
|
||||||
// we want more features, features like schedule containers according to GPU family
|
|
||||||
// name.
|
|
||||||
func (ngm *nvidiaGPUManager) discoverGPUs() error {
|
|
||||||
files, err := ioutil.ReadDir(devDirectory)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
for _, f := range files {
|
|
||||||
if f.IsDir() {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if nvidiaDeviceRE.MatchString(f.Name()) {
|
|
||||||
glog.V(2).Infof("Found Nvidia GPU %q", f.Name())
|
|
||||||
ngm.allGPUs.Insert(path.Join(devDirectory, f.Name()))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// gpusInUse returns a list of GPUs in use along with the respective pods that are using it.
|
|
||||||
func (ngm *nvidiaGPUManager) gpusInUse() *podGPUs {
|
|
||||||
pods := ngm.activePodsLister.GetActivePods()
|
|
||||||
type containerIdentifier struct {
|
|
||||||
id string
|
|
||||||
name string
|
|
||||||
}
|
|
||||||
type podContainers struct {
|
|
||||||
uid string
|
|
||||||
containers []containerIdentifier
|
|
||||||
}
|
|
||||||
// List of containers to inspect.
|
|
||||||
podContainersToInspect := []podContainers{}
|
|
||||||
for _, pod := range pods {
|
|
||||||
containers := sets.NewString()
|
|
||||||
for _, container := range pod.Spec.Containers {
|
|
||||||
// GPUs are expected to be specified only in limits.
|
|
||||||
if !container.Resources.Limits.NvidiaGPU().IsZero() {
|
|
||||||
containers.Insert(container.Name)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// If no GPUs were requested skip this pod.
|
|
||||||
if containers.Len() == 0 {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
// TODO: If kubelet restarts right after allocating a GPU to a pod, the container might not have started yet and so container status might not be available yet.
|
|
||||||
// Use an internal checkpoint instead or try using the CRI if its checkpoint is reliable.
|
|
||||||
var containersToInspect []containerIdentifier
|
|
||||||
for _, container := range pod.Status.ContainerStatuses {
|
|
||||||
if containers.Has(container.Name) {
|
|
||||||
containersToInspect = append(containersToInspect, containerIdentifier{strings.Replace(container.ContainerID, "docker://", "", 1), container.Name})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// add the pod and its containers that need to be inspected.
|
|
||||||
podContainersToInspect = append(podContainersToInspect, podContainers{string(pod.UID), containersToInspect})
|
|
||||||
}
|
|
||||||
ret := newPodGPUs()
|
|
||||||
for _, podContainer := range podContainersToInspect {
|
|
||||||
for _, containerIdentifier := range podContainer.containers {
|
|
||||||
containerJSON, err := ngm.dockerClient.InspectContainer(containerIdentifier.id)
|
|
||||||
if err != nil {
|
|
||||||
glog.V(3).Infof("Failed to inspect container %q in pod %q while attempting to reconcile nvidia gpus in use", containerIdentifier.id, podContainer.uid)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
devices := containerJSON.HostConfig.Devices
|
|
||||||
if devices == nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, device := range devices {
|
|
||||||
if isValidPath(device.PathOnHost) {
|
|
||||||
glog.V(4).Infof("Nvidia GPU %q is in use by Docker Container: %q", device.PathOnHost, containerJSON.ID)
|
|
||||||
ret.insert(podContainer.uid, containerIdentifier.name, device.PathOnHost)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return ret
|
|
||||||
}
|
|
||||||
|
|
||||||
func isValidPath(path string) bool {
|
|
||||||
return nvidiaFullpathRE.MatchString(path)
|
|
||||||
}
|
|
|
@ -1,213 +0,0 @@
|
||||||
/*
|
|
||||||
Copyright 2017 The Kubernetes Authors.
|
|
||||||
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
you may not use this file except in compliance with the License.
|
|
||||||
You may obtain a copy of the License at
|
|
||||||
|
|
||||||
http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
|
|
||||||
Unless required by applicable law or agreed to in writing, software
|
|
||||||
distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
See the License for the specific language governing permissions and
|
|
||||||
limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package nvidia
|
|
||||||
|
|
||||||
import (
|
|
||||||
"os"
|
|
||||||
"reflect"
|
|
||||||
"testing"
|
|
||||||
|
|
||||||
"github.com/stretchr/testify/assert"
|
|
||||||
|
|
||||||
"k8s.io/api/core/v1"
|
|
||||||
"k8s.io/apimachinery/pkg/api/resource"
|
|
||||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
|
||||||
"k8s.io/apimachinery/pkg/util/sets"
|
|
||||||
"k8s.io/apimachinery/pkg/util/uuid"
|
|
||||||
"k8s.io/kubernetes/pkg/kubelet/dockershim"
|
|
||||||
"k8s.io/kubernetes/pkg/kubelet/dockershim/libdocker"
|
|
||||||
)
|
|
||||||
|
|
||||||
type testActivePodsLister struct {
|
|
||||||
activePods []*v1.Pod
|
|
||||||
}
|
|
||||||
|
|
||||||
func (tapl *testActivePodsLister) GetActivePods() []*v1.Pod {
|
|
||||||
return tapl.activePods
|
|
||||||
}
|
|
||||||
|
|
||||||
func makeTestPod(numContainers, gpusPerContainer int) *v1.Pod {
|
|
||||||
quantity := resource.NewQuantity(int64(gpusPerContainer), resource.DecimalSI)
|
|
||||||
resources := v1.ResourceRequirements{
|
|
||||||
Limits: v1.ResourceList{
|
|
||||||
v1.ResourceNvidiaGPU: *quantity,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
pod := &v1.Pod{
|
|
||||||
ObjectMeta: metav1.ObjectMeta{
|
|
||||||
UID: uuid.NewUUID(),
|
|
||||||
},
|
|
||||||
Spec: v1.PodSpec{
|
|
||||||
Containers: []v1.Container{},
|
|
||||||
},
|
|
||||||
}
|
|
||||||
for ; numContainers > 0; numContainers-- {
|
|
||||||
pod.Spec.Containers = append(pod.Spec.Containers, v1.Container{
|
|
||||||
Name: string(uuid.NewUUID()),
|
|
||||||
Resources: resources,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
return pod
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestNewNvidiaGPUManager(t *testing.T) {
|
|
||||||
podLister := &testActivePodsLister{}
|
|
||||||
|
|
||||||
// Expects nil GPUManager and an error with nil dockerClient.
|
|
||||||
testGpuManager1, err := NewNvidiaGPUManager(podLister, nil)
|
|
||||||
as := assert.New(t)
|
|
||||||
as.Nil(testGpuManager1)
|
|
||||||
as.NotNil(err)
|
|
||||||
|
|
||||||
// Expects a GPUManager to be created with non-nil dockerClient.
|
|
||||||
testGpuManager2, err := NewNvidiaGPUManager(podLister, &dockershim.ClientConfig{
|
|
||||||
DockerEndpoint: libdocker.FakeDockerEndpoint,
|
|
||||||
})
|
|
||||||
as.NotNil(testGpuManager2)
|
|
||||||
as.Nil(err)
|
|
||||||
|
|
||||||
// Expects zero capacity without any GPUs.
|
|
||||||
gpuCapacity := testGpuManager2.Capacity()
|
|
||||||
as.Equal(len(gpuCapacity), 1)
|
|
||||||
rgpu := gpuCapacity[v1.ResourceNvidiaGPU]
|
|
||||||
as.Equal(rgpu.Value(), int64(0))
|
|
||||||
|
|
||||||
err2 := testGpuManager2.Start()
|
|
||||||
if !os.IsNotExist(err2) {
|
|
||||||
gpus := reflect.ValueOf(testGpuManager2).Elem().FieldByName("allGPUs").Len()
|
|
||||||
as.NotZero(gpus)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestMultiContainerPodGPUAllocation(t *testing.T) {
|
|
||||||
podLister := &testActivePodsLister{}
|
|
||||||
|
|
||||||
testGpuManager := &nvidiaGPUManager{
|
|
||||||
activePodsLister: podLister,
|
|
||||||
allGPUs: sets.NewString("/dev/nvidia0", "/dev/nvidia1"),
|
|
||||||
allocated: newPodGPUs(),
|
|
||||||
}
|
|
||||||
|
|
||||||
// Expect that no devices are in use.
|
|
||||||
gpusInUse := testGpuManager.gpusInUse()
|
|
||||||
as := assert.New(t)
|
|
||||||
as.Equal(len(gpusInUse.devices()), 0)
|
|
||||||
|
|
||||||
// Allocated GPUs for a pod with two containers.
|
|
||||||
pod := makeTestPod(2, 1)
|
|
||||||
// Allocate for the first container.
|
|
||||||
devices1, err := testGpuManager.AllocateGPU(pod, &pod.Spec.Containers[0])
|
|
||||||
as.Nil(err)
|
|
||||||
as.Equal(len(devices1), 1)
|
|
||||||
|
|
||||||
podLister.activePods = append(podLister.activePods, pod)
|
|
||||||
// Allocate for the second container.
|
|
||||||
devices2, err := testGpuManager.AllocateGPU(pod, &pod.Spec.Containers[1])
|
|
||||||
as.Nil(err)
|
|
||||||
as.Equal(len(devices2), 1)
|
|
||||||
|
|
||||||
as.NotEqual(devices1, devices2, "expected containers to get different devices")
|
|
||||||
|
|
||||||
// further allocations should fail.
|
|
||||||
newPod := makeTestPod(2, 1)
|
|
||||||
devices1, err = testGpuManager.AllocateGPU(newPod, &newPod.Spec.Containers[0])
|
|
||||||
as.NotNil(err, "expected gpu allocation to fail. got: %v", devices1)
|
|
||||||
|
|
||||||
// Now terminate the original pod and observe that GPU allocation for new pod succeeds.
|
|
||||||
podLister.activePods = podLister.activePods[:0]
|
|
||||||
|
|
||||||
devices1, err = testGpuManager.AllocateGPU(newPod, &newPod.Spec.Containers[0])
|
|
||||||
as.Nil(err)
|
|
||||||
as.Equal(len(devices1), 1)
|
|
||||||
|
|
||||||
podLister.activePods = append(podLister.activePods, newPod)
|
|
||||||
|
|
||||||
devices2, err = testGpuManager.AllocateGPU(newPod, &newPod.Spec.Containers[1])
|
|
||||||
as.Nil(err)
|
|
||||||
as.Equal(len(devices2), 1)
|
|
||||||
|
|
||||||
as.NotEqual(devices1, devices2, "expected containers to get different devices")
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestMultiPodGPUAllocation(t *testing.T) {
|
|
||||||
podLister := &testActivePodsLister{}
|
|
||||||
|
|
||||||
testGpuManager := &nvidiaGPUManager{
|
|
||||||
activePodsLister: podLister,
|
|
||||||
allGPUs: sets.NewString("/dev/nvidia0", "/dev/nvidia1"),
|
|
||||||
allocated: newPodGPUs(),
|
|
||||||
}
|
|
||||||
|
|
||||||
// Expect that no devices are in use.
|
|
||||||
gpusInUse := testGpuManager.gpusInUse()
|
|
||||||
as := assert.New(t)
|
|
||||||
as.Equal(len(gpusInUse.devices()), 0)
|
|
||||||
|
|
||||||
// Allocated GPUs for a pod with two containers.
|
|
||||||
podA := makeTestPod(1, 1)
|
|
||||||
// Allocate for the first container.
|
|
||||||
devicesA, err := testGpuManager.AllocateGPU(podA, &podA.Spec.Containers[0])
|
|
||||||
as.Nil(err)
|
|
||||||
as.Equal(len(devicesA), 1)
|
|
||||||
|
|
||||||
podLister.activePods = append(podLister.activePods, podA)
|
|
||||||
|
|
||||||
// further allocations should fail.
|
|
||||||
podB := makeTestPod(1, 1)
|
|
||||||
// Allocate for the first container.
|
|
||||||
devicesB, err := testGpuManager.AllocateGPU(podB, &podB.Spec.Containers[0])
|
|
||||||
as.Nil(err)
|
|
||||||
as.Equal(len(devicesB), 1)
|
|
||||||
as.NotEqual(devicesA, devicesB, "expected pods to get different devices")
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestPodContainerRestart(t *testing.T) {
|
|
||||||
podLister := &testActivePodsLister{}
|
|
||||||
|
|
||||||
testGpuManager := &nvidiaGPUManager{
|
|
||||||
activePodsLister: podLister,
|
|
||||||
allGPUs: sets.NewString("/dev/nvidia0", "/dev/nvidia1"),
|
|
||||||
allocated: newPodGPUs(),
|
|
||||||
defaultDevices: []string{"/dev/nvidia-smi"},
|
|
||||||
}
|
|
||||||
|
|
||||||
// Expect that no devices are in use.
|
|
||||||
gpusInUse := testGpuManager.gpusInUse()
|
|
||||||
as := assert.New(t)
|
|
||||||
as.Equal(len(gpusInUse.devices()), 0)
|
|
||||||
|
|
||||||
// Make a pod with one containers that requests two GPUs.
|
|
||||||
podA := makeTestPod(1, 2)
|
|
||||||
// Allocate GPUs
|
|
||||||
devicesA, err := testGpuManager.AllocateGPU(podA, &podA.Spec.Containers[0])
|
|
||||||
as.Nil(err)
|
|
||||||
as.Equal(len(devicesA), 3)
|
|
||||||
|
|
||||||
podLister.activePods = append(podLister.activePods, podA)
|
|
||||||
|
|
||||||
// further allocations should fail.
|
|
||||||
podB := makeTestPod(1, 1)
|
|
||||||
_, err = testGpuManager.AllocateGPU(podB, &podB.Spec.Containers[0])
|
|
||||||
as.NotNil(err)
|
|
||||||
|
|
||||||
// Allcate GPU for existing Pod A.
|
|
||||||
// The same gpus must be returned.
|
|
||||||
devicesAretry, err := testGpuManager.AllocateGPU(podA, &podA.Spec.Containers[0])
|
|
||||||
as.Nil(err)
|
|
||||||
as.Equal(len(devicesA), 3)
|
|
||||||
as.True(sets.NewString(devicesA...).Equal(sets.NewString(devicesAretry...)))
|
|
||||||
}
|
|
|
@ -1,32 +0,0 @@
|
||||||
/*
|
|
||||||
Copyright 2017 The Kubernetes Authors.
|
|
||||||
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
you may not use this file except in compliance with the License.
|
|
||||||
You may obtain a copy of the License at
|
|
||||||
|
|
||||||
http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
|
|
||||||
Unless required by applicable law or agreed to in writing, software
|
|
||||||
distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
See the License for the specific language governing permissions and
|
|
||||||
limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package gpu
|
|
||||||
|
|
||||||
import "k8s.io/api/core/v1"
|
|
||||||
|
|
||||||
// GPUManager manages GPUs on a local node.
|
|
||||||
// Implementations are expected to be thread safe.
|
|
||||||
type GPUManager interface {
|
|
||||||
// Start logically initializes GPUManager
|
|
||||||
Start() error
|
|
||||||
// Capacity returns the total number of GPUs on the node.
|
|
||||||
Capacity() v1.ResourceList
|
|
||||||
// AllocateGPU attempts to allocate GPUs for input container.
|
|
||||||
// Returns paths to allocated GPUs and nil on success.
|
|
||||||
// Returns an error on failure.
|
|
||||||
AllocateGPU(*v1.Pod, *v1.Container) ([]string, error)
|
|
||||||
}
|
|
|
@ -69,8 +69,6 @@ import (
|
||||||
dockerremote "k8s.io/kubernetes/pkg/kubelet/dockershim/remote"
|
dockerremote "k8s.io/kubernetes/pkg/kubelet/dockershim/remote"
|
||||||
"k8s.io/kubernetes/pkg/kubelet/events"
|
"k8s.io/kubernetes/pkg/kubelet/events"
|
||||||
"k8s.io/kubernetes/pkg/kubelet/eviction"
|
"k8s.io/kubernetes/pkg/kubelet/eviction"
|
||||||
"k8s.io/kubernetes/pkg/kubelet/gpu"
|
|
||||||
"k8s.io/kubernetes/pkg/kubelet/gpu/nvidia"
|
|
||||||
"k8s.io/kubernetes/pkg/kubelet/images"
|
"k8s.io/kubernetes/pkg/kubelet/images"
|
||||||
"k8s.io/kubernetes/pkg/kubelet/kubeletconfig"
|
"k8s.io/kubernetes/pkg/kubelet/kubeletconfig"
|
||||||
"k8s.io/kubernetes/pkg/kubelet/kuberuntime"
|
"k8s.io/kubernetes/pkg/kubelet/kuberuntime"
|
||||||
|
@ -866,20 +864,6 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration,
|
||||||
klet.appArmorValidator = apparmor.NewValidator(containerRuntime)
|
klet.appArmorValidator = apparmor.NewValidator(containerRuntime)
|
||||||
klet.softAdmitHandlers.AddPodAdmitHandler(lifecycle.NewAppArmorAdmitHandler(klet.appArmorValidator))
|
klet.softAdmitHandlers.AddPodAdmitHandler(lifecycle.NewAppArmorAdmitHandler(klet.appArmorValidator))
|
||||||
klet.softAdmitHandlers.AddPodAdmitHandler(lifecycle.NewNoNewPrivsAdmitHandler(klet.containerRuntime))
|
klet.softAdmitHandlers.AddPodAdmitHandler(lifecycle.NewNoNewPrivsAdmitHandler(klet.containerRuntime))
|
||||||
if utilfeature.DefaultFeatureGate.Enabled(features.Accelerators) {
|
|
||||||
if containerRuntime == kubetypes.DockerContainerRuntime {
|
|
||||||
glog.Warningln("Accelerators feature is deprecated and will be removed in v1.11. Please use device plugins instead. They can be enabled using the DevicePlugins feature gate.")
|
|
||||||
if klet.gpuManager, err = nvidia.NewNvidiaGPUManager(klet, kubeDeps.DockerClientConfig); err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
glog.Errorf("Accelerators feature is supported with docker runtime only. Disabling this feature internally.")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// Set GPU manager to a stub implementation if it is not enabled or cannot be supported.
|
|
||||||
if klet.gpuManager == nil {
|
|
||||||
klet.gpuManager = gpu.NewGPUManagerStub()
|
|
||||||
}
|
|
||||||
// Finally, put the most recent version of the config on the Kubelet, so
|
// Finally, put the most recent version of the config on the Kubelet, so
|
||||||
// people can see how it was configured.
|
// people can see how it was configured.
|
||||||
klet.kubeletConfiguration = *kubeCfg
|
klet.kubeletConfiguration = *kubeCfg
|
||||||
|
@@ -1152,9 +1136,6 @@ type Kubelet struct {
 	// experimental behavior is desired.
 	experimentalHostUserNamespaceDefaulting bool

-	// GPU Manager
-	gpuManager gpu.GPUManager
-
 	// dockerLegacyService contains some legacy methods for backward compatibility.
 	// It should be set only when docker is using non json-file logging driver.
 	dockerLegacyService dockershim.DockerLegacyService
@@ -1292,11 +1273,6 @@ func (kl *Kubelet) initializeModules() error {
 		return fmt.Errorf("Failed to start OOM watcher %v", err)
 	}

-	// Initialize GPUs
-	if err := kl.gpuManager.Start(); err != nil {
-		glog.Errorf("Failed to start gpuManager %v", err)
-	}
-
 	// Start resource analyzer
 	kl.resourceAnalyzer.Start()

@@ -540,14 +540,6 @@ func (kl *Kubelet) setNodeStatusMachineInfo(node *v1.Node) {
 			node.Status.Capacity = v1.ResourceList{}
 		}

-		// populate GPU capacity.
-		gpuCapacity := kl.gpuManager.Capacity()
-		if gpuCapacity != nil {
-			for k, v := range gpuCapacity {
-				node.Status.Capacity[k] = v
-			}
-		}
-
 		var devicePluginAllocatable v1.ResourceList
 		var devicePluginCapacity v1.ResourceList
 		var removedDevicePlugins []string
@@ -90,26 +90,6 @@ func (kl *Kubelet) GetActivePods() []*v1.Pod {
 	return activePods
 }

-// makeGPUDevices determines the devices for the given container.
-// Experimental.
-func (kl *Kubelet) makeGPUDevices(pod *v1.Pod, container *v1.Container) ([]kubecontainer.DeviceInfo, error) {
-	if container.Resources.Limits.NvidiaGPU().IsZero() {
-		return nil, nil
-	}
-
-	nvidiaGPUPaths, err := kl.gpuManager.AllocateGPU(pod, container)
-	if err != nil {
-		return nil, err
-	}
-	var devices []kubecontainer.DeviceInfo
-	for _, path := range nvidiaGPUPaths {
-		// Devices have to be mapped one to one because of nvidia CUDA library requirements.
-		devices = append(devices, kubecontainer.DeviceInfo{PathOnHost: path, PathInContainer: path, Permissions: "mrw"})
-	}
-
-	return devices, nil
-}
-
 func makeAbsolutePath(goos, path string) string {
 	if goos != "windows" {
 		return "/" + path
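The deleted helper mapped each allocated GPU path one-to-one into a `kubecontainer.DeviceInfo` with `mrw` permissions; device-plugin allocations feed the same structure today. A sketch of that mapping, kept generic over any list of host device paths (the helper name is illustrative):

```go
package example

import (
	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
)

// devicesFromPaths mirrors the removed makeGPUDevices loop: every device is
// exposed in the container at the same path it has on the host, with
// read/write/mknod access.
func devicesFromPaths(paths []string) []kubecontainer.DeviceInfo {
	var devices []kubecontainer.DeviceInfo
	for _, path := range paths {
		devices = append(devices, kubecontainer.DeviceInfo{
			PathOnHost:      path,
			PathInContainer: path,
			Permissions:     "mrw",
		})
	}
	return devices
}
```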
@@ -470,12 +450,6 @@ func (kl *Kubelet) GenerateRunContainerOptions(pod *v1.Pod, container *v1.Contai
 	volumes := kl.volumeManager.GetMountedVolumesForPod(podName)

 	opts.PortMappings = kubecontainer.MakePortMappings(container)
-	// TODO(random-liu): Move following convert functions into pkg/kubelet/container
-	devices, err := kl.makeGPUDevices(pod, container)
-	if err != nil {
-		return nil, nil, err
-	}
-	opts.Devices = append(opts.Devices, devices...)

 	// TODO: remove feature gate check after no longer needed
 	if utilfeature.DefaultFeatureGate.Enabled(features.BlockVolume) {
@@ -49,7 +49,6 @@ import (
 	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
 	containertest "k8s.io/kubernetes/pkg/kubelet/container/testing"
 	"k8s.io/kubernetes/pkg/kubelet/eviction"
-	"k8s.io/kubernetes/pkg/kubelet/gpu"
 	"k8s.io/kubernetes/pkg/kubelet/images"
 	"k8s.io/kubernetes/pkg/kubelet/lifecycle"
 	"k8s.io/kubernetes/pkg/kubelet/logs"
@@ -325,7 +324,6 @@ func newTestKubeletWithImageList(

 	kubelet.AddPodSyncLoopHandler(activeDeadlineHandler)
 	kubelet.AddPodSyncHandler(activeDeadlineHandler)
-	kubelet.gpuManager = gpu.NewGPUManagerStub()
 	return &TestKubelet{kubelet, fakeRuntime, mockCadvisor, fakeKubeClient, fakeMirrorClient, fakeClock, nil, plug}
 }

@@ -248,7 +248,6 @@ func sortPodsByQOS(pods []*v1.Pod) (bestEffort, burstable, guaranteed []*v1.Pod)
 // returns true if pod1 has a smaller request than pod2
 func smallerResourceRequest(pod1 *v1.Pod, pod2 *v1.Pod) bool {
 	priorityList := []v1.ResourceName{
-		v1.ResourceNvidiaGPU,
 		v1.ResourceMemory,
 		v1.ResourceCPU,
 	}
@@ -682,10 +682,6 @@ func GetResourceRequest(pod *v1.Pod) *schedulercache.Resource {
 			if cpu := rQuantity.MilliValue(); cpu > result.MilliCPU {
 				result.MilliCPU = cpu
 			}
-		case v1.ResourceNvidiaGPU:
-			if gpu := rQuantity.Value(); gpu > result.NvidiaGPU {
-				result.NvidiaGPU = gpu
-			}
 		default:
 			if v1helper.IsScalarResourceName(rName) {
 				value := rQuantity.Value()
@@ -734,7 +730,6 @@ func PodFitsResources(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *s
 	}
 	if podRequest.MilliCPU == 0 &&
 		podRequest.Memory == 0 &&
-		podRequest.NvidiaGPU == 0 &&
 		podRequest.EphemeralStorage == 0 &&
 		len(podRequest.ScalarResources) == 0 {
 		return len(predicateFails) == 0, predicateFails, nil
@@ -747,10 +742,6 @@ func PodFitsResources(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *s
 	if allocatable.Memory < podRequest.Memory+nodeInfo.RequestedResource().Memory {
 		predicateFails = append(predicateFails, NewInsufficientResourceError(v1.ResourceMemory, podRequest.Memory, nodeInfo.RequestedResource().Memory, allocatable.Memory))
 	}
-	if allocatable.NvidiaGPU < podRequest.NvidiaGPU+nodeInfo.RequestedResource().NvidiaGPU {
-		predicateFails = append(predicateFails, NewInsufficientResourceError(v1.ResourceNvidiaGPU, podRequest.NvidiaGPU, nodeInfo.RequestedResource().NvidiaGPU, allocatable.NvidiaGPU))
-	}
-
 	if allocatable.EphemeralStorage < podRequest.EphemeralStorage+nodeInfo.RequestedResource().EphemeralStorage {
 		predicateFails = append(predicateFails, NewInsufficientResourceError(v1.ResourceEphemeralStorage, podRequest.EphemeralStorage, nodeInfo.RequestedResource().EphemeralStorage, allocatable.EphemeralStorage))
 	}
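Without the dedicated `NvidiaGPU` field, GPU requests reach this predicate as extended resources and are checked through the `ScalarResources` map handled by the surrounding code. A simplified, illustrative sketch of that per-resource fit check:

```go
package example

import (
	v1 "k8s.io/api/core/v1"
)

// scalarResourcesFit reports whether every extended resource a pod requests
// (for example a device-plugin resource such as "nvidia.com/gpu") still fits
// on the node given what is already in use.
func scalarResourcesFit(request, used, allocatable map[v1.ResourceName]int64) bool {
	for rName, rQuant := range request {
		if allocatable[rName] < rQuant+used[rName] {
			return false
		}
	}
	return true
}
```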
@@ -44,13 +44,12 @@ var (
 	hugePageResourceA = v1helper.HugePageResourceName(resource.MustParse("2Mi"))
 )

-func makeResources(milliCPU, memory, nvidiaGPUs, pods, extendedA, storage, hugePageA int64) v1.NodeResources {
+func makeResources(milliCPU, memory, pods, extendedA, storage, hugePageA int64) v1.NodeResources {
 	return v1.NodeResources{
 		Capacity: v1.ResourceList{
 			v1.ResourceCPU:              *resource.NewMilliQuantity(milliCPU, resource.DecimalSI),
 			v1.ResourceMemory:           *resource.NewQuantity(memory, resource.BinarySI),
 			v1.ResourcePods:             *resource.NewQuantity(pods, resource.DecimalSI),
-			v1.ResourceNvidiaGPU:        *resource.NewQuantity(nvidiaGPUs, resource.DecimalSI),
 			extendedResourceA:           *resource.NewQuantity(extendedA, resource.DecimalSI),
 			v1.ResourceEphemeralStorage: *resource.NewQuantity(storage, resource.BinarySI),
 			hugePageResourceA:           *resource.NewQuantity(hugePageA, resource.BinarySI),
@@ -58,12 +57,11 @@ func makeResources(milliCPU, memory, nvidiaGPUs, pods, extendedA, storage, hugeP
 	}
 }

-func makeAllocatableResources(milliCPU, memory, nvidiaGPUs, pods, extendedA, storage, hugePageA int64) v1.ResourceList {
+func makeAllocatableResources(milliCPU, memory, pods, extendedA, storage, hugePageA int64) v1.ResourceList {
 	return v1.ResourceList{
 		v1.ResourceCPU:              *resource.NewMilliQuantity(milliCPU, resource.DecimalSI),
 		v1.ResourceMemory:           *resource.NewQuantity(memory, resource.BinarySI),
 		v1.ResourcePods:             *resource.NewQuantity(pods, resource.DecimalSI),
-		v1.ResourceNvidiaGPU:        *resource.NewQuantity(nvidiaGPUs, resource.DecimalSI),
 		extendedResourceA:           *resource.NewQuantity(extendedA, resource.DecimalSI),
 		v1.ResourceEphemeralStorage: *resource.NewQuantity(storage, resource.BinarySI),
 		hugePageResourceA:           *resource.NewQuantity(hugePageA, resource.BinarySI),
@@ -357,7 +355,7 @@ func TestPodFitsResources(t *testing.T) {
 	}

 	for _, test := range enoughPodsTests {
-		node := v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 5, 20, 5).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 5, 20, 5)}}
+		node := v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 32, 5, 20, 5).Capacity, Allocatable: makeAllocatableResources(10, 20, 32, 5, 20, 5)}}
 		test.nodeInfo.SetNode(&node)
 		RegisterPredicateMetadataProducerWithExtendedResourceOptions(test.ignoredExtendedResources)
 		meta := PredicateMetadata(test.pod, nil)
@@ -414,7 +412,7 @@ func TestPodFitsResources(t *testing.T) {
 		},
 	}
 	for _, test := range notEnoughPodsTests {
-		node := v1.Node{Status: v1.NodeStatus{Capacity: v1.ResourceList{}, Allocatable: makeAllocatableResources(10, 20, 0, 1, 0, 0, 0)}}
+		node := v1.Node{Status: v1.NodeStatus{Capacity: v1.ResourceList{}, Allocatable: makeAllocatableResources(10, 20, 1, 0, 0, 0)}}
 		test.nodeInfo.SetNode(&node)
 		fits, reasons, err := PodFitsResources(test.pod, PredicateMetadata(test.pod, nil), test.nodeInfo)
 		if err != nil {
@@ -472,7 +470,7 @@ func TestPodFitsResources(t *testing.T) {
 	}

 	for _, test := range storagePodsTests {
-		node := v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 5, 20, 5).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 5, 20, 5)}}
+		node := v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 32, 5, 20, 5).Capacity, Allocatable: makeAllocatableResources(10, 20, 32, 5, 20, 5)}}
 		test.nodeInfo.SetNode(&node)
 		fits, reasons, err := PodFitsResources(test.pod, PredicateMetadata(test.pod, nil), test.nodeInfo)
 		if err != nil {
@@ -2062,7 +2060,7 @@ func TestRunGeneralPredicates(t *testing.T) {
 				newResourcePod(schedulercache.Resource{MilliCPU: 9, Memory: 19})),
 			node: &v1.Node{
 				ObjectMeta: metav1.ObjectMeta{Name: "machine1"},
-				Status:     v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 0, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 0, 0, 0)},
+				Status:     v1.NodeStatus{Capacity: makeResources(10, 20, 32, 0, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 32, 0, 0, 0)},
 			},
 			fits: true,
 			wErr: nil,
@@ -2074,7 +2072,7 @@ func TestRunGeneralPredicates(t *testing.T) {
 				newResourcePod(schedulercache.Resource{MilliCPU: 5, Memory: 19})),
 			node: &v1.Node{
 				ObjectMeta: metav1.ObjectMeta{Name: "machine1"},
-				Status:     v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 0, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 0, 0, 0)},
+				Status:     v1.NodeStatus{Capacity: makeResources(10, 20, 32, 0, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 32, 0, 0, 0)},
 			},
 			fits: false,
 			wErr: nil,
@@ -2084,34 +2082,6 @@ func TestRunGeneralPredicates(t *testing.T) {
 			},
 			test: "not enough cpu and memory resource",
 		},
-		{
-			pod: &v1.Pod{},
-			nodeInfo: schedulercache.NewNodeInfo(
-				newResourcePod(schedulercache.Resource{MilliCPU: 9, Memory: 19})),
-			node: &v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 1, 32, 0, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 1, 32, 0, 0, 0)}},
-			fits: true,
-			wErr: nil,
-			test: "no resources/port/host requested always fits on GPU machine",
-		},
-		{
-			pod: newResourcePod(schedulercache.Resource{MilliCPU: 3, Memory: 1, NvidiaGPU: 1}),
-			nodeInfo: schedulercache.NewNodeInfo(
-				newResourcePod(schedulercache.Resource{MilliCPU: 5, Memory: 10, NvidiaGPU: 1})),
-			node: &v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 1, 32, 0, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 1, 32, 0, 0, 0)}},
-			fits: false,
-			wErr: nil,
-			reasons: []algorithm.PredicateFailureReason{NewInsufficientResourceError(v1.ResourceNvidiaGPU, 1, 1, 1)},
-			test: "not enough GPU resource",
-		},
-		{
-			pod: newResourcePod(schedulercache.Resource{MilliCPU: 3, Memory: 1, NvidiaGPU: 1}),
-			nodeInfo: schedulercache.NewNodeInfo(
-				newResourcePod(schedulercache.Resource{MilliCPU: 5, Memory: 10, NvidiaGPU: 0})),
-			node: &v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 1, 32, 0, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 1, 32, 0, 0, 0)}},
-			fits: true,
-			wErr: nil,
-			test: "enough GPU resource",
-		},
 		{
 			pod: &v1.Pod{
 				Spec: v1.PodSpec{
@@ -2121,7 +2091,7 @@ func TestRunGeneralPredicates(t *testing.T) {
 			nodeInfo: schedulercache.NewNodeInfo(),
 			node: &v1.Node{
 				ObjectMeta: metav1.ObjectMeta{Name: "machine1"},
-				Status:     v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 0, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 0, 0, 0)},
+				Status:     v1.NodeStatus{Capacity: makeResources(10, 20, 32, 0, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 32, 0, 0, 0)},
 			},
 			fits: false,
 			wErr: nil,
@@ -2133,7 +2103,7 @@ func TestRunGeneralPredicates(t *testing.T) {
 			nodeInfo: schedulercache.NewNodeInfo(newPodWithPort(123)),
 			node: &v1.Node{
 				ObjectMeta: metav1.ObjectMeta{Name: "machine1"},
-				Status:     v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 0, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 0, 0, 0)},
+				Status:     v1.NodeStatus{Capacity: makeResources(10, 20, 32, 0, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 32, 0, 0, 0)},
 			},
 			fits: false,
 			wErr: nil,
@@ -3443,7 +3413,7 @@ func TestPodSchedulesOnNodeWithMemoryPressureCondition(t *testing.T) {
 					ImagePullPolicy: "Always",
 					// at least one requirement -> burstable pod
 					Resources: v1.ResourceRequirements{
-						Requests: makeAllocatableResources(100, 100, 100, 100, 0, 0, 0),
+						Requests: makeAllocatableResources(100, 100, 100, 0, 0, 0),
 					},
 				},
 			},
@@ -109,10 +109,6 @@ func getResourceLimits(pod *v1.Pod) *schedulercache.Resource {
 			if ephemeralStorage := rQuantity.Value(); ephemeralStorage > result.EphemeralStorage {
 				result.EphemeralStorage = ephemeralStorage
 			}
-		case v1.ResourceNvidiaGPU:
-			if gpu := rQuantity.Value(); gpu > result.NvidiaGPU {
-				result.NvidiaGPU = gpu
-			}
 		default:
 			if v1helper.IsScalarResourceName(rName) {
 				value := rQuantity.Value()
@@ -114,7 +114,6 @@ func (transientSchedInfo *transientSchedulerInfo) resetTransientSchedulerInfo()
 type Resource struct {
 	MilliCPU         int64
 	Memory           int64
-	NvidiaGPU        int64
 	EphemeralStorage int64
 	// We store allowedPodNumber (which is Node.Status.Allocatable.Pods().Value())
 	// explicitly as int, to avoid conversions and improve performance.
@@ -142,8 +141,6 @@ func (r *Resource) Add(rl v1.ResourceList) {
 			r.MilliCPU += rQuant.MilliValue()
 		case v1.ResourceMemory:
 			r.Memory += rQuant.Value()
-		case v1.ResourceNvidiaGPU:
-			r.NvidiaGPU += rQuant.Value()
 		case v1.ResourcePods:
 			r.AllowedPodNumber += int(rQuant.Value())
 		case v1.ResourceEphemeralStorage:
@@ -161,7 +158,6 @@ func (r *Resource) ResourceList() v1.ResourceList {
 	result := v1.ResourceList{
 		v1.ResourceCPU:              *resource.NewMilliQuantity(r.MilliCPU, resource.DecimalSI),
 		v1.ResourceMemory:           *resource.NewQuantity(r.Memory, resource.BinarySI),
-		v1.ResourceNvidiaGPU:        *resource.NewQuantity(r.NvidiaGPU, resource.DecimalSI),
 		v1.ResourcePods:             *resource.NewQuantity(int64(r.AllowedPodNumber), resource.BinarySI),
 		v1.ResourceEphemeralStorage: *resource.NewQuantity(r.EphemeralStorage, resource.BinarySI),
 	}
@@ -180,7 +176,6 @@ func (r *Resource) Clone() *Resource {
 	res := &Resource{
 		MilliCPU:         r.MilliCPU,
 		Memory:           r.Memory,
-		NvidiaGPU:        r.NvidiaGPU,
 		AllowedPodNumber: r.AllowedPodNumber,
 		EphemeralStorage: r.EphemeralStorage,
 	}
@@ -369,7 +364,6 @@ func (n *NodeInfo) AddPod(pod *v1.Pod) {
 	res, non0CPU, non0Mem := calculateResource(pod)
 	n.requestedResource.MilliCPU += res.MilliCPU
 	n.requestedResource.Memory += res.Memory
-	n.requestedResource.NvidiaGPU += res.NvidiaGPU
 	n.requestedResource.EphemeralStorage += res.EphemeralStorage
 	if n.requestedResource.ScalarResources == nil && len(res.ScalarResources) > 0 {
 		n.requestedResource.ScalarResources = map[v1.ResourceName]int64{}
@@ -425,7 +419,6 @@ func (n *NodeInfo) RemovePod(pod *v1.Pod) error {

 	n.requestedResource.MilliCPU -= res.MilliCPU
 	n.requestedResource.Memory -= res.Memory
-	n.requestedResource.NvidiaGPU -= res.NvidiaGPU
 	n.requestedResource.EphemeralStorage -= res.EphemeralStorage
 	if len(res.ScalarResources) > 0 && n.requestedResource.ScalarResources == nil {
 		n.requestedResource.ScalarResources = map[v1.ResourceName]int64{}
@@ -41,7 +41,6 @@ func TestNewResource(t *testing.T) {
 			resourceList: map[v1.ResourceName]resource.Quantity{
 				v1.ResourceCPU:              *resource.NewScaledQuantity(4, -3),
 				v1.ResourceMemory:           *resource.NewQuantity(2000, resource.BinarySI),
-				v1.ResourceNvidiaGPU:        *resource.NewQuantity(1000, resource.DecimalSI),
 				v1.ResourcePods:             *resource.NewQuantity(80, resource.BinarySI),
 				v1.ResourceEphemeralStorage: *resource.NewQuantity(5000, resource.BinarySI),
 				"scalar.test/" + "scalar1":  *resource.NewQuantity(1, resource.DecimalSI),
@@ -50,7 +49,6 @@ func TestNewResource(t *testing.T) {
 			expected: &Resource{
 				MilliCPU:         4,
 				Memory:           2000,
-				NvidiaGPU:        1000,
 				EphemeralStorage: 5000,
 				AllowedPodNumber: 80,
 				ScalarResources:  map[v1.ResourceName]int64{"scalar.test/scalar1": 1, "hugepages-test": 2},
@@ -76,7 +74,6 @@ func TestResourceList(t *testing.T) {
 			expected: map[v1.ResourceName]resource.Quantity{
 				v1.ResourceCPU:              *resource.NewScaledQuantity(0, -3),
 				v1.ResourceMemory:           *resource.NewQuantity(0, resource.BinarySI),
-				v1.ResourceNvidiaGPU:        *resource.NewQuantity(0, resource.DecimalSI),
 				v1.ResourcePods:             *resource.NewQuantity(0, resource.BinarySI),
 				v1.ResourceEphemeralStorage: *resource.NewQuantity(0, resource.BinarySI),
 			},
@@ -85,7 +82,6 @@ func TestResourceList(t *testing.T) {
 			resource: &Resource{
 				MilliCPU:         4,
 				Memory:           2000,
-				NvidiaGPU:        1000,
 				EphemeralStorage: 5000,
 				AllowedPodNumber: 80,
 				ScalarResources:  map[v1.ResourceName]int64{"scalar.test/scalar1": 1, "hugepages-test": 2},
@@ -93,7 +89,6 @@ func TestResourceList(t *testing.T) {
 			expected: map[v1.ResourceName]resource.Quantity{
 				v1.ResourceCPU:              *resource.NewScaledQuantity(4, -3),
 				v1.ResourceMemory:           *resource.NewQuantity(2000, resource.BinarySI),
-				v1.ResourceNvidiaGPU:        *resource.NewQuantity(1000, resource.DecimalSI),
 				v1.ResourcePods:             *resource.NewQuantity(80, resource.BinarySI),
 				v1.ResourceEphemeralStorage: *resource.NewQuantity(5000, resource.BinarySI),
 				"scalar.test/" + "scalar1":  *resource.NewQuantity(1, resource.DecimalSI),
@@ -123,7 +118,6 @@ func TestResourceClone(t *testing.T) {
 			resource: &Resource{
 				MilliCPU:         4,
 				Memory:           2000,
-				NvidiaGPU:        1000,
 				EphemeralStorage: 5000,
 				AllowedPodNumber: 80,
 				ScalarResources:  map[v1.ResourceName]int64{"scalar.test/scalar1": 1, "hugepages-test": 2},
@@ -131,7 +125,6 @@ func TestResourceClone(t *testing.T) {
 			expected: &Resource{
 				MilliCPU:         4,
 				Memory:           2000,
-				NvidiaGPU:        1000,
 				EphemeralStorage: 5000,
 				AllowedPodNumber: 80,
 				ScalarResources:  map[v1.ResourceName]int64{"scalar.test/scalar1": 1, "hugepages-test": 2},
@@ -168,7 +161,6 @@ func TestResourceAddScalar(t *testing.T) {
 			resource: &Resource{
 				MilliCPU:         4,
 				Memory:           2000,
-				NvidiaGPU:        1000,
 				EphemeralStorage: 5000,
 				AllowedPodNumber: 80,
 				ScalarResources:  map[v1.ResourceName]int64{"hugepages-test": 2},
@@ -178,7 +170,6 @@ func TestResourceAddScalar(t *testing.T) {
 			expected: &Resource{
 				MilliCPU:         4,
 				Memory:           2000,
-				NvidiaGPU:        1000,
 				EphemeralStorage: 5000,
 				AllowedPodNumber: 80,
 				ScalarResources:  map[v1.ResourceName]int64{"hugepages-test": 2, "scalar2": 200},
@@ -205,7 +196,6 @@ func TestNewNodeInfo(t *testing.T) {
 		requestedResource: &Resource{
 			MilliCPU:         300,
 			Memory:           1524,
-			NvidiaGPU:        0,
 			EphemeralStorage: 0,
 			AllowedPodNumber: 0,
 			ScalarResources:  map[v1.ResourceName]int64(nil),
@@ -213,7 +203,6 @@ func TestNewNodeInfo(t *testing.T) {
 		nonzeroRequest: &Resource{
 			MilliCPU:         300,
 			Memory:           1524,
-			NvidiaGPU:        0,
 			EphemeralStorage: 0,
 			AllowedPodNumber: 0,
 			ScalarResources:  map[v1.ResourceName]int64(nil),
@@ -516,7 +505,6 @@ func TestNodeInfoAddPod(t *testing.T) {
 		requestedResource: &Resource{
 			MilliCPU:         300,
 			Memory:           1524,
-			NvidiaGPU:        0,
 			EphemeralStorage: 0,
 			AllowedPodNumber: 0,
 			ScalarResources:  map[v1.ResourceName]int64(nil),
@@ -524,7 +512,6 @@ func TestNodeInfoAddPod(t *testing.T) {
 		nonzeroRequest: &Resource{
 			MilliCPU:         300,
 			Memory:           1524,
-			NvidiaGPU:        0,
 			EphemeralStorage: 0,
 			AllowedPodNumber: 0,
 			ScalarResources:  map[v1.ResourceName]int64(nil),
@@ -630,7 +617,6 @@ func TestNodeInfoRemovePod(t *testing.T) {
 		requestedResource: &Resource{
 			MilliCPU:         300,
 			Memory:           1524,
-			NvidiaGPU:        0,
 			EphemeralStorage: 0,
 			AllowedPodNumber: 0,
 			ScalarResources:  map[v1.ResourceName]int64(nil),
@@ -638,7 +624,6 @@ func TestNodeInfoRemovePod(t *testing.T) {
 		nonzeroRequest: &Resource{
 			MilliCPU:         300,
 			Memory:           1524,
-			NvidiaGPU:        0,
 			EphemeralStorage: 0,
 			AllowedPodNumber: 0,
 			ScalarResources:  map[v1.ResourceName]int64(nil),
@@ -748,7 +733,6 @@ func TestNodeInfoRemovePod(t *testing.T) {
 		requestedResource: &Resource{
 			MilliCPU:         200,
 			Memory:           1024,
-			NvidiaGPU:        0,
 			EphemeralStorage: 0,
 			AllowedPodNumber: 0,
 			ScalarResources:  map[v1.ResourceName]int64(nil),
@@ -756,7 +740,6 @@ func TestNodeInfoRemovePod(t *testing.T) {
 		nonzeroRequest: &Resource{
 			MilliCPU:         200,
 			Memory:           1024,
-			NvidiaGPU:        0,
 			EphemeralStorage: 0,
 			AllowedPodNumber: 0,
 			ScalarResources:  map[v1.ResourceName]int64(nil),
@@ -48,13 +48,6 @@ func (self *ResourceList) Pods() *resource.Quantity {
 	return &resource.Quantity{}
 }

-func (self *ResourceList) NvidiaGPU() *resource.Quantity {
-	if val, ok := (*self)[ResourceNvidiaGPU]; ok {
-		return &val
-	}
-	return &resource.Quantity{}
-}
-
 func (self *ResourceList) StorageEphemeral() *resource.Quantity {
 	if val, ok := (*self)[ResourceEphemeralStorage]; ok {
 		return &val
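Callers of the removed `NvidiaGPU()` accessor now read the quantity by name, the same way any extended resource is read from a `ResourceList`. A small sketch, assuming the device-plugin resource name `nvidia.com/gpu`:

```go
package example

import (
	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// gpusInList reads a GPU quantity out of a ResourceList by resource name,
// returning a zero Quantity when the entry is absent.
// Example use: gpusInList(node.Status.Capacity, "nvidia.com/gpu").Value()
func gpusInList(rl v1.ResourceList, name v1.ResourceName) resource.Quantity {
	if val, ok := rl[name]; ok {
		return val
	}
	return resource.Quantity{}
}
```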
@@ -4076,8 +4076,6 @@ const (
 	// Local ephemeral storage, in bytes. (500Gi = 500GiB = 500 * 1024 * 1024 * 1024)
 	// The resource name for ResourceEphemeralStorage is alpha and it can change across releases.
 	ResourceEphemeralStorage ResourceName = "ephemeral-storage"
-	// NVIDIA GPU, in devices. Alpha, might change: although fractional and allowing values >1, only one whole device per node is assigned.
-	ResourceNvidiaGPU ResourceName = "alpha.kubernetes.io/nvidia-gpu"
 )

 const (
@@ -40,54 +40,11 @@ const (
 	driverInstallTimeout = 10 * time.Minute
 )

-type podCreationFuncType func() *v1.Pod
-
 var (
 	gpuResourceName v1.ResourceName
 	dsYamlUrl       string
-	podCreationFunc podCreationFuncType
 )

-func makeCudaAdditionTestPod() *v1.Pod {
-	podName := testPodNamePrefix + string(uuid.NewUUID())
-	testPod := &v1.Pod{
-		ObjectMeta: metav1.ObjectMeta{
-			Name: podName,
-		},
-		Spec: v1.PodSpec{
-			RestartPolicy: v1.RestartPolicyNever,
-			Containers: []v1.Container{
-				{
-					Name:  "vector-addition",
-					Image: imageutils.GetE2EImage(imageutils.CudaVectorAdd),
-					Resources: v1.ResourceRequirements{
-						Limits: v1.ResourceList{
-							gpuResourceName: *resource.NewQuantity(1, resource.DecimalSI),
-						},
-					},
-					VolumeMounts: []v1.VolumeMount{
-						{
-							Name:      "nvidia-libraries",
-							MountPath: "/usr/local/nvidia/lib64",
-						},
-					},
-				},
-			},
-			Volumes: []v1.Volume{
-				{
-					Name: "nvidia-libraries",
-					VolumeSource: v1.VolumeSource{
-						HostPath: &v1.HostPathVolumeSource{
-							Path: "/home/kubernetes/bin/nvidia/lib",
-						},
-					},
-				},
-			},
-		},
-	}
-	return testPod
-}
-
 func makeCudaAdditionDevicePluginTestPod() *v1.Pod {
 	podName := testPodNamePrefix + string(uuid.NewUUID())
 	testPod := &v1.Pod{
@@ -163,20 +120,13 @@ func SetupNVIDIAGPUNode(f *framework.Framework, setupResourceGatherer bool) *fra
 	}
 	framework.Logf("Cluster is running on COS. Proceeding with test")

-	if f.BaseName == "gpus" {
-		dsYamlUrl = "https://raw.githubusercontent.com/ContainerEngine/accelerators/master/cos-nvidia-gpu-installer/daemonset.yaml"
-		gpuResourceName = v1.ResourceNvidiaGPU
-		podCreationFunc = makeCudaAdditionTestPod
+	dsYamlUrlFromEnv := os.Getenv("NVIDIA_DRIVER_INSTALLER_DAEMONSET")
+	if dsYamlUrlFromEnv != "" {
+		dsYamlUrl = dsYamlUrlFromEnv
 	} else {
-		dsYamlUrlFromEnv := os.Getenv("NVIDIA_DRIVER_INSTALLER_DAEMONSET")
-		if dsYamlUrlFromEnv != "" {
-			dsYamlUrl = dsYamlUrlFromEnv
-		} else {
-			dsYamlUrl = "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/daemonset.yaml"
-		}
-		gpuResourceName = framework.NVIDIAGPUResourceName
-		podCreationFunc = makeCudaAdditionDevicePluginTestPod
+		dsYamlUrl = "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/daemonset.yaml"
 	}
+	gpuResourceName = framework.NVIDIAGPUResourceName

 	framework.Logf("Using %v", dsYamlUrl)
 	// Creates the DaemonSet that installs Nvidia Drivers.
@@ -218,7 +168,7 @@ func testNvidiaGPUsOnCOS(f *framework.Framework) {
 	framework.Logf("Creating as many pods as there are Nvidia GPUs and have the pods run a CUDA app")
 	podList := []*v1.Pod{}
 	for i := int64(0); i < getGPUsAvailable(f); i++ {
-		podList = append(podList, f.PodClient().Create(podCreationFunc()))
+		podList = append(podList, f.PodClient().Create(makeCudaAdditionDevicePluginTestPod()))
 	}
 	framework.Logf("Wait for all test pods to succeed")
 	// Wait for all pods to succeed
@@ -234,13 +184,6 @@ func testNvidiaGPUsOnCOS(f *framework.Framework) {
 	framework.ExpectNoError(err, "getting resource usage summary")
 }

-var _ = SIGDescribe("[Feature:GPU]", func() {
-	f := framework.NewDefaultFramework("gpus")
-	It("run Nvidia GPU tests on Container Optimized OS only", func() {
-		testNvidiaGPUsOnCOS(f)
-	})
-})
-
 var _ = SIGDescribe("[Feature:GPUDevicePlugin]", func() {
 	f := framework.NewDefaultFramework("device-plugin-gpus")
 	It("run Nvidia GPU Device Plugin tests on Container Optimized OS only", func() {
@@ -11,7 +11,6 @@ go_library(
        "docker_util.go",
        "framework.go",
        "gpu_device_plugin.go",
-       "gpus.go",
        "image_list.go",
        "simple_mount.go",
        "util.go",
@@ -17,6 +17,7 @@ limitations under the License.
 package e2e_node

 import (
+	"os/exec"
 	"strconv"
 	"time"

@@ -132,6 +133,16 @@ var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugi
 	})
 })

+func checkIfNvidiaGPUsExistOnNode() bool {
+	// Cannot use `lspci` because it is not installed on all distros by default.
+	err := exec.Command("/bin/sh", "-c", "find /sys/devices/pci* -type f | grep vendor | xargs cat | grep 0x10de").Run()
+	if err != nil {
+		framework.Logf("check for nvidia GPUs failed. Got Error: %v", err)
+		return false
+	}
+	return true
+}
+
 func logDevicePluginMetrics() {
 	ms, err := metrics.GrabKubeletMetricsWithoutProxy(framework.TestContext.NodeName + ":10255")
 	framework.ExpectNoError(err)
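The `0x10de` string in the helper added above is NVIDIA's PCI vendor ID, so the check succeeds only on nodes that actually carry NVIDIA hardware. The deleted test below used it as a skip guard; a hypothetical wrapper showing the same pattern for the device-plugin test:

```go
package e2e_node

import (
	. "github.com/onsi/ginkgo"
)

// skipWithoutNvidiaGPU is an illustrative guard: it skips the current spec
// unless the PCI scan in checkIfNvidiaGPUsExistOnNode finds an NVIDIA device.
func skipWithoutNvidiaGPU() {
	if !checkIfNvidiaGPUsExistOnNode() {
		Skip("Nvidia GPUs do not exist on the node. Skipping test.")
	}
}
```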
@@ -1,174 +0,0 @@
-/*
-Copyright 2017 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package e2e_node
-
-import (
-	"fmt"
-	"os/exec"
-	"time"
-
-	"k8s.io/api/core/v1"
-	"k8s.io/apimachinery/pkg/api/resource"
-	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-	"k8s.io/kubernetes/pkg/features"
-	"k8s.io/kubernetes/pkg/kubelet/apis/kubeletconfig"
-	"k8s.io/kubernetes/test/e2e/framework"
-
-	. "github.com/onsi/ginkgo"
-	. "github.com/onsi/gomega"
-)
-
-func getGPUsAvailable(f *framework.Framework) int64 {
-	nodeList, err := f.ClientSet.CoreV1().Nodes().List(metav1.ListOptions{})
-	framework.ExpectNoError(err, "getting node list")
-	var gpusAvailable int64
-	for _, node := range nodeList.Items {
-		gpusAvailable += node.Status.Capacity.NvidiaGPU().Value()
-	}
-	return gpusAvailable
-}
-
-func gpusExistOnAllNodes(f *framework.Framework) bool {
-	nodeList, err := f.ClientSet.CoreV1().Nodes().List(metav1.ListOptions{})
-	framework.ExpectNoError(err, "getting node list")
-	for _, node := range nodeList.Items {
-		if node.Name == "kubernetes-master" {
-			continue
-		}
-		if node.Status.Capacity.NvidiaGPU().Value() == 0 {
-			return false
-		}
-	}
-	return true
-}
-
-func checkIfNvidiaGPUsExistOnNode() bool {
-	// Cannot use `lspci` because it is not installed on all distros by default.
-	err := exec.Command("/bin/sh", "-c", "find /sys/devices/pci* -type f | grep vendor | xargs cat | grep 0x10de").Run()
-	if err != nil {
-		framework.Logf("check for nvidia GPUs failed. Got Error: %v", err)
-		return false
-	}
-	return true
-}
-
-// Serial because the test updates kubelet configuration.
-var _ = framework.KubeDescribe("GPU [Serial]", func() {
-	f := framework.NewDefaultFramework("gpu-test")
-	Context("attempt to use GPUs if available", func() {
-		It("setup the node and create pods to test gpus", func() {
-			By("ensuring that Nvidia GPUs exist on the node")
-			if !checkIfNvidiaGPUsExistOnNode() {
-				Skip("Nvidia GPUs do not exist on the node. Skipping test.")
-			}
-			By("ensuring that dynamic kubelet configuration is enabled")
-			enabled, err := isKubeletConfigEnabled(f)
-			framework.ExpectNoError(err)
-			if !enabled {
-				Skip("Dynamic Kubelet configuration is not enabled. Skipping test.")
-			}
-
-			By("enabling support for GPUs")
-			var oldCfg *kubeletconfig.KubeletConfiguration
-			defer func() {
-				if oldCfg != nil {
-					framework.ExpectNoError(setKubeletConfiguration(f, oldCfg))
-				}
-			}()
-
-			// Enable Accelerators
-			oldCfg, err = getCurrentKubeletConfig()
-			framework.ExpectNoError(err)
-			newCfg := oldCfg.DeepCopy()
-			newCfg.FeatureGates[string(features.Accelerators)] = true
-			framework.ExpectNoError(setKubeletConfiguration(f, newCfg))
-
-			By("Waiting for GPUs to become available on the local node")
-			Eventually(gpusExistOnAllNodes(f), 10*time.Minute, time.Second).Should(BeTrue())
-
-			By("Creating a pod that will consume all GPUs")
-			podSuccess := makePod(getGPUsAvailable(f), "gpus-success")
-			podSuccess = f.PodClient().CreateSync(podSuccess)
-
-			By("Checking the containers in the pod had restarted at-least twice successfully thereby ensuring GPUs are reused")
-			const minContainerRestartCount = 2
-			Eventually(func() bool {
-				p, err := f.ClientSet.CoreV1().Pods(f.Namespace.Name).Get(podSuccess.Name, metav1.GetOptions{})
-				if err != nil {
-					framework.Logf("failed to get pod status: %v", err)
-					return false
-				}
-				if p.Status.ContainerStatuses[0].RestartCount < minContainerRestartCount {
-					return false
-				}
-				return true
-			}, time.Minute, time.Second).Should(BeTrue())
-
-			By("Checking if the pod outputted Success to its logs")
-			framework.ExpectNoError(f.PodClient().MatchContainerOutput(podSuccess.Name, podSuccess.Name, "Success"))
-
-			By("Creating a new pod requesting a GPU and noticing that it is rejected by the Kubelet")
-			podFailure := makePod(1, "gpu-failure")
-			framework.WaitForPodCondition(f.ClientSet, f.Namespace.Name, podFailure.Name, "pod rejected", framework.PodStartTimeout, func(pod *v1.Pod) (bool, error) {
-				if pod.Status.Phase == v1.PodFailed {
-					return true, nil
-
-				}
-				return false, nil
-			})
-
-			By("stopping the original Pod with GPUs")
-			gp := int64(0)
-			deleteOptions := metav1.DeleteOptions{
-				GracePeriodSeconds: &gp,
-			}
-			f.PodClient().DeleteSync(podSuccess.Name, &deleteOptions, framework.DefaultPodDeletionTimeout)
-
-			By("attempting to start the failed pod again")
-			f.PodClient().DeleteSync(podFailure.Name, &deleteOptions, framework.DefaultPodDeletionTimeout)
-			podFailure = f.PodClient().CreateSync(podFailure)
-
-			By("Checking if the pod outputted Success to its logs")
-			framework.ExpectNoError(f.PodClient().MatchContainerOutput(podFailure.Name, podFailure.Name, "Success"))
-		})
-	})
-})
-
-func makePod(gpus int64, name string) *v1.Pod {
-	resources := v1.ResourceRequirements{
-		Limits: v1.ResourceList{
-			v1.ResourceNvidiaGPU: *resource.NewQuantity(gpus, resource.DecimalSI),
-		},
-	}
-	gpuverificationCmd := fmt.Sprintf("if [[ %d -ne $(ls /dev/ | egrep '^nvidia[0-9]+$' | wc -l) ]]; then exit 1; else echo Success; fi", gpus)
-	return &v1.Pod{
-		ObjectMeta: metav1.ObjectMeta{
-			Name: name,
-		},
-		Spec: v1.PodSpec{
-			RestartPolicy: v1.RestartPolicyAlways,
-			Containers: []v1.Container{
-				{
-					Image:     busyboxImage,
-					Name:      name,
-					Command:   []string{"sh", "-c", gpuverificationCmd},
-					Resources: resources,
-				},
-			},
-		},
-	}
-}