Merge pull request #61498 from mindprince/delete-in-tree-gpu

Automatic merge from submit-queue (batch tested with PRs 61498, 62030). If you want to cherry-pick this change to another branch, please follow the instructions [here](https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md).

Delete in-tree support for NVIDIA GPUs.

This removes the alpha Accelerators feature gate, which was deprecated in 1.10 (#57384).
Its replacement, the DevicePlugins feature, went beta in 1.10 (#60170).

Fixes #54012

```release-note
Support for "alpha.kubernetes.io/nvidia-gpu" resource which was deprecated in 1.10 is removed. Please use the resource exposed by DevicePlugins instead ("nvidia.com/gpu").
```
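
For reference, here is a minimal sketch (not part of this change) of how a workload would request GPUs through the device-plugin resource after this removal. It assumes the NVIDIA device plugin is already deployed on the node; the pod name and container image below are placeholders.

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func main() {
	// Request one GPU via the device-plugin resource "nvidia.com/gpu",
	// which replaces the removed "alpha.kubernetes.io/nvidia-gpu" resource.
	pod := &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: "cuda-example"}, // placeholder name
		Spec: v1.PodSpec{
			RestartPolicy: v1.RestartPolicyNever,
			Containers: []v1.Container{
				{
					Name:  "cuda-container",
					Image: "example.com/cuda-workload:latest", // placeholder image
					Resources: v1.ResourceRequirements{
						// Extended resources are specified in limits; requests
						// default to the same value.
						Limits: v1.ResourceList{
							"nvidia.com/gpu": resource.MustParse("1"),
						},
					},
				},
			},
		},
	}

	gpuLimit := pod.Spec.Containers[0].Resources.Limits["nvidia.com/gpu"]
	fmt.Printf("GPU limit: %s\n", gpuLimit.String())
}
```

Because DevicePlugins is beta and enabled by default in 1.10 (see the feature-gate diff below), no feature gate has to be set for this to work once the device-plugin DaemonSet is running.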
Authored by Kubernetes Submit Queue on 2018-04-03 02:02:04 -07:00; committed by GitHub.
commit 043204b1e5
38 changed files with 42 additions and 1315 deletions

View File

@ -173,7 +173,6 @@ pkg/kubelet/dockershim/cm
pkg/kubelet/dockershim/libdocker
pkg/kubelet/dockershim/testing
pkg/kubelet/events
pkg/kubelet/gpu
pkg/kubelet/images
pkg/kubelet/kuberuntime
pkg/kubelet/leaky

View File

@ -172,14 +172,11 @@ func IsNativeResource(name core.ResourceName) bool {
strings.Contains(string(name), core.ResourceDefaultNamespacePrefix)
}
var overcommitBlacklist = sets.NewString(string(core.ResourceNvidiaGPU))
// IsOvercommitAllowed returns true if the resource is in the default
// namespace and not blacklisted.
// namespace and is not hugepages.
func IsOvercommitAllowed(name core.ResourceName) bool {
return IsNativeResource(name) &&
!IsHugePageResourceName(name) &&
!overcommitBlacklist.Has(string(name))
!IsHugePageResourceName(name)
}
var standardLimitRangeTypes = sets.NewString(

View File

@ -387,10 +387,6 @@ func TestIsOvercommitAllowed(t *testing.T) {
name: core.ResourceMemory,
allowed: true,
},
{
name: core.ResourceNvidiaGPU,
allowed: false,
},
{
name: HugePageResourceName(resource.MustParse("2Mi")),
allowed: false,

View File

@ -47,13 +47,6 @@ func (self *ResourceList) Pods() *resource.Quantity {
return &resource.Quantity{}
}
func (self *ResourceList) NvidiaGPU() *resource.Quantity {
if val, ok := (*self)[ResourceNvidiaGPU]; ok {
return &val
}
return &resource.Quantity{}
}
func (self *ResourceList) StorageEphemeral() *resource.Quantity {
if val, ok := (*self)[ResourceEphemeralStorage]; ok {
return &val

View File

@ -3641,8 +3641,6 @@ const (
// Local ephemeral storage, in bytes. (500Gi = 500GiB = 500 * 1024 * 1024 * 1024)
// The resource name for ResourceEphemeralStorage is alpha and it can change across releases.
ResourceEphemeralStorage ResourceName = "ephemeral-storage"
// NVIDIA GPU, in devices. Alpha, might change: although fractional and allowing values >1, only one whole device per node is assigned.
ResourceNvidiaGPU ResourceName = "alpha.kubernetes.io/nvidia-gpu"
)
const (

View File

@ -29,7 +29,6 @@ go_library(
"//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/labels:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/selection:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/sets:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/validation:go_default_library",
],
)

View File

@ -25,7 +25,6 @@ import (
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/selection"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/validation"
"k8s.io/kubernetes/pkg/apis/core/helper"
)
@ -85,14 +84,11 @@ func HugePageSizeFromResourceName(name v1.ResourceName) (resource.Quantity, erro
return resource.ParseQuantity(pageSize)
}
var overcommitBlacklist = sets.NewString(string(v1.ResourceNvidiaGPU))
// IsOvercommitAllowed returns true if the resource is in the default
// namespace and not blacklisted and is not hugepages.
// namespace and is not hugepages.
func IsOvercommitAllowed(name v1.ResourceName) bool {
return IsNativeResource(name) &&
!IsHugePageResourceName(name) &&
!overcommitBlacklist.Has(string(name))
!IsHugePageResourceName(name)
}
// Extended and Hugepages resources

View File

@ -125,10 +125,6 @@ func TestIsOvercommitAllowed(t *testing.T) {
resourceName: "kubernetes.io/resource-foo",
expectVal: true,
},
{
resourceName: "alpha.kubernetes.io/nvidia-gpu",
expectVal: false,
},
{
resourceName: "hugepages-100m",
expectVal: false,

View File

@ -38,12 +38,6 @@ func TestGetPodQOS(t *testing.T) {
}),
expected: v1.PodQOSGuaranteed,
},
{
pod: newPod("guaranteed-with-gpu", []v1.Container{
newContainer("guaranteed", getResourceList("100m", "100Mi"), addResource("nvidia-gpu", "2", getResourceList("100m", "100Mi"))),
}),
expected: v1.PodQOSGuaranteed,
},
{
pod: newPod("guaranteed-guaranteed", []v1.Container{
newContainer("guaranteed", getResourceList("100m", "100Mi"), getResourceList("100m", "100Mi")),
@ -51,13 +45,6 @@ func TestGetPodQOS(t *testing.T) {
}),
expected: v1.PodQOSGuaranteed,
},
{
pod: newPod("guaranteed-guaranteed-with-gpu", []v1.Container{
newContainer("guaranteed", getResourceList("100m", "100Mi"), addResource("nvidia-gpu", "2", getResourceList("100m", "100Mi"))),
newContainer("guaranteed", getResourceList("100m", "100Mi"), getResourceList("100m", "100Mi")),
}),
expected: v1.PodQOSGuaranteed,
},
{
pod: newPod("best-effort-best-effort", []v1.Container{
newContainer("best-effort", getResourceList("", ""), getResourceList("", "")),
@ -71,29 +58,16 @@ func TestGetPodQOS(t *testing.T) {
}),
expected: v1.PodQOSBestEffort,
},
{
pod: newPod("best-effort-best-effort-with-gpu", []v1.Container{
newContainer("best-effort", getResourceList("", ""), addResource("nvidia-gpu", "2", getResourceList("", ""))),
newContainer("best-effort", getResourceList("", ""), getResourceList("", "")),
}),
expected: v1.PodQOSBestEffort,
},
{
pod: newPod("best-effort-with-gpu", []v1.Container{
newContainer("best-effort", getResourceList("", ""), addResource("nvidia-gpu", "2", getResourceList("", ""))),
}),
expected: v1.PodQOSBestEffort,
},
{
pod: newPod("best-effort-burstable", []v1.Container{
newContainer("best-effort", getResourceList("", ""), addResource("nvidia-gpu", "2", getResourceList("", ""))),
newContainer("best-effort", getResourceList("", ""), getResourceList("", "")),
newContainer("burstable", getResourceList("1", ""), getResourceList("2", "")),
}),
expected: v1.PodQOSBurstable,
},
{
pod: newPod("best-effort-guaranteed", []v1.Container{
newContainer("best-effort", getResourceList("", ""), addResource("nvidia-gpu", "2", getResourceList("", ""))),
newContainer("best-effort", getResourceList("", ""), getResourceList("", "")),
newContainer("guaranteed", getResourceList("10m", "100Mi"), getResourceList("10m", "100Mi")),
}),
expected: v1.PodQOSBurstable,
@ -132,7 +106,7 @@ func TestGetPodQOS(t *testing.T) {
},
{
pod: newPod("burstable-2", []v1.Container{
newContainer("burstable", getResourceList("0", "0"), addResource("nvidia-gpu", "2", getResourceList("100m", "200Mi"))),
newContainer("burstable", getResourceList("0", "0"), getResourceList("100m", "200Mi")),
}),
expected: v1.PodQOSBurstable,
},

View File

@ -61,8 +61,6 @@ func ValidateResourceRequirements(requirements *v1.ResourceRequirements, fldPath
} else if quantity.Cmp(limitQuantity) > 0 {
allErrs = append(allErrs, field.Invalid(reqPath, quantity.String(), fmt.Sprintf("must be less than or equal to %s limit", resourceName)))
}
} else if resourceName == v1.ResourceNvidiaGPU {
allErrs = append(allErrs, field.Invalid(reqPath, quantity.String(), fmt.Sprintf("must be equal to %s request", v1.ResourceNvidiaGPU)))
}
}

View File

@ -32,36 +32,15 @@ func TestValidateResourceRequirements(t *testing.T) {
requirements v1.ResourceRequirements
}{
{
Name: "GPU only setting Limits",
requirements: v1.ResourceRequirements{
Limits: v1.ResourceList{
v1.ResourceName(v1.ResourceNvidiaGPU): resource.MustParse("10"),
},
},
},
{
Name: "GPU setting Limits equals Requests",
requirements: v1.ResourceRequirements{
Limits: v1.ResourceList{
v1.ResourceName(v1.ResourceNvidiaGPU): resource.MustParse("10"),
},
Requests: v1.ResourceList{
v1.ResourceName(v1.ResourceNvidiaGPU): resource.MustParse("10"),
},
},
},
{
Name: "Resources with GPU with Requests",
Name: "Resources with Requests equal to Limits",
requirements: v1.ResourceRequirements{
Requests: v1.ResourceList{
v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"),
v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"),
v1.ResourceName(v1.ResourceNvidiaGPU): resource.MustParse("1"),
},
Limits: v1.ResourceList{
v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"),
v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"),
v1.ResourceName(v1.ResourceNvidiaGPU): resource.MustParse("1"),
},
},
},
@ -111,36 +90,6 @@ func TestValidateResourceRequirements(t *testing.T) {
Name string
requirements v1.ResourceRequirements
}{
{
Name: "GPU only setting Requests",
requirements: v1.ResourceRequirements{
Requests: v1.ResourceList{
v1.ResourceName(v1.ResourceNvidiaGPU): resource.MustParse("10"),
},
},
},
{
Name: "GPU setting Limits less than Requests",
requirements: v1.ResourceRequirements{
Limits: v1.ResourceList{
v1.ResourceName(v1.ResourceNvidiaGPU): resource.MustParse("10"),
},
Requests: v1.ResourceList{
v1.ResourceName(v1.ResourceNvidiaGPU): resource.MustParse("11"),
},
},
},
{
Name: "GPU setting Limits larger than Requests",
requirements: v1.ResourceRequirements{
Limits: v1.ResourceList{
v1.ResourceName(v1.ResourceNvidiaGPU): resource.MustParse("10"),
},
Requests: v1.ResourceList{
v1.ResourceName(v1.ResourceNvidiaGPU): resource.MustParse("9"),
},
},
},
{
Name: "Resources with Requests Larger Than Limits",
requirements: v1.ResourceRequirements{

View File

@ -5042,25 +5042,7 @@ func TestValidateContainers(t *testing.T) {
TerminationMessagePolicy: "File",
},
{
Name: "resources-test-with-gpu-with-request",
Image: "image",
Resources: core.ResourceRequirements{
Requests: core.ResourceList{
core.ResourceName(core.ResourceCPU): resource.MustParse("10"),
core.ResourceName(core.ResourceMemory): resource.MustParse("10G"),
core.ResourceName(core.ResourceNvidiaGPU): resource.MustParse("1"),
},
Limits: core.ResourceList{
core.ResourceName(core.ResourceCPU): resource.MustParse("10"),
core.ResourceName(core.ResourceMemory): resource.MustParse("10G"),
core.ResourceName(core.ResourceNvidiaGPU): resource.MustParse("1"),
},
},
ImagePullPolicy: "IfNotPresent",
TerminationMessagePolicy: "File",
},
{
Name: "resources-test-with-gpu-without-request",
Name: "resources-test-with-request-and-limit",
Image: "image",
Resources: core.ResourceRequirements{
Requests: core.ResourceList{
@ -5070,7 +5052,6 @@ func TestValidateContainers(t *testing.T) {
Limits: core.ResourceList{
core.ResourceName(core.ResourceCPU): resource.MustParse("10"),
core.ResourceName(core.ResourceMemory): resource.MustParse("10G"),
core.ResourceName(core.ResourceNvidiaGPU): resource.MustParse("1"),
},
},
ImagePullPolicy: "IfNotPresent",
@ -5359,41 +5340,6 @@ func TestValidateContainers(t *testing.T) {
TerminationMessagePolicy: "File",
},
},
"Resource GPU limit must match request": {
{
Name: "gpu-resource-request-limit",
Image: "image",
Resources: core.ResourceRequirements{
Requests: core.ResourceList{
core.ResourceName(core.ResourceCPU): resource.MustParse("10"),
core.ResourceName(core.ResourceMemory): resource.MustParse("10G"),
core.ResourceName(core.ResourceNvidiaGPU): resource.MustParse("0"),
},
Limits: core.ResourceList{
core.ResourceName(core.ResourceCPU): resource.MustParse("10"),
core.ResourceName(core.ResourceMemory): resource.MustParse("10G"),
core.ResourceName(core.ResourceNvidiaGPU): resource.MustParse("1"),
},
},
TerminationMessagePolicy: "File",
ImagePullPolicy: "IfNotPresent",
},
},
"Resource GPU invalid setting only request": {
{
Name: "gpu-resource-request-limit",
Image: "image",
Resources: core.ResourceRequirements{
Requests: core.ResourceList{
core.ResourceName(core.ResourceCPU): resource.MustParse("10"),
core.ResourceName(core.ResourceMemory): resource.MustParse("10G"),
core.ResourceName(core.ResourceNvidiaGPU): resource.MustParse("1"),
},
},
TerminationMessagePolicy: "File",
ImagePullPolicy: "IfNotPresent",
},
},
"Request limit simple invalid": {
{
Name: "abc-123",

View File

@ -53,16 +53,6 @@ const (
// Note: This feature is not supported for `BestEffort` pods.
ExperimentalCriticalPodAnnotation utilfeature.Feature = "ExperimentalCriticalPodAnnotation"
// owner: @vishh
// alpha: v1.6
//
// This is deprecated and will be removed in v1.11. Use DevicePlugins instead.
//
// Enables support for GPUs as a schedulable resource.
// Only Nvidia GPUs are supported as of v1.6.
// Works only with Docker Container Runtime.
Accelerators utilfeature.Feature = "Accelerators"
// owner: @jiayingz
// beta: v1.10
//
@ -296,7 +286,6 @@ var defaultKubernetesFeatureGates = map[utilfeature.Feature]utilfeature.FeatureS
DynamicKubeletConfig: {Default: false, PreRelease: utilfeature.Alpha},
ExperimentalHostUserNamespaceDefaultingGate: {Default: false, PreRelease: utilfeature.Beta},
ExperimentalCriticalPodAnnotation: {Default: false, PreRelease: utilfeature.Alpha},
Accelerators: {Default: false, PreRelease: utilfeature.Alpha},
DevicePlugins: {Default: true, PreRelease: utilfeature.Beta},
TaintBasedEvictions: {Default: false, PreRelease: utilfeature.Alpha},
RotateKubeletServerCertificate: {Default: false, PreRelease: utilfeature.Alpha},

View File

@ -55,8 +55,6 @@ go_library(
"//pkg/kubelet/envvars:go_default_library",
"//pkg/kubelet/events:go_default_library",
"//pkg/kubelet/eviction:go_default_library",
"//pkg/kubelet/gpu:go_default_library",
"//pkg/kubelet/gpu/nvidia:go_default_library",
"//pkg/kubelet/images:go_default_library",
"//pkg/kubelet/kubeletconfig:go_default_library",
"//pkg/kubelet/kuberuntime:go_default_library",
@ -179,7 +177,6 @@ go_test(
"//pkg/kubelet/container:go_default_library",
"//pkg/kubelet/container/testing:go_default_library",
"//pkg/kubelet/eviction:go_default_library",
"//pkg/kubelet/gpu:go_default_library",
"//pkg/kubelet/images:go_default_library",
"//pkg/kubelet/lifecycle:go_default_library",
"//pkg/kubelet/logs:go_default_library",
@ -264,7 +261,6 @@ filegroup(
"//pkg/kubelet/envvars:all-srcs",
"//pkg/kubelet/events:all-srcs",
"//pkg/kubelet/eviction:all-srcs",
"//pkg/kubelet/gpu:all-srcs",
"//pkg/kubelet/images:all-srcs",
"//pkg/kubelet/kubeletconfig:all-srcs",
"//pkg/kubelet/kuberuntime:all-srcs",

View File

@ -1,32 +0,0 @@
package(default_visibility = ["//visibility:public"])
load(
"@io_bazel_rules_go//go:def.bzl",
"go_library",
)
go_library(
name = "go_default_library",
srcs = [
"gpu_manager_stub.go",
"types.go",
],
importpath = "k8s.io/kubernetes/pkg/kubelet/gpu",
deps = ["//vendor/k8s.io/api/core/v1:go_default_library"],
)
filegroup(
name = "package-srcs",
srcs = glob(["**"]),
tags = ["automanaged"],
visibility = ["//visibility:private"],
)
filegroup(
name = "all-srcs",
srcs = [
":package-srcs",
"//pkg/kubelet/gpu/nvidia:all-srcs",
],
tags = ["automanaged"],
)

View File

@ -1,12 +0,0 @@
approvers:
- dchen1107
- derekwaynecarr
- vishh
- yujuhong
reviewers:
- cmluciano
- jiayingz
- mindprince
- RenaudWasTaken
- vishh
- sig-node-reviewers

View File

@ -1,41 +0,0 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package gpu
import (
"fmt"
"k8s.io/api/core/v1"
)
type gpuManagerStub struct{}
func (gms *gpuManagerStub) Start() error {
return nil
}
func (gms *gpuManagerStub) Capacity() v1.ResourceList {
return nil
}
func (gms *gpuManagerStub) AllocateGPU(_ *v1.Pod, _ *v1.Container) ([]string, error) {
return nil, fmt.Errorf("GPUs are not supported")
}
func NewGPUManagerStub() GPUManager {
return &gpuManagerStub{}
}

View File

@ -1,54 +0,0 @@
package(default_visibility = ["//visibility:public"])
load(
"@io_bazel_rules_go//go:def.bzl",
"go_library",
"go_test",
)
go_library(
name = "go_default_library",
srcs = [
"helpers.go",
"nvidia_gpu_manager.go",
],
importpath = "k8s.io/kubernetes/pkg/kubelet/gpu/nvidia",
deps = [
"//pkg/kubelet/dockershim:go_default_library",
"//pkg/kubelet/dockershim/libdocker:go_default_library",
"//pkg/kubelet/gpu:go_default_library",
"//vendor/github.com/golang/glog:go_default_library",
"//vendor/k8s.io/api/core/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/sets:go_default_library",
],
)
filegroup(
name = "package-srcs",
srcs = glob(["**"]),
tags = ["automanaged"],
visibility = ["//visibility:private"],
)
filegroup(
name = "all-srcs",
srcs = [":package-srcs"],
tags = ["automanaged"],
)
go_test(
name = "go_default_test",
srcs = ["nvidia_gpu_manager_test.go"],
embed = [":go_default_library"],
deps = [
"//pkg/kubelet/dockershim:go_default_library",
"//pkg/kubelet/dockershim/libdocker:go_default_library",
"//vendor/github.com/stretchr/testify/assert:go_default_library",
"//vendor/k8s.io/api/core/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/sets:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/uuid:go_default_library",
],
)

View File

@ -1,77 +0,0 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package nvidia
import "k8s.io/apimachinery/pkg/util/sets"
type containerToGPU map[string]sets.String
// podGPUs represents a list of pod to GPU mappings.
type podGPUs struct {
podGPUMapping map[string]containerToGPU
}
func newPodGPUs() *podGPUs {
return &podGPUs{
podGPUMapping: make(map[string]containerToGPU),
}
}
func (pgpu *podGPUs) pods() sets.String {
ret := sets.NewString()
for k := range pgpu.podGPUMapping {
ret.Insert(k)
}
return ret
}
func (pgpu *podGPUs) insert(podUID, contName string, device string) {
if _, exists := pgpu.podGPUMapping[podUID]; !exists {
pgpu.podGPUMapping[podUID] = make(containerToGPU)
}
if _, exists := pgpu.podGPUMapping[podUID][contName]; !exists {
pgpu.podGPUMapping[podUID][contName] = sets.NewString()
}
pgpu.podGPUMapping[podUID][contName].Insert(device)
}
func (pgpu *podGPUs) getGPUs(podUID, contName string) sets.String {
containers, exists := pgpu.podGPUMapping[podUID]
if !exists {
return nil
}
devices, exists := containers[contName]
if !exists {
return nil
}
return devices
}
func (pgpu *podGPUs) delete(pods []string) {
for _, uid := range pods {
delete(pgpu.podGPUMapping, uid)
}
}
func (pgpu *podGPUs) devices() sets.String {
ret := sets.NewString()
for _, containerToGPU := range pgpu.podGPUMapping {
for _, deviceSet := range containerToGPU {
ret = ret.Union(deviceSet)
}
}
return ret
}

View File

@ -1,280 +0,0 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package nvidia
import (
"fmt"
"io/ioutil"
"os"
"path"
"regexp"
"strings"
"sync"
"github.com/golang/glog"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/kubernetes/pkg/kubelet/dockershim"
"k8s.io/kubernetes/pkg/kubelet/dockershim/libdocker"
"k8s.io/kubernetes/pkg/kubelet/gpu"
)
// TODO: rework to use Nvidia's NVML, which is more complex, but also provides more fine-grained information and stats.
const (
// All NVIDIA GPUs cards should be mounted with nvidiactl and nvidia-uvm
// If the driver installed correctly, the 2 devices will be there.
nvidiaCtlDevice string = "/dev/nvidiactl"
nvidiaUVMDevice string = "/dev/nvidia-uvm"
// Optional device.
nvidiaUVMToolsDevice string = "/dev/nvidia-uvm-tools"
devDirectory = "/dev"
)
var (
nvidiaDeviceRE = regexp.MustCompile(`^nvidia[0-9]*$`)
nvidiaFullpathRE = regexp.MustCompile(`^/dev/nvidia[0-9]*$`)
)
type activePodsLister interface {
// Returns a list of active pods on the node.
GetActivePods() []*v1.Pod
}
// nvidiaGPUManager manages nvidia gpu devices.
type nvidiaGPUManager struct {
sync.Mutex
// All gpus available on the Node
allGPUs sets.String
allocated *podGPUs
defaultDevices []string
// The interface which could get GPU mapping from all the containers.
// TODO: Should make this independent of Docker in the future.
dockerClient libdocker.Interface
activePodsLister activePodsLister
}
// NewNvidiaGPUManager returns a GPUManager that manages local Nvidia GPUs.
// TODO: Migrate to use pod level cgroups and make it generic to all runtimes.
func NewNvidiaGPUManager(activePodsLister activePodsLister, config *dockershim.ClientConfig) (gpu.GPUManager, error) {
dockerClient := dockershim.NewDockerClientFromConfig(config)
if dockerClient == nil {
return nil, fmt.Errorf("invalid docker client configure specified")
}
return &nvidiaGPUManager{
allGPUs: sets.NewString(),
dockerClient: dockerClient,
activePodsLister: activePodsLister,
}, nil
}
// Initialize the GPU devices, so far only needed to discover the GPU paths.
func (ngm *nvidiaGPUManager) Start() error {
if ngm.dockerClient == nil {
return fmt.Errorf("Invalid docker client specified in GPU Manager")
}
ngm.Lock()
defer ngm.Unlock()
if _, err := os.Stat(nvidiaCtlDevice); err != nil {
return err
}
if _, err := os.Stat(nvidiaUVMDevice); err != nil {
return err
}
ngm.defaultDevices = []string{nvidiaCtlDevice, nvidiaUVMDevice}
_, err := os.Stat(nvidiaUVMToolsDevice)
if !os.IsNotExist(err) {
ngm.defaultDevices = append(ngm.defaultDevices, nvidiaUVMToolsDevice)
}
if err := ngm.discoverGPUs(); err != nil {
return err
}
// We ignore errors when identifying allocated GPUs because it is possible that the runtime interfaces may be not be logically up.
return nil
}
// Get how many GPU cards we have.
func (ngm *nvidiaGPUManager) Capacity() v1.ResourceList {
gpus := resource.NewQuantity(int64(len(ngm.allGPUs)), resource.DecimalSI)
return v1.ResourceList{
v1.ResourceNvidiaGPU: *gpus,
}
}
// AllocateGPUs returns `num` GPUs if available, error otherwise.
// Allocation is made thread safe using the following logic.
// A list of all GPUs allocated is maintained along with their respective Pod UIDs.
// It is expected that the list of active pods will not return any false positives.
// As part of initialization or allocation, the list of GPUs in use will be computed once.
// Whenever an allocation happens, the list of GPUs allocated is updated based on the list of currently active pods.
// GPUs allocated to terminated pods are freed up lazily as part of allocation.
// GPUs are allocated based on the internal list of allocatedGPUs.
// It is not safe to generate a list of GPUs in use by inspecting active containers because of the delay between GPU allocation and container creation.
// A GPU allocated to a container might be re-allocated to a subsequent container because the original container wasn't started quick enough.
// The current algorithm scans containers only once and then uses a list of active pods to track GPU usage.
// This is a sub-optimal solution and a better alternative would be that of using pod level cgroups instead.
// GPUs allocated to containers should be reflected in pod level device cgroups before completing allocations.
// The pod level cgroups will then serve as a checkpoint of GPUs in use.
func (ngm *nvidiaGPUManager) AllocateGPU(pod *v1.Pod, container *v1.Container) ([]string, error) {
gpusNeeded := container.Resources.Limits.NvidiaGPU().Value()
if gpusNeeded == 0 {
return []string{}, nil
}
ngm.Lock()
defer ngm.Unlock()
if ngm.allocated == nil {
// Initialization is not complete. Try now. Failures can no longer be tolerated.
ngm.allocated = ngm.gpusInUse()
} else {
// update internal list of GPUs in use prior to allocating new GPUs.
ngm.updateAllocatedGPUs()
}
// Check if GPUs have already been allocated. If so return them right away.
// This can happen if a container restarts for example.
if devices := ngm.allocated.getGPUs(string(pod.UID), container.Name); devices != nil {
glog.V(2).Infof("Found pre-allocated GPUs for container %q in Pod %q: %v", container.Name, pod.UID, devices.List())
return append(devices.List(), ngm.defaultDevices...), nil
}
// Get GPU devices in use.
devicesInUse := ngm.allocated.devices()
glog.V(5).Infof("gpus in use: %v", devicesInUse.List())
// Get a list of available GPUs.
available := ngm.allGPUs.Difference(devicesInUse)
glog.V(5).Infof("gpus available: %v", available.List())
if int64(available.Len()) < gpusNeeded {
return nil, fmt.Errorf("requested number of GPUs unavailable. Requested: %d, Available: %d", gpusNeeded, available.Len())
}
ret := available.UnsortedList()[:gpusNeeded]
for _, device := range ret {
// Update internal allocated GPU cache.
ngm.allocated.insert(string(pod.UID), container.Name, device)
}
// Add standard devices files that needs to be exposed.
ret = append(ret, ngm.defaultDevices...)
return ret, nil
}
// updateAllocatedGPUs updates the list of GPUs in use.
// It gets a list of active pods and then frees any GPUs that are bound to terminated pods.
// Returns error on failure.
func (ngm *nvidiaGPUManager) updateAllocatedGPUs() {
activePods := ngm.activePodsLister.GetActivePods()
activePodUids := sets.NewString()
for _, pod := range activePods {
activePodUids.Insert(string(pod.UID))
}
allocatedPodUids := ngm.allocated.pods()
podsToBeRemoved := allocatedPodUids.Difference(activePodUids)
glog.V(5).Infof("pods to be removed: %v", podsToBeRemoved.List())
ngm.allocated.delete(podsToBeRemoved.List())
}
// discoverGPUs identifies allGPUs NVIDIA GPU devices available on the local node by walking `/dev` directory.
// TODO: Without NVML support we only can check whether there has GPU devices, but
// could not give a health check or get more information like GPU cores, memory, or
// family name. Need to support NVML in the future. But we do not need NVML until
// we want more features, features like schedule containers according to GPU family
// name.
func (ngm *nvidiaGPUManager) discoverGPUs() error {
files, err := ioutil.ReadDir(devDirectory)
if err != nil {
return err
}
for _, f := range files {
if f.IsDir() {
continue
}
if nvidiaDeviceRE.MatchString(f.Name()) {
glog.V(2).Infof("Found Nvidia GPU %q", f.Name())
ngm.allGPUs.Insert(path.Join(devDirectory, f.Name()))
}
}
return nil
}
// gpusInUse returns a list of GPUs in use along with the respective pods that are using it.
func (ngm *nvidiaGPUManager) gpusInUse() *podGPUs {
pods := ngm.activePodsLister.GetActivePods()
type containerIdentifier struct {
id string
name string
}
type podContainers struct {
uid string
containers []containerIdentifier
}
// List of containers to inspect.
podContainersToInspect := []podContainers{}
for _, pod := range pods {
containers := sets.NewString()
for _, container := range pod.Spec.Containers {
// GPUs are expected to be specified only in limits.
if !container.Resources.Limits.NvidiaGPU().IsZero() {
containers.Insert(container.Name)
}
}
// If no GPUs were requested skip this pod.
if containers.Len() == 0 {
continue
}
// TODO: If kubelet restarts right after allocating a GPU to a pod, the container might not have started yet and so container status might not be available yet.
// Use an internal checkpoint instead or try using the CRI if its checkpoint is reliable.
var containersToInspect []containerIdentifier
for _, container := range pod.Status.ContainerStatuses {
if containers.Has(container.Name) {
containersToInspect = append(containersToInspect, containerIdentifier{strings.Replace(container.ContainerID, "docker://", "", 1), container.Name})
}
}
// add the pod and its containers that need to be inspected.
podContainersToInspect = append(podContainersToInspect, podContainers{string(pod.UID), containersToInspect})
}
ret := newPodGPUs()
for _, podContainer := range podContainersToInspect {
for _, containerIdentifier := range podContainer.containers {
containerJSON, err := ngm.dockerClient.InspectContainer(containerIdentifier.id)
if err != nil {
glog.V(3).Infof("Failed to inspect container %q in pod %q while attempting to reconcile nvidia gpus in use", containerIdentifier.id, podContainer.uid)
continue
}
devices := containerJSON.HostConfig.Devices
if devices == nil {
continue
}
for _, device := range devices {
if isValidPath(device.PathOnHost) {
glog.V(4).Infof("Nvidia GPU %q is in use by Docker Container: %q", device.PathOnHost, containerJSON.ID)
ret.insert(podContainer.uid, containerIdentifier.name, device.PathOnHost)
}
}
}
}
return ret
}
func isValidPath(path string) bool {
return nvidiaFullpathRE.MatchString(path)
}

View File

@ -1,213 +0,0 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package nvidia
import (
"os"
"reflect"
"testing"
"github.com/stretchr/testify/assert"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/uuid"
"k8s.io/kubernetes/pkg/kubelet/dockershim"
"k8s.io/kubernetes/pkg/kubelet/dockershim/libdocker"
)
type testActivePodsLister struct {
activePods []*v1.Pod
}
func (tapl *testActivePodsLister) GetActivePods() []*v1.Pod {
return tapl.activePods
}
func makeTestPod(numContainers, gpusPerContainer int) *v1.Pod {
quantity := resource.NewQuantity(int64(gpusPerContainer), resource.DecimalSI)
resources := v1.ResourceRequirements{
Limits: v1.ResourceList{
v1.ResourceNvidiaGPU: *quantity,
},
}
pod := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
UID: uuid.NewUUID(),
},
Spec: v1.PodSpec{
Containers: []v1.Container{},
},
}
for ; numContainers > 0; numContainers-- {
pod.Spec.Containers = append(pod.Spec.Containers, v1.Container{
Name: string(uuid.NewUUID()),
Resources: resources,
})
}
return pod
}
func TestNewNvidiaGPUManager(t *testing.T) {
podLister := &testActivePodsLister{}
// Expects nil GPUManager and an error with nil dockerClient.
testGpuManager1, err := NewNvidiaGPUManager(podLister, nil)
as := assert.New(t)
as.Nil(testGpuManager1)
as.NotNil(err)
// Expects a GPUManager to be created with non-nil dockerClient.
testGpuManager2, err := NewNvidiaGPUManager(podLister, &dockershim.ClientConfig{
DockerEndpoint: libdocker.FakeDockerEndpoint,
})
as.NotNil(testGpuManager2)
as.Nil(err)
// Expects zero capacity without any GPUs.
gpuCapacity := testGpuManager2.Capacity()
as.Equal(len(gpuCapacity), 1)
rgpu := gpuCapacity[v1.ResourceNvidiaGPU]
as.Equal(rgpu.Value(), int64(0))
err2 := testGpuManager2.Start()
if !os.IsNotExist(err2) {
gpus := reflect.ValueOf(testGpuManager2).Elem().FieldByName("allGPUs").Len()
as.NotZero(gpus)
}
}
func TestMultiContainerPodGPUAllocation(t *testing.T) {
podLister := &testActivePodsLister{}
testGpuManager := &nvidiaGPUManager{
activePodsLister: podLister,
allGPUs: sets.NewString("/dev/nvidia0", "/dev/nvidia1"),
allocated: newPodGPUs(),
}
// Expect that no devices are in use.
gpusInUse := testGpuManager.gpusInUse()
as := assert.New(t)
as.Equal(len(gpusInUse.devices()), 0)
// Allocated GPUs for a pod with two containers.
pod := makeTestPod(2, 1)
// Allocate for the first container.
devices1, err := testGpuManager.AllocateGPU(pod, &pod.Spec.Containers[0])
as.Nil(err)
as.Equal(len(devices1), 1)
podLister.activePods = append(podLister.activePods, pod)
// Allocate for the second container.
devices2, err := testGpuManager.AllocateGPU(pod, &pod.Spec.Containers[1])
as.Nil(err)
as.Equal(len(devices2), 1)
as.NotEqual(devices1, devices2, "expected containers to get different devices")
// further allocations should fail.
newPod := makeTestPod(2, 1)
devices1, err = testGpuManager.AllocateGPU(newPod, &newPod.Spec.Containers[0])
as.NotNil(err, "expected gpu allocation to fail. got: %v", devices1)
// Now terminate the original pod and observe that GPU allocation for new pod succeeds.
podLister.activePods = podLister.activePods[:0]
devices1, err = testGpuManager.AllocateGPU(newPod, &newPod.Spec.Containers[0])
as.Nil(err)
as.Equal(len(devices1), 1)
podLister.activePods = append(podLister.activePods, newPod)
devices2, err = testGpuManager.AllocateGPU(newPod, &newPod.Spec.Containers[1])
as.Nil(err)
as.Equal(len(devices2), 1)
as.NotEqual(devices1, devices2, "expected containers to get different devices")
}
func TestMultiPodGPUAllocation(t *testing.T) {
podLister := &testActivePodsLister{}
testGpuManager := &nvidiaGPUManager{
activePodsLister: podLister,
allGPUs: sets.NewString("/dev/nvidia0", "/dev/nvidia1"),
allocated: newPodGPUs(),
}
// Expect that no devices are in use.
gpusInUse := testGpuManager.gpusInUse()
as := assert.New(t)
as.Equal(len(gpusInUse.devices()), 0)
// Allocated GPUs for a pod with two containers.
podA := makeTestPod(1, 1)
// Allocate for the first container.
devicesA, err := testGpuManager.AllocateGPU(podA, &podA.Spec.Containers[0])
as.Nil(err)
as.Equal(len(devicesA), 1)
podLister.activePods = append(podLister.activePods, podA)
// further allocations should fail.
podB := makeTestPod(1, 1)
// Allocate for the first container.
devicesB, err := testGpuManager.AllocateGPU(podB, &podB.Spec.Containers[0])
as.Nil(err)
as.Equal(len(devicesB), 1)
as.NotEqual(devicesA, devicesB, "expected pods to get different devices")
}
func TestPodContainerRestart(t *testing.T) {
podLister := &testActivePodsLister{}
testGpuManager := &nvidiaGPUManager{
activePodsLister: podLister,
allGPUs: sets.NewString("/dev/nvidia0", "/dev/nvidia1"),
allocated: newPodGPUs(),
defaultDevices: []string{"/dev/nvidia-smi"},
}
// Expect that no devices are in use.
gpusInUse := testGpuManager.gpusInUse()
as := assert.New(t)
as.Equal(len(gpusInUse.devices()), 0)
// Make a pod with one containers that requests two GPUs.
podA := makeTestPod(1, 2)
// Allocate GPUs
devicesA, err := testGpuManager.AllocateGPU(podA, &podA.Spec.Containers[0])
as.Nil(err)
as.Equal(len(devicesA), 3)
podLister.activePods = append(podLister.activePods, podA)
// further allocations should fail.
podB := makeTestPod(1, 1)
_, err = testGpuManager.AllocateGPU(podB, &podB.Spec.Containers[0])
as.NotNil(err)
// Allcate GPU for existing Pod A.
// The same gpus must be returned.
devicesAretry, err := testGpuManager.AllocateGPU(podA, &podA.Spec.Containers[0])
as.Nil(err)
as.Equal(len(devicesA), 3)
as.True(sets.NewString(devicesA...).Equal(sets.NewString(devicesAretry...)))
}

View File

@ -1,32 +0,0 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package gpu
import "k8s.io/api/core/v1"
// GPUManager manages GPUs on a local node.
// Implementations are expected to be thread safe.
type GPUManager interface {
// Start logically initializes GPUManager
Start() error
// Capacity returns the total number of GPUs on the node.
Capacity() v1.ResourceList
// AllocateGPU attempts to allocate GPUs for input container.
// Returns paths to allocated GPUs and nil on success.
// Returns an error on failure.
AllocateGPU(*v1.Pod, *v1.Container) ([]string, error)
}

View File

@ -69,8 +69,6 @@ import (
dockerremote "k8s.io/kubernetes/pkg/kubelet/dockershim/remote"
"k8s.io/kubernetes/pkg/kubelet/events"
"k8s.io/kubernetes/pkg/kubelet/eviction"
"k8s.io/kubernetes/pkg/kubelet/gpu"
"k8s.io/kubernetes/pkg/kubelet/gpu/nvidia"
"k8s.io/kubernetes/pkg/kubelet/images"
"k8s.io/kubernetes/pkg/kubelet/kubeletconfig"
"k8s.io/kubernetes/pkg/kubelet/kuberuntime"
@ -866,20 +864,6 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration,
klet.appArmorValidator = apparmor.NewValidator(containerRuntime)
klet.softAdmitHandlers.AddPodAdmitHandler(lifecycle.NewAppArmorAdmitHandler(klet.appArmorValidator))
klet.softAdmitHandlers.AddPodAdmitHandler(lifecycle.NewNoNewPrivsAdmitHandler(klet.containerRuntime))
if utilfeature.DefaultFeatureGate.Enabled(features.Accelerators) {
if containerRuntime == kubetypes.DockerContainerRuntime {
glog.Warningln("Accelerators feature is deprecated and will be removed in v1.11. Please use device plugins instead. They can be enabled using the DevicePlugins feature gate.")
if klet.gpuManager, err = nvidia.NewNvidiaGPUManager(klet, kubeDeps.DockerClientConfig); err != nil {
return nil, err
}
} else {
glog.Errorf("Accelerators feature is supported with docker runtime only. Disabling this feature internally.")
}
}
// Set GPU manager to a stub implementation if it is not enabled or cannot be supported.
if klet.gpuManager == nil {
klet.gpuManager = gpu.NewGPUManagerStub()
}
// Finally, put the most recent version of the config on the Kubelet, so
// people can see how it was configured.
klet.kubeletConfiguration = *kubeCfg
@ -1152,9 +1136,6 @@ type Kubelet struct {
// experimental behavior is desired.
experimentalHostUserNamespaceDefaulting bool
// GPU Manager
gpuManager gpu.GPUManager
// dockerLegacyService contains some legacy methods for backward compatibility.
// It should be set only when docker is using non json-file logging driver.
dockerLegacyService dockershim.DockerLegacyService
@ -1292,11 +1273,6 @@ func (kl *Kubelet) initializeModules() error {
return fmt.Errorf("Failed to start OOM watcher %v", err)
}
// Initialize GPUs
if err := kl.gpuManager.Start(); err != nil {
glog.Errorf("Failed to start gpuManager %v", err)
}
// Start resource analyzer
kl.resourceAnalyzer.Start()

View File

@ -540,14 +540,6 @@ func (kl *Kubelet) setNodeStatusMachineInfo(node *v1.Node) {
node.Status.Capacity = v1.ResourceList{}
}
// populate GPU capacity.
gpuCapacity := kl.gpuManager.Capacity()
if gpuCapacity != nil {
for k, v := range gpuCapacity {
node.Status.Capacity[k] = v
}
}
var devicePluginAllocatable v1.ResourceList
var devicePluginCapacity v1.ResourceList
var removedDevicePlugins []string

View File

@ -90,26 +90,6 @@ func (kl *Kubelet) GetActivePods() []*v1.Pod {
return activePods
}
// makeGPUDevices determines the devices for the given container.
// Experimental.
func (kl *Kubelet) makeGPUDevices(pod *v1.Pod, container *v1.Container) ([]kubecontainer.DeviceInfo, error) {
if container.Resources.Limits.NvidiaGPU().IsZero() {
return nil, nil
}
nvidiaGPUPaths, err := kl.gpuManager.AllocateGPU(pod, container)
if err != nil {
return nil, err
}
var devices []kubecontainer.DeviceInfo
for _, path := range nvidiaGPUPaths {
// Devices have to be mapped one to one because of nvidia CUDA library requirements.
devices = append(devices, kubecontainer.DeviceInfo{PathOnHost: path, PathInContainer: path, Permissions: "mrw"})
}
return devices, nil
}
func makeAbsolutePath(goos, path string) string {
if goos != "windows" {
return "/" + path
@ -470,12 +450,6 @@ func (kl *Kubelet) GenerateRunContainerOptions(pod *v1.Pod, container *v1.Contai
volumes := kl.volumeManager.GetMountedVolumesForPod(podName)
opts.PortMappings = kubecontainer.MakePortMappings(container)
// TODO(random-liu): Move following convert functions into pkg/kubelet/container
devices, err := kl.makeGPUDevices(pod, container)
if err != nil {
return nil, nil, err
}
opts.Devices = append(opts.Devices, devices...)
// TODO: remove feature gate check after no longer needed
if utilfeature.DefaultFeatureGate.Enabled(features.BlockVolume) {

View File

@ -49,7 +49,6 @@ import (
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
containertest "k8s.io/kubernetes/pkg/kubelet/container/testing"
"k8s.io/kubernetes/pkg/kubelet/eviction"
"k8s.io/kubernetes/pkg/kubelet/gpu"
"k8s.io/kubernetes/pkg/kubelet/images"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/pkg/kubelet/logs"
@ -325,7 +324,6 @@ func newTestKubeletWithImageList(
kubelet.AddPodSyncLoopHandler(activeDeadlineHandler)
kubelet.AddPodSyncHandler(activeDeadlineHandler)
kubelet.gpuManager = gpu.NewGPUManagerStub()
return &TestKubelet{kubelet, fakeRuntime, mockCadvisor, fakeKubeClient, fakeMirrorClient, fakeClock, nil, plug}
}

View File

@ -248,7 +248,6 @@ func sortPodsByQOS(pods []*v1.Pod) (bestEffort, burstable, guaranteed []*v1.Pod)
// returns true if pod1 has a smaller request than pod2
func smallerResourceRequest(pod1 *v1.Pod, pod2 *v1.Pod) bool {
priorityList := []v1.ResourceName{
v1.ResourceNvidiaGPU,
v1.ResourceMemory,
v1.ResourceCPU,
}

View File

@ -682,10 +682,6 @@ func GetResourceRequest(pod *v1.Pod) *schedulercache.Resource {
if cpu := rQuantity.MilliValue(); cpu > result.MilliCPU {
result.MilliCPU = cpu
}
case v1.ResourceNvidiaGPU:
if gpu := rQuantity.Value(); gpu > result.NvidiaGPU {
result.NvidiaGPU = gpu
}
default:
if v1helper.IsScalarResourceName(rName) {
value := rQuantity.Value()
@ -734,7 +730,6 @@ func PodFitsResources(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *s
}
if podRequest.MilliCPU == 0 &&
podRequest.Memory == 0 &&
podRequest.NvidiaGPU == 0 &&
podRequest.EphemeralStorage == 0 &&
len(podRequest.ScalarResources) == 0 {
return len(predicateFails) == 0, predicateFails, nil
@ -747,10 +742,6 @@ func PodFitsResources(pod *v1.Pod, meta algorithm.PredicateMetadata, nodeInfo *s
if allocatable.Memory < podRequest.Memory+nodeInfo.RequestedResource().Memory {
predicateFails = append(predicateFails, NewInsufficientResourceError(v1.ResourceMemory, podRequest.Memory, nodeInfo.RequestedResource().Memory, allocatable.Memory))
}
if allocatable.NvidiaGPU < podRequest.NvidiaGPU+nodeInfo.RequestedResource().NvidiaGPU {
predicateFails = append(predicateFails, NewInsufficientResourceError(v1.ResourceNvidiaGPU, podRequest.NvidiaGPU, nodeInfo.RequestedResource().NvidiaGPU, allocatable.NvidiaGPU))
}
if allocatable.EphemeralStorage < podRequest.EphemeralStorage+nodeInfo.RequestedResource().EphemeralStorage {
predicateFails = append(predicateFails, NewInsufficientResourceError(v1.ResourceEphemeralStorage, podRequest.EphemeralStorage, nodeInfo.RequestedResource().EphemeralStorage, allocatable.EphemeralStorage))
}

View File

@ -44,13 +44,12 @@ var (
hugePageResourceA = v1helper.HugePageResourceName(resource.MustParse("2Mi"))
)
func makeResources(milliCPU, memory, nvidiaGPUs, pods, extendedA, storage, hugePageA int64) v1.NodeResources {
func makeResources(milliCPU, memory, pods, extendedA, storage, hugePageA int64) v1.NodeResources {
return v1.NodeResources{
Capacity: v1.ResourceList{
v1.ResourceCPU: *resource.NewMilliQuantity(milliCPU, resource.DecimalSI),
v1.ResourceMemory: *resource.NewQuantity(memory, resource.BinarySI),
v1.ResourcePods: *resource.NewQuantity(pods, resource.DecimalSI),
v1.ResourceNvidiaGPU: *resource.NewQuantity(nvidiaGPUs, resource.DecimalSI),
extendedResourceA: *resource.NewQuantity(extendedA, resource.DecimalSI),
v1.ResourceEphemeralStorage: *resource.NewQuantity(storage, resource.BinarySI),
hugePageResourceA: *resource.NewQuantity(hugePageA, resource.BinarySI),
@ -58,12 +57,11 @@ func makeResources(milliCPU, memory, nvidiaGPUs, pods, extendedA, storage, hugeP
}
}
func makeAllocatableResources(milliCPU, memory, nvidiaGPUs, pods, extendedA, storage, hugePageA int64) v1.ResourceList {
func makeAllocatableResources(milliCPU, memory, pods, extendedA, storage, hugePageA int64) v1.ResourceList {
return v1.ResourceList{
v1.ResourceCPU: *resource.NewMilliQuantity(milliCPU, resource.DecimalSI),
v1.ResourceMemory: *resource.NewQuantity(memory, resource.BinarySI),
v1.ResourcePods: *resource.NewQuantity(pods, resource.DecimalSI),
v1.ResourceNvidiaGPU: *resource.NewQuantity(nvidiaGPUs, resource.DecimalSI),
extendedResourceA: *resource.NewQuantity(extendedA, resource.DecimalSI),
v1.ResourceEphemeralStorage: *resource.NewQuantity(storage, resource.BinarySI),
hugePageResourceA: *resource.NewQuantity(hugePageA, resource.BinarySI),
@ -357,7 +355,7 @@ func TestPodFitsResources(t *testing.T) {
}
for _, test := range enoughPodsTests {
node := v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 5, 20, 5).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 5, 20, 5)}}
node := v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 32, 5, 20, 5).Capacity, Allocatable: makeAllocatableResources(10, 20, 32, 5, 20, 5)}}
test.nodeInfo.SetNode(&node)
RegisterPredicateMetadataProducerWithExtendedResourceOptions(test.ignoredExtendedResources)
meta := PredicateMetadata(test.pod, nil)
@ -414,7 +412,7 @@ func TestPodFitsResources(t *testing.T) {
},
}
for _, test := range notEnoughPodsTests {
node := v1.Node{Status: v1.NodeStatus{Capacity: v1.ResourceList{}, Allocatable: makeAllocatableResources(10, 20, 0, 1, 0, 0, 0)}}
node := v1.Node{Status: v1.NodeStatus{Capacity: v1.ResourceList{}, Allocatable: makeAllocatableResources(10, 20, 1, 0, 0, 0)}}
test.nodeInfo.SetNode(&node)
fits, reasons, err := PodFitsResources(test.pod, PredicateMetadata(test.pod, nil), test.nodeInfo)
if err != nil {
@ -472,7 +470,7 @@ func TestPodFitsResources(t *testing.T) {
}
for _, test := range storagePodsTests {
node := v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 5, 20, 5).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 5, 20, 5)}}
node := v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 32, 5, 20, 5).Capacity, Allocatable: makeAllocatableResources(10, 20, 32, 5, 20, 5)}}
test.nodeInfo.SetNode(&node)
fits, reasons, err := PodFitsResources(test.pod, PredicateMetadata(test.pod, nil), test.nodeInfo)
if err != nil {
@ -2062,7 +2060,7 @@ func TestRunGeneralPredicates(t *testing.T) {
newResourcePod(schedulercache.Resource{MilliCPU: 9, Memory: 19})),
node: &v1.Node{
ObjectMeta: metav1.ObjectMeta{Name: "machine1"},
Status: v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 0, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 0, 0, 0)},
Status: v1.NodeStatus{Capacity: makeResources(10, 20, 32, 0, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 32, 0, 0, 0)},
},
fits: true,
wErr: nil,
@ -2074,7 +2072,7 @@ func TestRunGeneralPredicates(t *testing.T) {
newResourcePod(schedulercache.Resource{MilliCPU: 5, Memory: 19})),
node: &v1.Node{
ObjectMeta: metav1.ObjectMeta{Name: "machine1"},
Status: v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 0, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 0, 0, 0)},
Status: v1.NodeStatus{Capacity: makeResources(10, 20, 32, 0, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 32, 0, 0, 0)},
},
fits: false,
wErr: nil,
@ -2084,34 +2082,6 @@ func TestRunGeneralPredicates(t *testing.T) {
},
test: "not enough cpu and memory resource",
},
{
pod: &v1.Pod{},
nodeInfo: schedulercache.NewNodeInfo(
newResourcePod(schedulercache.Resource{MilliCPU: 9, Memory: 19})),
node: &v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 1, 32, 0, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 1, 32, 0, 0, 0)}},
fits: true,
wErr: nil,
test: "no resources/port/host requested always fits on GPU machine",
},
{
pod: newResourcePod(schedulercache.Resource{MilliCPU: 3, Memory: 1, NvidiaGPU: 1}),
nodeInfo: schedulercache.NewNodeInfo(
newResourcePod(schedulercache.Resource{MilliCPU: 5, Memory: 10, NvidiaGPU: 1})),
node: &v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 1, 32, 0, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 1, 32, 0, 0, 0)}},
fits: false,
wErr: nil,
reasons: []algorithm.PredicateFailureReason{NewInsufficientResourceError(v1.ResourceNvidiaGPU, 1, 1, 1)},
test: "not enough GPU resource",
},
{
pod: newResourcePod(schedulercache.Resource{MilliCPU: 3, Memory: 1, NvidiaGPU: 1}),
nodeInfo: schedulercache.NewNodeInfo(
newResourcePod(schedulercache.Resource{MilliCPU: 5, Memory: 10, NvidiaGPU: 0})),
node: &v1.Node{Status: v1.NodeStatus{Capacity: makeResources(10, 20, 1, 32, 0, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 1, 32, 0, 0, 0)}},
fits: true,
wErr: nil,
test: "enough GPU resource",
},
{
pod: &v1.Pod{
Spec: v1.PodSpec{
@ -2121,7 +2091,7 @@ func TestRunGeneralPredicates(t *testing.T) {
nodeInfo: schedulercache.NewNodeInfo(),
node: &v1.Node{
ObjectMeta: metav1.ObjectMeta{Name: "machine1"},
Status: v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 0, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 0, 0, 0)},
Status: v1.NodeStatus{Capacity: makeResources(10, 20, 32, 0, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 32, 0, 0, 0)},
},
fits: false,
wErr: nil,
@ -2133,7 +2103,7 @@ func TestRunGeneralPredicates(t *testing.T) {
nodeInfo: schedulercache.NewNodeInfo(newPodWithPort(123)),
node: &v1.Node{
ObjectMeta: metav1.ObjectMeta{Name: "machine1"},
Status: v1.NodeStatus{Capacity: makeResources(10, 20, 0, 32, 0, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 0, 32, 0, 0, 0)},
Status: v1.NodeStatus{Capacity: makeResources(10, 20, 32, 0, 0, 0).Capacity, Allocatable: makeAllocatableResources(10, 20, 32, 0, 0, 0)},
},
fits: false,
wErr: nil,
@ -3443,7 +3413,7 @@ func TestPodSchedulesOnNodeWithMemoryPressureCondition(t *testing.T) {
ImagePullPolicy: "Always",
// at least one requirement -> burstable pod
Resources: v1.ResourceRequirements{
Requests: makeAllocatableResources(100, 100, 100, 100, 0, 0, 0),
Requests: makeAllocatableResources(100, 100, 100, 0, 0, 0),
},
},
},

View File

@ -109,10 +109,6 @@ func getResourceLimits(pod *v1.Pod) *schedulercache.Resource {
if ephemeralStorage := rQuantity.Value(); ephemeralStorage > result.EphemeralStorage {
result.EphemeralStorage = ephemeralStorage
}
case v1.ResourceNvidiaGPU:
if gpu := rQuantity.Value(); gpu > result.NvidiaGPU {
result.NvidiaGPU = gpu
}
default:
if v1helper.IsScalarResourceName(rName) {
value := rQuantity.Value()

View File

@ -114,7 +114,6 @@ func (transientSchedInfo *transientSchedulerInfo) resetTransientSchedulerInfo()
type Resource struct {
MilliCPU int64
Memory int64
NvidiaGPU int64
EphemeralStorage int64
// We store allowedPodNumber (which is Node.Status.Allocatable.Pods().Value())
// explicitly as int, to avoid conversions and improve performance.
@ -142,8 +141,6 @@ func (r *Resource) Add(rl v1.ResourceList) {
r.MilliCPU += rQuant.MilliValue()
case v1.ResourceMemory:
r.Memory += rQuant.Value()
case v1.ResourceNvidiaGPU:
r.NvidiaGPU += rQuant.Value()
case v1.ResourcePods:
r.AllowedPodNumber += int(rQuant.Value())
case v1.ResourceEphemeralStorage:
@ -161,7 +158,6 @@ func (r *Resource) ResourceList() v1.ResourceList {
result := v1.ResourceList{
v1.ResourceCPU: *resource.NewMilliQuantity(r.MilliCPU, resource.DecimalSI),
v1.ResourceMemory: *resource.NewQuantity(r.Memory, resource.BinarySI),
v1.ResourceNvidiaGPU: *resource.NewQuantity(r.NvidiaGPU, resource.DecimalSI),
v1.ResourcePods: *resource.NewQuantity(int64(r.AllowedPodNumber), resource.BinarySI),
v1.ResourceEphemeralStorage: *resource.NewQuantity(r.EphemeralStorage, resource.BinarySI),
}
@ -180,7 +176,6 @@ func (r *Resource) Clone() *Resource {
res := &Resource{
MilliCPU: r.MilliCPU,
Memory: r.Memory,
NvidiaGPU: r.NvidiaGPU,
AllowedPodNumber: r.AllowedPodNumber,
EphemeralStorage: r.EphemeralStorage,
}
@ -369,7 +364,6 @@ func (n *NodeInfo) AddPod(pod *v1.Pod) {
res, non0CPU, non0Mem := calculateResource(pod)
n.requestedResource.MilliCPU += res.MilliCPU
n.requestedResource.Memory += res.Memory
n.requestedResource.NvidiaGPU += res.NvidiaGPU
n.requestedResource.EphemeralStorage += res.EphemeralStorage
if n.requestedResource.ScalarResources == nil && len(res.ScalarResources) > 0 {
n.requestedResource.ScalarResources = map[v1.ResourceName]int64{}
@ -425,7 +419,6 @@ func (n *NodeInfo) RemovePod(pod *v1.Pod) error {
n.requestedResource.MilliCPU -= res.MilliCPU
n.requestedResource.Memory -= res.Memory
n.requestedResource.NvidiaGPU -= res.NvidiaGPU
n.requestedResource.EphemeralStorage -= res.EphemeralStorage
if len(res.ScalarResources) > 0 && n.requestedResource.ScalarResources == nil {
n.requestedResource.ScalarResources = map[v1.ResourceName]int64{}

View File

@ -41,7 +41,6 @@ func TestNewResource(t *testing.T) {
resourceList: map[v1.ResourceName]resource.Quantity{
v1.ResourceCPU: *resource.NewScaledQuantity(4, -3),
v1.ResourceMemory: *resource.NewQuantity(2000, resource.BinarySI),
v1.ResourceNvidiaGPU: *resource.NewQuantity(1000, resource.DecimalSI),
v1.ResourcePods: *resource.NewQuantity(80, resource.BinarySI),
v1.ResourceEphemeralStorage: *resource.NewQuantity(5000, resource.BinarySI),
"scalar.test/" + "scalar1": *resource.NewQuantity(1, resource.DecimalSI),
@ -50,7 +49,6 @@ func TestNewResource(t *testing.T) {
expected: &Resource{
MilliCPU: 4,
Memory: 2000,
NvidiaGPU: 1000,
EphemeralStorage: 5000,
AllowedPodNumber: 80,
ScalarResources: map[v1.ResourceName]int64{"scalar.test/scalar1": 1, "hugepages-test": 2},
@ -76,7 +74,6 @@ func TestResourceList(t *testing.T) {
expected: map[v1.ResourceName]resource.Quantity{
v1.ResourceCPU: *resource.NewScaledQuantity(0, -3),
v1.ResourceMemory: *resource.NewQuantity(0, resource.BinarySI),
v1.ResourceNvidiaGPU: *resource.NewQuantity(0, resource.DecimalSI),
v1.ResourcePods: *resource.NewQuantity(0, resource.BinarySI),
v1.ResourceEphemeralStorage: *resource.NewQuantity(0, resource.BinarySI),
},
@ -85,7 +82,6 @@ func TestResourceList(t *testing.T) {
resource: &Resource{
MilliCPU: 4,
Memory: 2000,
NvidiaGPU: 1000,
EphemeralStorage: 5000,
AllowedPodNumber: 80,
ScalarResources: map[v1.ResourceName]int64{"scalar.test/scalar1": 1, "hugepages-test": 2},
@ -93,7 +89,6 @@ func TestResourceList(t *testing.T) {
expected: map[v1.ResourceName]resource.Quantity{
v1.ResourceCPU: *resource.NewScaledQuantity(4, -3),
v1.ResourceMemory: *resource.NewQuantity(2000, resource.BinarySI),
v1.ResourceNvidiaGPU: *resource.NewQuantity(1000, resource.DecimalSI),
v1.ResourcePods: *resource.NewQuantity(80, resource.BinarySI),
v1.ResourceEphemeralStorage: *resource.NewQuantity(5000, resource.BinarySI),
"scalar.test/" + "scalar1": *resource.NewQuantity(1, resource.DecimalSI),
@ -123,7 +118,6 @@ func TestResourceClone(t *testing.T) {
resource: &Resource{
MilliCPU: 4,
Memory: 2000,
NvidiaGPU: 1000,
EphemeralStorage: 5000,
AllowedPodNumber: 80,
ScalarResources: map[v1.ResourceName]int64{"scalar.test/scalar1": 1, "hugepages-test": 2},
@ -131,7 +125,6 @@ func TestResourceClone(t *testing.T) {
expected: &Resource{
MilliCPU: 4,
Memory: 2000,
NvidiaGPU: 1000,
EphemeralStorage: 5000,
AllowedPodNumber: 80,
ScalarResources: map[v1.ResourceName]int64{"scalar.test/scalar1": 1, "hugepages-test": 2},
@ -168,7 +161,6 @@ func TestResourceAddScalar(t *testing.T) {
resource: &Resource{
MilliCPU: 4,
Memory: 2000,
NvidiaGPU: 1000,
EphemeralStorage: 5000,
AllowedPodNumber: 80,
ScalarResources: map[v1.ResourceName]int64{"hugepages-test": 2},
@ -178,7 +170,6 @@ func TestResourceAddScalar(t *testing.T) {
expected: &Resource{
MilliCPU: 4,
Memory: 2000,
NvidiaGPU: 1000,
EphemeralStorage: 5000,
AllowedPodNumber: 80,
ScalarResources: map[v1.ResourceName]int64{"hugepages-test": 2, "scalar2": 200},
@ -205,7 +196,6 @@ func TestNewNodeInfo(t *testing.T) {
requestedResource: &Resource{
MilliCPU: 300,
Memory: 1524,
NvidiaGPU: 0,
EphemeralStorage: 0,
AllowedPodNumber: 0,
ScalarResources: map[v1.ResourceName]int64(nil),
@ -213,7 +203,6 @@ func TestNewNodeInfo(t *testing.T) {
nonzeroRequest: &Resource{
MilliCPU: 300,
Memory: 1524,
NvidiaGPU: 0,
EphemeralStorage: 0,
AllowedPodNumber: 0,
ScalarResources: map[v1.ResourceName]int64(nil),
@ -516,7 +505,6 @@ func TestNodeInfoAddPod(t *testing.T) {
requestedResource: &Resource{
MilliCPU: 300,
Memory: 1524,
NvidiaGPU: 0,
EphemeralStorage: 0,
AllowedPodNumber: 0,
ScalarResources: map[v1.ResourceName]int64(nil),
@ -524,7 +512,6 @@ func TestNodeInfoAddPod(t *testing.T) {
nonzeroRequest: &Resource{
MilliCPU: 300,
Memory: 1524,
NvidiaGPU: 0,
EphemeralStorage: 0,
AllowedPodNumber: 0,
ScalarResources: map[v1.ResourceName]int64(nil),
@ -630,7 +617,6 @@ func TestNodeInfoRemovePod(t *testing.T) {
requestedResource: &Resource{
MilliCPU: 300,
Memory: 1524,
NvidiaGPU: 0,
EphemeralStorage: 0,
AllowedPodNumber: 0,
ScalarResources: map[v1.ResourceName]int64(nil),
@ -638,7 +624,6 @@ func TestNodeInfoRemovePod(t *testing.T) {
nonzeroRequest: &Resource{
MilliCPU: 300,
Memory: 1524,
NvidiaGPU: 0,
EphemeralStorage: 0,
AllowedPodNumber: 0,
ScalarResources: map[v1.ResourceName]int64(nil),
@ -748,7 +733,6 @@ func TestNodeInfoRemovePod(t *testing.T) {
requestedResource: &Resource{
MilliCPU: 200,
Memory: 1024,
NvidiaGPU: 0,
EphemeralStorage: 0,
AllowedPodNumber: 0,
ScalarResources: map[v1.ResourceName]int64(nil),
@ -756,7 +740,6 @@ func TestNodeInfoRemovePod(t *testing.T) {
nonzeroRequest: &Resource{
MilliCPU: 200,
Memory: 1024,
NvidiaGPU: 0,
EphemeralStorage: 0,
AllowedPodNumber: 0,
ScalarResources: map[v1.ResourceName]int64(nil),

View File

@ -48,13 +48,6 @@ func (self *ResourceList) Pods() *resource.Quantity {
return &resource.Quantity{}
}
func (self *ResourceList) NvidiaGPU() *resource.Quantity {
if val, ok := (*self)[ResourceNvidiaGPU]; ok {
return &val
}
return &resource.Quantity{}
}
func (self *ResourceList) StorageEphemeral() *resource.Quantity {
if val, ok := (*self)[ResourceEphemeralStorage]; ok {
return &val
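With the NvidiaGPU() accessor removed from ResourceList, callers that care about GPUs look up the device-plugin resource by name instead. A hypothetical helper (not part of the Kubernetes API packages) showing that lookup:

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// gpuQuantity stands in for the removed NvidiaGPU() accessor: it reads the
// resource advertised by the NVIDIA device plugin directly from the list.
func gpuQuantity(rl v1.ResourceList) int64 {
	if q, ok := rl[v1.ResourceName("nvidia.com/gpu")]; ok {
		return q.Value()
	}
	return 0
}

func main() {
	capacity := v1.ResourceList{
		v1.ResourceName("nvidia.com/gpu"): resource.MustParse("4"),
	}
	fmt.Println(gpuQuantity(capacity)) // 4
}
```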

View File

@ -4076,8 +4076,6 @@ const (
// Local ephemeral storage, in bytes. (500Gi = 500GiB = 500 * 1024 * 1024 * 1024)
// The resource name for ResourceEphemeralStorage is alpha and it can change across releases.
ResourceEphemeralStorage ResourceName = "ephemeral-storage"
// NVIDIA GPU, in devices. Alpha, might change: although fractional and allowing values >1, only one whole device per node is assigned.
ResourceNvidiaGPU ResourceName = "alpha.kubernetes.io/nvidia-gpu"
)
const (

View File

@ -40,54 +40,11 @@ const (
driverInstallTimeout = 10 * time.Minute
)
type podCreationFuncType func() *v1.Pod
var (
gpuResourceName v1.ResourceName
dsYamlUrl string
podCreationFunc podCreationFuncType
)
func makeCudaAdditionTestPod() *v1.Pod {
podName := testPodNamePrefix + string(uuid.NewUUID())
testPod := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: podName,
},
Spec: v1.PodSpec{
RestartPolicy: v1.RestartPolicyNever,
Containers: []v1.Container{
{
Name: "vector-addition",
Image: imageutils.GetE2EImage(imageutils.CudaVectorAdd),
Resources: v1.ResourceRequirements{
Limits: v1.ResourceList{
gpuResourceName: *resource.NewQuantity(1, resource.DecimalSI),
},
},
VolumeMounts: []v1.VolumeMount{
{
Name: "nvidia-libraries",
MountPath: "/usr/local/nvidia/lib64",
},
},
},
},
Volumes: []v1.Volume{
{
Name: "nvidia-libraries",
VolumeSource: v1.VolumeSource{
HostPath: &v1.HostPathVolumeSource{
Path: "/home/kubernetes/bin/nvidia/lib",
},
},
},
},
},
}
return testPod
}
func makeCudaAdditionDevicePluginTestPod() *v1.Pod {
podName := testPodNamePrefix + string(uuid.NewUUID())
testPod := &v1.Pod{
@ -163,11 +120,6 @@ func SetupNVIDIAGPUNode(f *framework.Framework, setupResourceGatherer bool) *fra
}
framework.Logf("Cluster is running on COS. Proceeding with test")
if f.BaseName == "gpus" {
dsYamlUrl = "https://raw.githubusercontent.com/ContainerEngine/accelerators/master/cos-nvidia-gpu-installer/daemonset.yaml"
gpuResourceName = v1.ResourceNvidiaGPU
podCreationFunc = makeCudaAdditionTestPod
} else {
dsYamlUrlFromEnv := os.Getenv("NVIDIA_DRIVER_INSTALLER_DAEMONSET")
if dsYamlUrlFromEnv != "" {
dsYamlUrl = dsYamlUrlFromEnv
@ -175,8 +127,6 @@ func SetupNVIDIAGPUNode(f *framework.Framework, setupResourceGatherer bool) *fra
dsYamlUrl = "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/daemonset.yaml"
}
gpuResourceName = framework.NVIDIAGPUResourceName
podCreationFunc = makeCudaAdditionDevicePluginTestPod
}
framework.Logf("Using %v", dsYamlUrl)
// Creates the DaemonSet that installs Nvidia Drivers.
@ -218,7 +168,7 @@ func testNvidiaGPUsOnCOS(f *framework.Framework) {
framework.Logf("Creating as many pods as there are Nvidia GPUs and have the pods run a CUDA app")
podList := []*v1.Pod{}
for i := int64(0); i < getGPUsAvailable(f); i++ {
podList = append(podList, f.PodClient().Create(podCreationFunc()))
podList = append(podList, f.PodClient().Create(makeCudaAdditionDevicePluginTestPod()))
}
framework.Logf("Wait for all test pods to succeed")
// Wait for all pods to succeed
@ -234,13 +184,6 @@ func testNvidiaGPUsOnCOS(f *framework.Framework) {
framework.ExpectNoError(err, "getting resource usage summary")
}
var _ = SIGDescribe("[Feature:GPU]", func() {
f := framework.NewDefaultFramework("gpus")
It("run Nvidia GPU tests on Container Optimized OS only", func() {
testNvidiaGPUsOnCOS(f)
})
})
var _ = SIGDescribe("[Feature:GPUDevicePlugin]", func() {
f := framework.NewDefaultFramework("device-plugin-gpus")
It("run Nvidia GPU Device Plugin tests on Container Optimized OS only", func() {

View File

@ -11,7 +11,6 @@ go_library(
"docker_util.go",
"framework.go",
"gpu_device_plugin.go",
"gpus.go",
"image_list.go",
"simple_mount.go",
"util.go",

View File

@ -17,6 +17,7 @@ limitations under the License.
package e2e_node
import (
"os/exec"
"strconv"
"time"
@ -132,6 +133,16 @@ var _ = framework.KubeDescribe("NVIDIA GPU Device Plugin [Feature:GPUDevicePlugi
})
})
func checkIfNvidiaGPUsExistOnNode() bool {
// Cannot use `lspci` because it is not installed on all distros by default.
err := exec.Command("/bin/sh", "-c", "find /sys/devices/pci* -type f | grep vendor | xargs cat | grep 0x10de").Run()
if err != nil {
framework.Logf("check for nvidia GPUs failed. Got Error: %v", err)
return false
}
return true
}
func logDevicePluginMetrics() {
ms, err := metrics.GrabKubeletMetricsWithoutProxy(framework.TestContext.NodeName + ":10255")
framework.ExpectNoError(err)

View File

@ -1,174 +0,0 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package e2e_node
import (
"fmt"
"os/exec"
"time"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/kubelet/apis/kubeletconfig"
"k8s.io/kubernetes/test/e2e/framework"
. "github.com/onsi/ginkgo"
. "github.com/onsi/gomega"
)
func getGPUsAvailable(f *framework.Framework) int64 {
nodeList, err := f.ClientSet.CoreV1().Nodes().List(metav1.ListOptions{})
framework.ExpectNoError(err, "getting node list")
var gpusAvailable int64
for _, node := range nodeList.Items {
gpusAvailable += node.Status.Capacity.NvidiaGPU().Value()
}
return gpusAvailable
}
func gpusExistOnAllNodes(f *framework.Framework) bool {
nodeList, err := f.ClientSet.CoreV1().Nodes().List(metav1.ListOptions{})
framework.ExpectNoError(err, "getting node list")
for _, node := range nodeList.Items {
if node.Name == "kubernetes-master" {
continue
}
if node.Status.Capacity.NvidiaGPU().Value() == 0 {
return false
}
}
return true
}
func checkIfNvidiaGPUsExistOnNode() bool {
// Cannot use `lspci` because it is not installed on all distros by default.
err := exec.Command("/bin/sh", "-c", "find /sys/devices/pci* -type f | grep vendor | xargs cat | grep 0x10de").Run()
if err != nil {
framework.Logf("check for nvidia GPUs failed. Got Error: %v", err)
return false
}
return true
}
// Serial because the test updates kubelet configuration.
var _ = framework.KubeDescribe("GPU [Serial]", func() {
f := framework.NewDefaultFramework("gpu-test")
Context("attempt to use GPUs if available", func() {
It("setup the node and create pods to test gpus", func() {
By("ensuring that Nvidia GPUs exist on the node")
if !checkIfNvidiaGPUsExistOnNode() {
Skip("Nvidia GPUs do not exist on the node. Skipping test.")
}
By("ensuring that dynamic kubelet configuration is enabled")
enabled, err := isKubeletConfigEnabled(f)
framework.ExpectNoError(err)
if !enabled {
Skip("Dynamic Kubelet configuration is not enabled. Skipping test.")
}
By("enabling support for GPUs")
var oldCfg *kubeletconfig.KubeletConfiguration
defer func() {
if oldCfg != nil {
framework.ExpectNoError(setKubeletConfiguration(f, oldCfg))
}
}()
// Enable Accelerators
oldCfg, err = getCurrentKubeletConfig()
framework.ExpectNoError(err)
newCfg := oldCfg.DeepCopy()
newCfg.FeatureGates[string(features.Accelerators)] = true
framework.ExpectNoError(setKubeletConfiguration(f, newCfg))
By("Waiting for GPUs to become available on the local node")
Eventually(gpusExistOnAllNodes(f), 10*time.Minute, time.Second).Should(BeTrue())
By("Creating a pod that will consume all GPUs")
podSuccess := makePod(getGPUsAvailable(f), "gpus-success")
podSuccess = f.PodClient().CreateSync(podSuccess)
By("Checking the containers in the pod had restarted at-least twice successfully thereby ensuring GPUs are reused")
const minContainerRestartCount = 2
Eventually(func() bool {
p, err := f.ClientSet.CoreV1().Pods(f.Namespace.Name).Get(podSuccess.Name, metav1.GetOptions{})
if err != nil {
framework.Logf("failed to get pod status: %v", err)
return false
}
if p.Status.ContainerStatuses[0].RestartCount < minContainerRestartCount {
return false
}
return true
}, time.Minute, time.Second).Should(BeTrue())
By("Checking if the pod outputted Success to its logs")
framework.ExpectNoError(f.PodClient().MatchContainerOutput(podSuccess.Name, podSuccess.Name, "Success"))
By("Creating a new pod requesting a GPU and noticing that it is rejected by the Kubelet")
podFailure := makePod(1, "gpu-failure")
framework.WaitForPodCondition(f.ClientSet, f.Namespace.Name, podFailure.Name, "pod rejected", framework.PodStartTimeout, func(pod *v1.Pod) (bool, error) {
if pod.Status.Phase == v1.PodFailed {
return true, nil
}
return false, nil
})
By("stopping the original Pod with GPUs")
gp := int64(0)
deleteOptions := metav1.DeleteOptions{
GracePeriodSeconds: &gp,
}
f.PodClient().DeleteSync(podSuccess.Name, &deleteOptions, framework.DefaultPodDeletionTimeout)
By("attempting to start the failed pod again")
f.PodClient().DeleteSync(podFailure.Name, &deleteOptions, framework.DefaultPodDeletionTimeout)
podFailure = f.PodClient().CreateSync(podFailure)
By("Checking if the pod outputted Success to its logs")
framework.ExpectNoError(f.PodClient().MatchContainerOutput(podFailure.Name, podFailure.Name, "Success"))
})
})
})
func makePod(gpus int64, name string) *v1.Pod {
resources := v1.ResourceRequirements{
Limits: v1.ResourceList{
v1.ResourceNvidiaGPU: *resource.NewQuantity(gpus, resource.DecimalSI),
},
}
gpuverificationCmd := fmt.Sprintf("if [[ %d -ne $(ls /dev/ | egrep '^nvidia[0-9]+$' | wc -l) ]]; then exit 1; else echo Success; fi", gpus)
return &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: name,
},
Spec: v1.PodSpec{
RestartPolicy: v1.RestartPolicyAlways,
Containers: []v1.Container{
{
Image: busyboxImage,
Name: name,
Command: []string{"sh", "-c", gpuverificationCmd},
Resources: resources,
},
},
},
}
}
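After this deletion, the equivalent node checks go through the device plugin: GPU capacity is read from node status under "nvidia.com/gpu" rather than through the removed NvidiaGPU() accessor, and test pods request that resource name in their limits. A hedged sketch of the capacity counting as a pure function over a node list — wiring it to a clientset, as the deleted getGPUsAvailable did, is left out:

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// gpusAvailable sums the "nvidia.com/gpu" capacity advertised by the NVIDIA
// device plugin across the given nodes, replacing the pattern in the deleted
// getGPUsAvailable, which summed node.Status.Capacity.NvidiaGPU().
func gpusAvailable(nodes []v1.Node) int64 {
	var total int64
	for _, node := range nodes {
		if q, ok := node.Status.Capacity[v1.ResourceName("nvidia.com/gpu")]; ok {
			total += q.Value()
		}
	}
	return total
}

func main() {
	nodes := []v1.Node{
		{
			ObjectMeta: metav1.ObjectMeta{Name: "node-1"},
			Status: v1.NodeStatus{
				Capacity: v1.ResourceList{
					v1.ResourceName("nvidia.com/gpu"): resource.MustParse("2"),
				},
			},
		},
	}
	fmt.Println(gpusAvailable(nodes)) // 2
}
```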