Merge pull request #46087 from tianshapjq/gpu-info-error-in-restart

Automatic merge from submit-queue (batch tested with PRs 45877, 46846, 46630, 46087, 47003)

gpusInUse info error when kubelet restarts

**What this PR does / why we need it**:
In my testing, I found two errors in nvidia_gpu_manager.go.
1. The number of activePods in gpusInUse() equals 0 when the kubelet restarts. It seems the Start() method is called before pod recovery, which causes this error. So I decided not to call gpusInUse() in the Start() function, and instead let it happen when a new pod needs to be created.
2. The container.ContainerID in line 242 returns the ID in the format "docker://<container_id>", which makes the client fail to inspect the container by ID. We have to strip the "docker://" prefix.

**Special notes for your reviewer**:

**Release note**:

```
Avoid assigning the same GPU to multiple containers.
```
pull/6/head
Kubernetes Submit Queue 2017-06-07 17:55:50 -07:00 committed by GitHub
commit 56baaaae73
1 changed files with 3 additions and 3 deletions

View File

@ -22,6 +22,7 @@ import (
"os" "os"
"path" "path"
"regexp" "regexp"
"strings"
"sync" "sync"
"github.com/golang/glog" "github.com/golang/glog"
@ -101,8 +102,7 @@ func (ngm *nvidiaGPUManager) Start() error {
if err := ngm.discoverGPUs(); err != nil { if err := ngm.discoverGPUs(); err != nil {
return err return err
} }
// It's possible that the runtime isn't available now.
ngm.allocated = ngm.gpusInUse()
// We ignore errors when identifying allocated GPUs because it is possible that the runtime interfaces may be not be logically up. // We ignore errors when identifying allocated GPUs because it is possible that the runtime interfaces may be not be logically up.
return nil return nil
} }
@ -239,7 +239,7 @@ func (ngm *nvidiaGPUManager) gpusInUse() *podGPUs {
var containersToInspect []containerIdentifier var containersToInspect []containerIdentifier
for _, container := range pod.Status.ContainerStatuses { for _, container := range pod.Status.ContainerStatuses {
if containers.Has(container.Name) { if containers.Has(container.Name) {
containersToInspect = append(containersToInspect, containerIdentifier{container.ContainerID, container.Name}) containersToInspect = append(containersToInspect, containerIdentifier{strings.Replace(container.ContainerID, "docker://", "", 1), container.Name})
} }
} }
// add the pod and its containers that need to be inspected. // add the pod and its containers that need to be inspected.