mirror of https://github.com/k3s-io/k3s
Merge pull request #46087 from tianshapjq/gpu-info-error-in-restart
Automatic merge from submit-queue (batch tested with PRs 45877, 46846, 46630, 46087, 47003) gpusInUse info error when kubelet restarts **What this PR does / why we need it**: In my test, I found 2 errors in nvidia_gpu_manager.go. 1. The number of activePods in gpusInUse() equals 0 when kubelet restarts. It seems the Start() method was called before pod recovery, which caused this error. So I decided not to call gpusInUse() in the Start() function, and to just let it happen when a new pod needs to be created. 2. The container.ContainerID in line 242 returns the id in the format "docker://<container_id>"; this makes the client fail to inspect the container by id. We have to strip the "docker://" prefix. **Special notes for your reviewer**: **Release note**: ``` Avoid assigning the same GPU to multiple containers. ``` pull/6/head
commit
56baaaae73
|
@ -22,6 +22,7 @@ import (
|
|||
"os"
|
||||
"path"
|
||||
"regexp"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"github.com/golang/glog"
|
||||
|
@ -101,8 +102,7 @@ func (ngm *nvidiaGPUManager) Start() error {
|
|||
if err := ngm.discoverGPUs(); err != nil {
|
||||
return err
|
||||
}
|
||||
// It's possible that the runtime isn't available now.
|
||||
ngm.allocated = ngm.gpusInUse()
|
||||
|
||||
// We ignore errors when identifying allocated GPUs because it is possible that the runtime interfaces may not be logically up.
|
||||
return nil
|
||||
}
|
||||
|
@ -239,7 +239,7 @@ func (ngm *nvidiaGPUManager) gpusInUse() *podGPUs {
|
|||
var containersToInspect []containerIdentifier
|
||||
for _, container := range pod.Status.ContainerStatuses {
|
||||
if containers.Has(container.Name) {
|
||||
containersToInspect = append(containersToInspect, containerIdentifier{container.ContainerID, container.Name})
|
||||
containersToInspect = append(containersToInspect, containerIdentifier{strings.Replace(container.ContainerID, "docker://", "", 1), container.Name})
|
||||
}
|
||||
}
|
||||
// add the pod and its containers that need to be inspected.
|
||||
|
|
Loading…
Reference in New Issue