/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package devicemanager

import (
	"context"
	"fmt"
	"net"
	"os"
	"path/filepath"
	"runtime"
	"sort"
	"sync"
	"time"

	cadvisorapi "github.com/google/cadvisor/info/v1"
	"google.golang.org/grpc"
	"k8s.io/klog/v2"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	errorsutil "k8s.io/apimachinery/pkg/util/errors"
	"k8s.io/apimachinery/pkg/util/sets"
	utilfeature "k8s.io/apiserver/pkg/util/feature"
	pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
	v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
	"k8s.io/kubernetes/pkg/features"
	"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
	"k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors"
	"k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/checkpoint"
	"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
	"k8s.io/kubernetes/pkg/kubelet/config"
	"k8s.io/kubernetes/pkg/kubelet/lifecycle"
	"k8s.io/kubernetes/pkg/kubelet/metrics"
	"k8s.io/kubernetes/pkg/kubelet/pluginmanager/cache"
	schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
	"k8s.io/kubernetes/pkg/util/selinux"
)
// ActivePodsFunc is a function that returns a list of pods to reconcile.
type ActivePodsFunc func() []*v1.Pod

// monitorCallback is the function called when a device's health state changes,
// or new devices are reported, or old devices are deleted.
// Updated contains the most recent state of the Device.
type monitorCallback func(resourceName string, devices []pluginapi.Device)

// ManagerImpl is the structure in charge of managing Device Plugins.
type ManagerImpl struct {
	socketname string
	socketdir  string

	endpoints map[string]endpointInfo // Key is ResourceName
	mutex     sync.Mutex

	server *grpc.Server
	wg     sync.WaitGroup

	// activePods is a method for listing active pods on the node
	// so the number of pluginResources requested by existing pods
	// can be counted when updating allocated devices.
	activePods ActivePodsFunc

	// sourcesReady provides the readiness of kubelet configuration sources such as apiserver update readiness.
	// We use it to determine when we can purge inactive pods from checkpointed state.
	sourcesReady config.SourcesReady

	// callback is used for updating devices' states in one call.
	// e.g. a new device is advertised, two old devices are deleted and a running device fails.
	callback monitorCallback

	// allDevices holds all the devices currently registered to the device manager
	allDevices ResourceDeviceInstances

	// healthyDevices contains all of the registered healthy resourceNames and their exported device IDs.
	healthyDevices map[string]sets.String

	// unhealthyDevices contains all of the unhealthy devices and their exported device IDs.
	unhealthyDevices map[string]sets.String

	// allocatedDevices contains allocated deviceIds, keyed by resourceName.
	allocatedDevices map[string]sets.String

	// podDevices contains pod to allocated device mapping.
	podDevices        *podDevices
	checkpointManager checkpointmanager.CheckpointManager

	// List of NUMA Nodes available on the underlying machine
	numaNodes []int

	// Store of Topology Affinities that the Device Manager can query.
	topologyAffinityStore topologymanager.Store

	// devicesToReuse contains devices that can be reused as they have been allocated to
	// init containers.
	devicesToReuse PodReusableDevices

	// pendingAdmissionPod contains the pod during the admission phase
	pendingAdmissionPod *v1.Pod
}

type endpointInfo struct {
	e    endpoint
	opts *pluginapi.DevicePluginOptions
}

type sourcesReadyStub struct{}

// PodReusableDevices is a map of devices to reuse, keyed by pod UID and then by resource name.
type PodReusableDevices map[string]map[string]sets.String
func (s *sourcesReadyStub) AddSource(source string) {}
func (s *sourcesReadyStub) AllReady() bool          { return true }

// NewManagerImpl creates a new manager.
func NewManagerImpl(topology []cadvisorapi.Node, topologyAffinityStore topologymanager.Store) (*ManagerImpl, error) {
	socketPath := pluginapi.KubeletSocket
	if runtime.GOOS == "windows" {
		socketPath = os.Getenv("SYSTEMDRIVE") + pluginapi.KubeletSocketWindows
	}
	return newManagerImpl(socketPath, topology, topologyAffinityStore)
}

func newManagerImpl(socketPath string, topology []cadvisorapi.Node, topologyAffinityStore topologymanager.Store) (*ManagerImpl, error) {
	klog.V(2).InfoS("Creating Device Plugin manager", "path", socketPath)

	if socketPath == "" || !filepath.IsAbs(socketPath) {
		return nil, fmt.Errorf(errBadSocket+" %s", socketPath)
	}

	var numaNodes []int
	for _, node := range topology {
		numaNodes = append(numaNodes, node.Id)
	}

	dir, file := filepath.Split(socketPath)
	manager := &ManagerImpl{
		endpoints: make(map[string]endpointInfo),

		socketname:            file,
		socketdir:             dir,
		allDevices:            NewResourceDeviceInstances(),
		healthyDevices:        make(map[string]sets.String),
		unhealthyDevices:      make(map[string]sets.String),
		allocatedDevices:      make(map[string]sets.String),
		podDevices:            newPodDevices(),
		numaNodes:             numaNodes,
		topologyAffinityStore: topologyAffinityStore,
		devicesToReuse:        make(PodReusableDevices),
	}
	manager.callback = manager.genericDeviceUpdateCallback

	// The following structures are populated with real implementations in manager.Start().
	// Before that, initialize them to perform no-op operations.
	manager.activePods = func() []*v1.Pod { return []*v1.Pod{} }
	manager.sourcesReady = &sourcesReadyStub{}
	checkpointManager, err := checkpointmanager.NewCheckpointManager(dir)
	if err != nil {
		return nil, fmt.Errorf("failed to initialize checkpoint manager: %v", err)
	}
	manager.checkpointManager = checkpointManager

	return manager, nil
}
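// genericDeviceUpdateCallback is the default monitorCallback. It rebuilds the
// healthy/unhealthy device sets for the given resource from the reported
// device list and persists the result to the checkpoint file.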
func (m *ManagerImpl) genericDeviceUpdateCallback(resourceName string, devices []pluginapi.Device) {
	m.mutex.Lock()
	m.healthyDevices[resourceName] = sets.NewString()
	m.unhealthyDevices[resourceName] = sets.NewString()
	m.allDevices[resourceName] = make(map[string]pluginapi.Device)
	for _, dev := range devices {
		m.allDevices[resourceName][dev.ID] = dev
		if dev.Health == pluginapi.Healthy {
			m.healthyDevices[resourceName].Insert(dev.ID)
		} else {
			m.unhealthyDevices[resourceName].Insert(dev.ID)
		}
	}
	m.mutex.Unlock()
	if err := m.writeCheckpoint(); err != nil {
		klog.ErrorS(err, "Writing checkpoint encountered")
	}
}
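// removeContents removes all files in the given directory except the
// checkpoint file and sub-directories, so that stale device plugin sockets
// left over from a previous kubelet run are cleaned up.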
func (m *ManagerImpl) removeContents(dir string) error {
	d, err := os.Open(dir)
	if err != nil {
		return err
	}
	defer d.Close()
	names, err := d.Readdirnames(-1)
	if err != nil {
		return err
	}
	var errs []error
	for _, name := range names {
		filePath := filepath.Join(dir, name)
		if filePath == m.checkpointFile() {
			continue
		}
		// TODO: Until the bug - https://github.com/golang/go/issues/33357 is fixed, os.stat wouldn't return the
		// right mode (socket) on windows. Hence deleting the file, without checking whether
		// it's a socket, on windows.
		stat, err := os.Lstat(filePath)
		if err != nil {
			klog.ErrorS(err, "Failed to stat file", "path", filePath)
			continue
		}
		if stat.IsDir() {
			continue
		}
		err = os.RemoveAll(filePath)
		if err != nil {
			errs = append(errs, err)
			klog.ErrorS(err, "Failed to remove file", "path", filePath)
			continue
		}
	}
	return errorsutil.NewAggregate(errs)
}

// checkpointFile returns device plugin checkpoint file path.
func (m *ManagerImpl) checkpointFile() string {
	return filepath.Join(m.socketdir, kubeletDeviceManagerCheckpoint)
}

// Start starts the Device Plugin Manager, initializes podDevices and
// allocatedDevices information from checkpointed state, and starts the
// device plugin registration service.
func (m *ManagerImpl) Start(activePods ActivePodsFunc, sourcesReady config.SourcesReady) error {
	klog.V(2).InfoS("Starting Device Plugin manager")

	m.activePods = activePods
	m.sourcesReady = sourcesReady

	// Loads in allocatedDevices information from disk.
	err := m.readCheckpoint()
	if err != nil {
		klog.InfoS("Continue after failing to read checkpoint file. Device allocation info may NOT be up-to-date", "err", err)
	}

	socketPath := filepath.Join(m.socketdir, m.socketname)
	if err = os.MkdirAll(m.socketdir, 0750); err != nil {
		return err
	}
	if selinux.SELinuxEnabled() {
		if err := selinux.SetFileLabel(m.socketdir, config.KubeletPluginsDirSELinuxLabel); err != nil {
			klog.InfoS("Unprivileged containerized plugins might not work. Could not set selinux context on socket dir", "path", m.socketdir, "err", err)
		}
	}

	// Removes all stale sockets in m.socketdir. Device plugins can monitor
	// this and use it as a signal to re-register with the new Kubelet.
	if err := m.removeContents(m.socketdir); err != nil {
		klog.ErrorS(err, "Fail to clean up stale content under socket dir", "path", m.socketdir)
	}

	s, err := net.Listen("unix", socketPath)
	if err != nil {
		klog.ErrorS(err, "Failed to listen to socket while starting device plugin registry")
		return err
	}

	m.wg.Add(1)
	m.server = grpc.NewServer([]grpc.ServerOption{}...)

	pluginapi.RegisterRegistrationServer(m.server, m)
	go func() {
		defer m.wg.Done()
		m.server.Serve(s)
	}()

	klog.V(2).InfoS("Serving device plugin registration server on socket", "path", socketPath)

	return nil
}

// GetWatcherHandler returns the plugin handler
func (m *ManagerImpl) GetWatcherHandler() cache.PluginHandler {
	if f, err := os.Create(m.socketdir + "DEPRECATION"); err != nil {
		klog.ErrorS(err, "Failed to create deprecation file at socket dir", "path", m.socketdir)
	} else {
		f.Close()
		klog.V(4).InfoS("Created deprecation file", "path", f.Name())
	}
	return cache.PluginHandler(m)
}

// ValidatePlugin validates a plugin if the version is correct and the name has the format of an extended resource
func (m *ManagerImpl) ValidatePlugin(pluginName string, endpoint string, versions []string) error {
	klog.V(2).InfoS("Got Plugin at endpoint with versions", "plugin", pluginName, "endpoint", endpoint, "versions", versions)

	if !m.isVersionCompatibleWithPlugin(versions) {
		return fmt.Errorf("manager version, %s, is not among plugin supported versions %v", pluginapi.Version, versions)
	}

	if !v1helper.IsExtendedResourceName(v1.ResourceName(pluginName)) {
		return fmt.Errorf("invalid name of device plugin socket: %s", fmt.Sprintf(errInvalidResourceName, pluginName))
	}

	return nil
}

// RegisterPlugin starts the endpoint and registers it
// TODO: Start the endpoint and wait for the First ListAndWatch call
// before registering the plugin
func (m *ManagerImpl) RegisterPlugin(pluginName string, endpoint string, versions []string) error {
	klog.V(2).InfoS("Registering plugin at endpoint", "plugin", pluginName, "endpoint", endpoint)

	e, err := newEndpointImpl(endpoint, pluginName, m.callback)
	if err != nil {
		return fmt.Errorf("failed to dial device plugin with socketPath %s: %v", endpoint, err)
	}

	options, err := e.client.GetDevicePluginOptions(context.Background(), &pluginapi.Empty{})
	if err != nil {
		return fmt.Errorf("failed to get device plugin options: %v", err)
	}

	m.registerEndpoint(pluginName, options, e)
	go m.runEndpoint(pluginName, e)

	return nil
}

// DeRegisterPlugin deregisters the plugin
// TODO work on the behavior for deregistering plugins
// e.g: Should we delete the resource
func (m *ManagerImpl) DeRegisterPlugin(pluginName string) {
	m.mutex.Lock()
	defer m.mutex.Unlock()

	// Note: This will mark the resource unhealthy as per the behavior
	// in runEndpoint
	if eI, ok := m.endpoints[pluginName]; ok {
		eI.e.stop()
	}
}
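// isVersionCompatibleWithPlugin returns true if at least one version advertised
// by the plugin is among the device plugin API versions supported by the manager.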
func (m *ManagerImpl) isVersionCompatibleWithPlugin(versions []string) bool {
	// TODO(vikasc): Currently this is fine as we only have a single supported version. When we do need to support
	// multiple versions in the future, we may need to extend this function to return a supported version.
	// E.g., say kubelet supports v1beta1 and v1beta2, and we get v1alpha1 and v1beta1 from a device plugin,
	// this function should return v1beta1
	for _, version := range versions {
		for _, supportedVersion := range pluginapi.SupportedVersions {
			if version == supportedVersion {
				return true
			}
		}
	}
	return false
}

// Allocate is the call that you can use to allocate a set of devices
// from the registered device plugins.
func (m *ManagerImpl) Allocate(pod *v1.Pod, container *v1.Container) error {
	// The pod is in the admission phase. We need to save the pod to avoid it
	// being cleaned up before admission has ended.
	m.setPodPendingAdmission(pod)

	if _, ok := m.devicesToReuse[string(pod.UID)]; !ok {
		m.devicesToReuse[string(pod.UID)] = make(map[string]sets.String)
	}
	// If pod entries to m.devicesToReuse other than the current pod exist, delete them.
	for podUID := range m.devicesToReuse {
		if podUID != string(pod.UID) {
			delete(m.devicesToReuse, podUID)
		}
	}
	// Allocate resources for init containers first as we know the caller always loops
	// through init containers before looping through app containers. Should the caller
	// ever change those semantics, this logic will need to be amended.
	for _, initContainer := range pod.Spec.InitContainers {
		if container.Name == initContainer.Name {
			if err := m.allocateContainerResources(pod, container, m.devicesToReuse[string(pod.UID)]); err != nil {
				return err
			}
			m.podDevices.addContainerAllocatedResources(string(pod.UID), container.Name, m.devicesToReuse[string(pod.UID)])
			return nil
		}
	}
	if err := m.allocateContainerResources(pod, container, m.devicesToReuse[string(pod.UID)]); err != nil {
		return err
	}
	m.podDevices.removeContainerAllocatedResources(string(pod.UID), container.Name, m.devicesToReuse[string(pod.UID)])
	return nil
}

// UpdatePluginResources updates node resources based on devices already allocated to pods.
func (m *ManagerImpl) UpdatePluginResources(node *schedulerframework.NodeInfo, attrs *lifecycle.PodAdmitAttributes) error {
	pod := attrs.Pod

	// quick return if no pluginResources requested
	if !m.podDevices.hasPod(string(pod.UID)) {
		return nil
	}

	m.sanitizeNodeAllocatable(node)
	return nil
}

// Register registers a device plugin.
func (m *ManagerImpl) Register(ctx context.Context, r *pluginapi.RegisterRequest) (*pluginapi.Empty, error) {
	klog.InfoS("Got registration request from device plugin with resource", "resourceName", r.ResourceName)
	metrics.DevicePluginRegistrationCount.WithLabelValues(r.ResourceName).Inc()
	var versionCompatible bool
	for _, v := range pluginapi.SupportedVersions {
		if r.Version == v {
			versionCompatible = true
			break
		}
	}
	if !versionCompatible {
		err := fmt.Errorf(errUnsupportedVersion, r.Version, pluginapi.SupportedVersions)
		klog.InfoS("Bad registration request from device plugin with resource", "resourceName", r.ResourceName, "err", err)
		return &pluginapi.Empty{}, err
	}

	if !v1helper.IsExtendedResourceName(v1.ResourceName(r.ResourceName)) {
		err := fmt.Errorf(errInvalidResourceName, r.ResourceName)
		klog.InfoS("Bad registration request from device plugin", "err", err)
		return &pluginapi.Empty{}, err
	}

	// TODO: for now, always accepts newest device plugin. Later may consider to
	// add some policies here, e.g., verify whether an old device plugin with the
	// same resource name is still alive to determine whether we want to accept
	// the new registration.
	go m.addEndpoint(r)

	return &pluginapi.Empty{}, nil
}

// Stop is the function that can stop the gRPC server.
// Can be called concurrently, more than once, and is safe to call
// without a prior Start.
func (m *ManagerImpl) Stop() error {
	m.mutex.Lock()
	defer m.mutex.Unlock()
	for _, eI := range m.endpoints {
		eI.e.stop()
	}

	if m.server == nil {
		return nil
	}
	m.server.Stop()
	m.wg.Wait()
	m.server = nil
	return nil
}
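// registerEndpoint records the endpoint and its plugin options under the
// given resource name so the manager can route later calls (allocate,
// preStartContainer, getPreferredAllocation) to it.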
func (m *ManagerImpl) registerEndpoint(resourceName string, options *pluginapi.DevicePluginOptions, e endpoint) {
	m.mutex.Lock()
	defer m.mutex.Unlock()

	m.endpoints[resourceName] = endpointInfo{e: e, opts: options}
	klog.V(2).InfoS("Registered endpoint", "endpoint", e)
}
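// runEndpoint blocks while the endpoint is serving device updates and, once it
// returns, stops the endpoint and marks its resource unhealthy if it is still
// the registered endpoint for that resource.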
func (m *ManagerImpl) runEndpoint(resourceName string, e endpoint) {
	e.run()
	e.stop()

	m.mutex.Lock()
	defer m.mutex.Unlock()

	if old, ok := m.endpoints[resourceName]; ok && old.e == e {
		m.markResourceUnhealthy(resourceName)
	}

	klog.V(2).InfoS("Endpoint became unhealthy", "resourceName", resourceName, "endpoint", e)
}
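// addEndpoint dials the device plugin named in the registration request,
// registers the resulting endpoint, and starts serving it in a goroutine.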
func (m *ManagerImpl) addEndpoint(r *pluginapi.RegisterRequest) {
	new, err := newEndpointImpl(filepath.Join(m.socketdir, r.Endpoint), r.ResourceName, m.callback)
	if err != nil {
		klog.ErrorS(err, "Failed to dial device plugin with request", "request", r)
		return
	}
	m.registerEndpoint(r.ResourceName, r.Options, new)
	go func() {
		m.runEndpoint(r.ResourceName, new)
	}()
}
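// markResourceUnhealthy moves all devices of the given resource from the
// healthy set into the unhealthy set.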
func (m *ManagerImpl) markResourceUnhealthy(resourceName string) {
	klog.V(2).InfoS("Mark all resources Unhealthy for resource", "resourceName", resourceName)
	healthyDevices := sets.NewString()
	if _, ok := m.healthyDevices[resourceName]; ok {
		healthyDevices = m.healthyDevices[resourceName]
		m.healthyDevices[resourceName] = sets.NewString()
	}
	if _, ok := m.unhealthyDevices[resourceName]; !ok {
		m.unhealthyDevices[resourceName] = sets.NewString()
	}
	m.unhealthyDevices[resourceName] = m.unhealthyDevices[resourceName].Union(healthyDevices)
}

// GetCapacity is expected to be called when Kubelet updates its node status.
// The first returned variable contains the registered device plugin resource capacity.
// The second returned variable contains the registered device plugin resource allocatable.
// The third returned variable contains previously registered resources that are no longer active.
// Kubelet uses this information to update resource capacity/allocatable in its node status.
// After the call, device plugin can remove the inactive resources from its internal list as the
// change is already reflected in Kubelet node status.
// Note in the special case after Kubelet restarts, device plugin resource capacities can
// temporarily drop to zero till corresponding device plugins re-register. This is OK because
// cm.UpdatePluginResources() run during predicate Admit guarantees we adjust nodeinfo
// capacity for already allocated pods so that they can continue to run. However, new pods
// requiring device plugin resources will not be scheduled till device plugin re-registers.
func (m *ManagerImpl) GetCapacity() (v1.ResourceList, v1.ResourceList, []string) {
	needsUpdateCheckpoint := false
	var capacity = v1.ResourceList{}
	var allocatable = v1.ResourceList{}
	deletedResources := sets.NewString()
	m.mutex.Lock()
	for resourceName, devices := range m.healthyDevices {
		eI, ok := m.endpoints[resourceName]
		if (ok && eI.e.stopGracePeriodExpired()) || !ok {
			// The resources contained in endpoints and (un)healthyDevices
			// should always be consistent. Otherwise, we run with the risk
			// of failing to garbage collect non-existing resources or devices.
			if !ok {
				klog.ErrorS(nil, "Unexpected: healthyDevices and endpoints are out of sync")
			}
			delete(m.endpoints, resourceName)
			delete(m.healthyDevices, resourceName)
			deletedResources.Insert(resourceName)
			needsUpdateCheckpoint = true
		} else {
			capacity[v1.ResourceName(resourceName)] = *resource.NewQuantity(int64(devices.Len()), resource.DecimalSI)
			allocatable[v1.ResourceName(resourceName)] = *resource.NewQuantity(int64(devices.Len()), resource.DecimalSI)
		}
	}
	for resourceName, devices := range m.unhealthyDevices {
		eI, ok := m.endpoints[resourceName]
		if (ok && eI.e.stopGracePeriodExpired()) || !ok {
			if !ok {
				klog.ErrorS(nil, "Unexpected: unhealthyDevices and endpoints are out of sync")
			}
			delete(m.endpoints, resourceName)
			delete(m.unhealthyDevices, resourceName)
			deletedResources.Insert(resourceName)
			needsUpdateCheckpoint = true
		} else {
			capacityCount := capacity[v1.ResourceName(resourceName)]
			unhealthyCount := *resource.NewQuantity(int64(devices.Len()), resource.DecimalSI)
			capacityCount.Add(unhealthyCount)
			capacity[v1.ResourceName(resourceName)] = capacityCount
		}
	}
	m.mutex.Unlock()
	if needsUpdateCheckpoint {
		if err := m.writeCheckpoint(); err != nil {
			klog.ErrorS(err, "Error on writing checkpoint")
		}
	}
	return capacity, allocatable, deletedResources.UnsortedList()
}

// writeCheckpoint checkpoints device to container allocation information to disk.
func (m *ManagerImpl) writeCheckpoint() error {
	m.mutex.Lock()
	registeredDevs := make(map[string][]string)
	for resource, devices := range m.healthyDevices {
		registeredDevs[resource] = devices.UnsortedList()
	}
	data := checkpoint.New(m.podDevices.toCheckpointData(),
		registeredDevs)
	m.mutex.Unlock()
	err := m.checkpointManager.CreateCheckpoint(kubeletDeviceManagerCheckpoint, data)
	if err != nil {
		err2 := fmt.Errorf("failed to write checkpoint file %q: %v", kubeletDeviceManagerCheckpoint, err)
		klog.InfoS("Failed to write checkpoint file", "err", err)
		return err2
	}
	return nil
}

// readCheckpoint reads device to container allocation information from disk, and populates
// m.allocatedDevices accordingly.
func (m *ManagerImpl) readCheckpoint() error {
	registeredDevs := make(map[string][]string)
	devEntries := make([]checkpoint.PodDevicesEntry, 0)
	cp := checkpoint.New(devEntries, registeredDevs)
	err := m.checkpointManager.GetCheckpoint(kubeletDeviceManagerCheckpoint, cp)
	if err != nil {
		if err == errors.ErrCheckpointNotFound {
			klog.InfoS("Failed to retrieve checkpoint", "checkpoint", kubeletDeviceManagerCheckpoint, "err", err)
			return nil
		}
		return err
	}
	m.mutex.Lock()
	defer m.mutex.Unlock()
	podDevices, registeredDevs := cp.GetData()
	m.podDevices.fromCheckpointData(podDevices)
	m.allocatedDevices = m.podDevices.devices()
	for resource := range registeredDevs {
		// During start up, creates empty healthyDevices list so that the resource capacity
		// will stay zero till the corresponding device plugin re-registers.
		m.healthyDevices[resource] = sets.NewString()
		m.unhealthyDevices[resource] = sets.NewString()
		m.endpoints[resource] = endpointInfo{e: newStoppedEndpointImpl(resource), opts: nil}
	}
	return nil
}

// UpdateAllocatedDevices frees any Devices that are bound to terminated pods.
func (m *ManagerImpl) UpdateAllocatedDevices() {
	if !m.sourcesReady.AllReady() {
		return
	}

	m.mutex.Lock()
	defer m.mutex.Unlock()

	activeAndAdmittedPods := m.activePods()
	if m.pendingAdmissionPod != nil {
		activeAndAdmittedPods = append(activeAndAdmittedPods, m.pendingAdmissionPod)
	}

	podsToBeRemoved := m.podDevices.pods()
	for _, pod := range activeAndAdmittedPods {
		podsToBeRemoved.Delete(string(pod.UID))
	}
	if len(podsToBeRemoved) <= 0 {
		return
	}
	klog.V(3).InfoS("Pods to be removed", "podUIDs", podsToBeRemoved.List())
	m.podDevices.delete(podsToBeRemoved.List())
	// Regenerate allocatedDevices after we update pod allocation information.
	m.allocatedDevices = m.podDevices.devices()
}

// devicesToAllocate returns the list of device IDs we need to allocate with an Allocate rpc call,
// and an empty list in case we don't need to issue the Allocate rpc call.
func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, required int, reusableDevices sets.String) (sets.String, error) {
	m.mutex.Lock()
	defer m.mutex.Unlock()
	needed := required
	// Gets list of devices that have already been allocated.
	// This can happen if a container restarts for example.
	devices := m.podDevices.containerDevices(podUID, contName, resource)
	if devices != nil {
		klog.V(3).InfoS("Found pre-allocated devices for resource on pod", "resourceName", resource, "containerName", contName, "podUID", string(podUID), "devices", devices.List())
		needed = needed - devices.Len()
		// A pod's resource is not expected to change once admitted by the API server,
		// so just fail loudly here. We can revisit this part if this no longer holds.
		if needed != 0 {
			return nil, fmt.Errorf("pod %q container %q changed request for resource %q from %d to %d", string(podUID), contName, resource, devices.Len(), required)
		}
	}
	if needed == 0 {
		// No change, no work.
		return nil, nil
	}
	klog.V(3).InfoS("Need devices to allocate for pod", "deviceNumber", needed, "resourceName", resource, "podUID", string(podUID), "containerName", contName)
	// Check if resource registered with devicemanager
	if _, ok := m.healthyDevices[resource]; !ok {
		return nil, fmt.Errorf("can't allocate unregistered device %s", resource)
	}

	// Declare the list of allocated devices.
	// This will be populated and returned below.
	allocated := sets.NewString()

	// Create a closure to help with device allocation
	// Returns 'true' once no more devices need to be allocated.
	allocateRemainingFrom := func(devices sets.String) bool {
		for device := range devices.Difference(allocated) {
			m.allocatedDevices[resource].Insert(device)
			allocated.Insert(device)
			needed--
			if needed == 0 {
				return true
			}
		}
		return false
	}

	// Allocates from reusableDevices list first.
	if allocateRemainingFrom(reusableDevices) {
		return allocated, nil
	}

	// Needs to allocate additional devices.
	if m.allocatedDevices[resource] == nil {
		m.allocatedDevices[resource] = sets.NewString()
	}

	// Gets Devices in use.
	devicesInUse := m.allocatedDevices[resource]
	// Gets Available devices.
	available := m.healthyDevices[resource].Difference(devicesInUse)
	if available.Len() < needed {
		return nil, fmt.Errorf("requested number of devices unavailable for %s. Requested: %d, Available: %d", resource, needed, available.Len())
	}

	// Filters available Devices based on NUMA affinity.
	aligned, unaligned, noAffinity := m.filterByAffinity(podUID, contName, resource, available)

	// If we can allocate all remaining devices from the set of aligned ones, then
	// give the plugin the chance to influence which ones to allocate from that set.
	if needed < aligned.Len() {
		// First allocate from the preferred devices list (if available).
		preferred, err := m.callGetPreferredAllocationIfAvailable(podUID, contName, resource, aligned.Union(allocated), allocated, required)
		if err != nil {
			return nil, err
		}
		if allocateRemainingFrom(preferred.Intersection(aligned)) {
			return allocated, nil
		}
		// Then fallback to allocate from the aligned set if no preferred list
		// is returned (or not enough devices are returned in that list).
		if allocateRemainingFrom(aligned) {
			return allocated, nil
		}
		return nil, fmt.Errorf("unexpectedly allocated less resources than required. Requested: %d, Got: %d", required, required-needed)
	}

	// If we can't allocate all remaining devices from the set of aligned ones,
	// then start by first allocating all of the aligned devices (to ensure
	// that the alignment guaranteed by the TopologyManager is honored).
	if allocateRemainingFrom(aligned) {
		return allocated, nil
	}

	// Then give the plugin the chance to influence the decision on any
	// remaining devices to allocate.
	preferred, err := m.callGetPreferredAllocationIfAvailable(podUID, contName, resource, available.Union(allocated), allocated, required)
	if err != nil {
		return nil, err
	}
	if allocateRemainingFrom(preferred.Intersection(available)) {
		return allocated, nil
	}

	// Finally, if the plugin did not return a preferred allocation (or didn't
	// return a large enough one), then fall back to allocating the remaining
	// devices from the 'unaligned' and 'noAffinity' sets.
	if allocateRemainingFrom(unaligned) {
		return allocated, nil
	}
	if allocateRemainingFrom(noAffinity) {
		return allocated, nil
	}

	return nil, fmt.Errorf("unexpectedly allocated less resources than required. Requested: %d, Got: %d", required, required-needed)
}
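// filterByAffinity splits the available devices for a resource into three sets:
// devices on NUMA nodes contained in the topology hint's affinity mask, devices
// on NUMA nodes outside that mask, and devices with no NUMA association at all.
// If no alignment information is available, everything lands in the last set.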
func (m *ManagerImpl) filterByAffinity(podUID, contName, resource string, available sets.String) (sets.String, sets.String, sets.String) {
	// If alignment information is not available, just pass the available list back.
	hint := m.topologyAffinityStore.GetAffinity(podUID, contName)
	if !m.deviceHasTopologyAlignment(resource) || hint.NUMANodeAffinity == nil {
		return sets.NewString(), sets.NewString(), available
	}

	// Build a map of NUMA Nodes to the devices associated with them. A
	// device may be associated with multiple NUMA nodes at the same time. If an
	// available device does not have any NUMA Nodes associated with it, add it
	// to a list of NUMA Nodes for the fake NUMANode -1.
	perNodeDevices := make(map[int]sets.String)
	nodeWithoutTopology := -1
	for d := range available {
		if m.allDevices[resource][d].Topology == nil || len(m.allDevices[resource][d].Topology.Nodes) == 0 {
			if _, ok := perNodeDevices[nodeWithoutTopology]; !ok {
				perNodeDevices[nodeWithoutTopology] = sets.NewString()
			}
			perNodeDevices[nodeWithoutTopology].Insert(d)
			continue
		}

		for _, node := range m.allDevices[resource][d].Topology.Nodes {
			if _, ok := perNodeDevices[int(node.ID)]; !ok {
				perNodeDevices[int(node.ID)] = sets.NewString()
			}
			perNodeDevices[int(node.ID)].Insert(d)
		}
	}

	// Get a flat list of all of the nodes associated with available devices.
	var nodes []int
	for node := range perNodeDevices {
		nodes = append(nodes, node)
	}

	// Sort the list of nodes by:
	// 1) Nodes contained in the hint's affinity set
	// 2) Nodes not contained in the hint's affinity set
	// 3) The fake NUMANode of -1 (assuming it is included in the list)
	// Within each of the groups above, sort the nodes by how many devices they contain
	sort.Slice(nodes, func(i, j int) bool {
		// If one or the other of nodes[i] or nodes[j] is in the hint's affinity set
		if hint.NUMANodeAffinity.IsSet(nodes[i]) && hint.NUMANodeAffinity.IsSet(nodes[j]) {
			return perNodeDevices[nodes[i]].Len() < perNodeDevices[nodes[j]].Len()
		}
		if hint.NUMANodeAffinity.IsSet(nodes[i]) {
			return true
		}
		if hint.NUMANodeAffinity.IsSet(nodes[j]) {
			return false
		}

		// If one or the other of nodes[i] or nodes[j] is the fake NUMA node -1 (they can't both be)
		if nodes[i] == nodeWithoutTopology {
			return false
		}
		if nodes[j] == nodeWithoutTopology {
			return true
		}

		// Otherwise both nodes[i] and nodes[j] are real NUMA nodes that are not in the hint's affinity list.
		return perNodeDevices[nodes[i]].Len() < perNodeDevices[nodes[j]].Len()
	})

	// Generate three sorted lists of devices. Devices in the first list come
	// from valid NUMA Nodes contained in the affinity mask. Devices in the
	// second list come from valid NUMA Nodes not in the affinity mask. Devices
	// in the third list come from devices with no NUMA Node association (i.e.
	// those mapped to the fake NUMA Node -1). Because we loop through the
	// sorted list of NUMA nodes in order, within each list, devices are sorted
	// by their connection to NUMA Nodes with more devices on them.
	var fromAffinity []string
	var notFromAffinity []string
	var withoutTopology []string
	for d := range available {
		// Since the same device may be associated with multiple NUMA Nodes, we
		// need to be careful not to add each device to multiple lists. The
		// logic below ensures this by breaking after the first NUMA node that
		// has the device is encountered.
		for _, n := range nodes {
			if perNodeDevices[n].Has(d) {
				if n == nodeWithoutTopology {
					withoutTopology = append(withoutTopology, d)
				} else if hint.NUMANodeAffinity.IsSet(n) {
					fromAffinity = append(fromAffinity, d)
				} else {
					notFromAffinity = append(notFromAffinity, d)
				}
				break
			}
		}
	}

	// Return all three lists containing the full set of devices across them.
	return sets.NewString(fromAffinity...), sets.NewString(notFromAffinity...), sets.NewString(withoutTopology...)
}

// allocateContainerResources attempts to allocate all of required device
// plugin resources for the input container, issues an Allocate rpc request
// for each new device resource requirement, processes their AllocateResponses,
// and updates the cached containerDevices on success.
func (m *ManagerImpl) allocateContainerResources(pod *v1.Pod, container *v1.Container, devicesToReuse map[string]sets.String) error {
	podUID := string(pod.UID)
	contName := container.Name
	allocatedDevicesUpdated := false
	needsUpdateCheckpoint := false
	// Extended resources are not allowed to be overcommitted.
	// Since device plugins advertise extended resources,
	// Requests must be equal to Limits and iterating
	// over the Limits should be sufficient.
	for k, v := range container.Resources.Limits {
		resource := string(k)
		needed := int(v.Value())
		klog.V(3).InfoS("Looking for needed resources", "needed", needed, "resourceName", resource)
		if !m.isDevicePluginResource(resource) {
			continue
		}
		// Updates allocatedDevices to garbage collect any stranded resources
		// before doing the device plugin allocation.
		if !allocatedDevicesUpdated {
			m.UpdateAllocatedDevices()
			allocatedDevicesUpdated = true
		}
		allocDevices, err := m.devicesToAllocate(podUID, contName, resource, needed, devicesToReuse[resource])
		if err != nil {
			return err
		}
		if allocDevices == nil || len(allocDevices) <= 0 {
			continue
		}

		needsUpdateCheckpoint = true

		startRPCTime := time.Now()
		// Manager.Allocate involves RPC calls to device plugin, which
		// could be heavy-weight. Therefore we want to perform this operation outside
		// the mutex lock. Note if Allocate call fails, we may leave container resources
		// partially allocated for the failed container. We rely on UpdateAllocatedDevices()
		// to garbage collect these resources later. Another side effect is that if
		// we have X resource A and Y resource B in total, and two containers, container1
		// and container2 both require X resource A and Y resource B. Both allocation
		// requests may fail if we serve them in mixed order.
		// TODO: may revisit this part later if we see inefficient resource allocation
		// in real use as the result of this. Should also consider parallelizing device
		// plugin Allocate grpc calls if it becomes common that a container may require
		// resources from multiple device plugins.
		m.mutex.Lock()
		eI, ok := m.endpoints[resource]
		m.mutex.Unlock()
		if !ok {
			m.mutex.Lock()
			m.allocatedDevices = m.podDevices.devices()
			m.mutex.Unlock()
			return fmt.Errorf("unknown Device Plugin %s", resource)
		}

		devs := allocDevices.UnsortedList()
		// TODO: refactor this part of code to just append a ContainerAllocationRequest
		// in a passed in AllocateRequest pointer, and issues a single Allocate call per pod.
		klog.V(3).InfoS("Making allocation request for device plugin", "devices", devs, "resourceName", resource)
		resp, err := eI.e.allocate(devs)
		metrics.DevicePluginAllocationDuration.WithLabelValues(resource).Observe(metrics.SinceInSeconds(startRPCTime))
		if err != nil {
			// In case of allocation failure, we want to restore m.allocatedDevices
			// to the actual allocated state from m.podDevices.
			m.mutex.Lock()
			m.allocatedDevices = m.podDevices.devices()
			m.mutex.Unlock()
			return err
		}

		if len(resp.ContainerResponses) == 0 {
			return fmt.Errorf("no containers return in allocation response %v", resp)
		}

		allocDevicesWithNUMA := checkpoint.NewDevicesPerNUMA()
		// Update internal cached podDevices state.
		m.mutex.Lock()
		for dev := range allocDevices {
			if m.allDevices[resource][dev].Topology == nil || len(m.allDevices[resource][dev].Topology.Nodes) == 0 {
				allocDevicesWithNUMA[0] = append(allocDevicesWithNUMA[0], dev)
				continue
			}
			for idx := range m.allDevices[resource][dev].Topology.Nodes {
				node := m.allDevices[resource][dev].Topology.Nodes[idx]
				allocDevicesWithNUMA[node.ID] = append(allocDevicesWithNUMA[node.ID], dev)
			}
		}
		m.mutex.Unlock()
		m.podDevices.insert(podUID, contName, resource, allocDevicesWithNUMA, resp.ContainerResponses[0])
	}

	if needsUpdateCheckpoint {
		return m.writeCheckpoint()
	}

	return nil
}

// GetDeviceRunContainerOptions checks whether we have cached containerDevices
// for the passed-in <pod, container> and returns its DeviceRunContainerOptions
// for the found one. An empty struct is returned in case no cached state is found.
func (m *ManagerImpl) GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Container) (*DeviceRunContainerOptions, error) {
	podUID := string(pod.UID)
	contName := container.Name
	needsReAllocate := false
	for k, v := range container.Resources.Limits {
		resource := string(k)
		if !m.isDevicePluginResource(resource) || v.Value() == 0 {
			continue
		}
		err := m.callPreStartContainerIfNeeded(podUID, contName, resource)
		if err != nil {
			return nil, err
		}
		// This is a device plugin resource yet we don't have cached
		// resource state. This is likely due to a race during node
		// restart. We re-issue allocate request to cover this race.
		if m.podDevices.containerDevices(podUID, contName, resource) == nil {
			needsReAllocate = true
		}
	}
	if needsReAllocate {
		klog.V(2).InfoS("Needs to re-allocate device plugin resources for pod", "pod", klog.KObj(pod), "containerName", container.Name)
		if err := m.Allocate(pod, container); err != nil {
			return nil, err
		}
	}
	return m.podDevices.deviceRunContainerOptions(string(pod.UID), container.Name), nil
}

// callPreStartContainerIfNeeded issues PreStartContainer grpc call for device plugin resource
// with PreStartRequired option set.
func (m *ManagerImpl) callPreStartContainerIfNeeded(podUID, contName, resource string) error {
	m.mutex.Lock()
	eI, ok := m.endpoints[resource]
	if !ok {
		m.mutex.Unlock()
		return fmt.Errorf("endpoint not found in cache for a registered resource: %s", resource)
	}

	if eI.opts == nil || !eI.opts.PreStartRequired {
		m.mutex.Unlock()
		klog.V(4).InfoS("Plugin options indicate to skip PreStartContainer for resource", "resourceName", resource)
		return nil
	}

	devices := m.podDevices.containerDevices(podUID, contName, resource)
	if devices == nil {
		m.mutex.Unlock()
		return fmt.Errorf("no devices found allocated in local cache for pod %s, container %s, resource %s", string(podUID), contName, resource)
	}

	m.mutex.Unlock()
	devs := devices.UnsortedList()
	klog.V(4).InfoS("Issuing a PreStartContainer call for container", "containerName", contName, "podUID", string(podUID))
	_, err := eI.e.preStartContainer(devs)
	if err != nil {
		return fmt.Errorf("device plugin PreStartContainer rpc failed with err: %v", err)
	}
	// TODO: Add metrics support for init RPC
	return nil
}

// callGetPreferredAllocationIfAvailable issues GetPreferredAllocation grpc
// call for device plugin resource with GetPreferredAllocationAvailable option set.
func (m *ManagerImpl) callGetPreferredAllocationIfAvailable(podUID, contName, resource string, available, mustInclude sets.String, size int) (sets.String, error) {
	eI, ok := m.endpoints[resource]
	if !ok {
		return nil, fmt.Errorf("endpoint not found in cache for a registered resource: %s", resource)
	}

	if eI.opts == nil || !eI.opts.GetPreferredAllocationAvailable {
		klog.V(4).InfoS("Plugin options indicate to skip GetPreferredAllocation for resource", "resourceName", resource)
		return nil, nil
	}

	m.mutex.Unlock()
	klog.V(4).InfoS("Issuing a GetPreferredAllocation call for container", "containerName", contName, "podUID", string(podUID))
	resp, err := eI.e.getPreferredAllocation(available.UnsortedList(), mustInclude.UnsortedList(), size)
	m.mutex.Lock()
	if err != nil {
		return nil, fmt.Errorf("device plugin GetPreferredAllocation rpc failed with err: %v", err)
	}
	if resp != nil && len(resp.ContainerResponses) > 0 {
		return sets.NewString(resp.ContainerResponses[0].DeviceIDs...), nil
	}
	return sets.NewString(), nil
}

// sanitizeNodeAllocatable scans through allocatedDevices in the device manager
// and if necessary, updates allocatableResource in nodeInfo to at least equal to
// the allocated capacity. This allows pods that have already been scheduled on
// the node to pass GeneralPredicates admission checking even upon device plugin failure.
func (m *ManagerImpl) sanitizeNodeAllocatable(node *schedulerframework.NodeInfo) {
	var newAllocatableResource *schedulerframework.Resource
	allocatableResource := node.Allocatable
	if allocatableResource.ScalarResources == nil {
		allocatableResource.ScalarResources = make(map[v1.ResourceName]int64)
	}

	m.mutex.Lock()
	defer m.mutex.Unlock()
	for resource, devices := range m.allocatedDevices {
		needed := devices.Len()
		quant, ok := allocatableResource.ScalarResources[v1.ResourceName(resource)]
		if ok && int(quant) >= needed {
			continue
		}
		// Needs to update nodeInfo.AllocatableResource to make sure
		// NodeInfo.allocatableResource at least equal to the capacity already allocated.
		if newAllocatableResource == nil {
			newAllocatableResource = allocatableResource.Clone()
		}
		newAllocatableResource.ScalarResources[v1.ResourceName(resource)] = int64(needed)
	}
	if newAllocatableResource != nil {
		node.Allocatable = newAllocatableResource
	}
}

func (m *ManagerImpl) isDevicePluginResource(resource string) bool {
	m.mutex.Lock()
	defer m.mutex.Unlock()
	_, registeredResource := m.healthyDevices[resource]
	_, allocatedResource := m.allocatedDevices[resource]
	// Return true if this is either an active device plugin resource or
	// a resource we have previously allocated.
	if registeredResource || allocatedResource {
		return true
	}

	return false
}

// GetAllocatableDevices returns information about all the devices known to the manager
func (m *ManagerImpl) GetAllocatableDevices() ResourceDeviceInstances {
	m.mutex.Lock()
	resp := m.allDevices.Clone()
	m.mutex.Unlock()
	klog.V(4).InfoS("Known devices", "numDevices", len(resp))
	return resp
}

// GetDevices returns the devices used by the specified container
func (m *ManagerImpl) GetDevices(podUID, containerName string) ResourceDeviceInstances {
	return m.podDevices.getContainerDevices(podUID, containerName)
}

// ShouldResetExtendedResourceCapacity returns whether the extended resources should be zeroed or not,
// depending on whether the node has been recreated. Absence of the checkpoint file strongly indicates the node
// has been recreated.
func (m *ManagerImpl) ShouldResetExtendedResourceCapacity() bool {
	if utilfeature.DefaultFeatureGate.Enabled(features.DevicePlugins) {
		checkpoints, err := m.checkpointManager.ListCheckpoints()
		if err != nil {
			return false
		}
		return len(checkpoints) == 0
	}
	return false
}
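// setPodPendingAdmission records the pod currently going through admission so
// that UpdateAllocatedDevices does not garbage collect its device allocation
// before admission has completed.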
func (m *ManagerImpl) setPodPendingAdmission(pod *v1.Pod) {
	m.mutex.Lock()
	defer m.mutex.Unlock()

	m.pendingAdmissionPod = pod
}