Merge pull request #44124 from vmware/VSANPolicySupportPVCScaleCreationFix

Automatic merge from submit-queue (batch tested with PRs 44124, 44510)

Optimize the time taken to create Persistent volumes with VSAN storage capabilities at scale and handle VPXD crashes

Currently, creating persistent volumes with VSAN storage capabilities at scale takes a very large amount of time. We have tested at a scale of 500-600 PVCs, and it takes a long time for all the PVC requests to go from the Pending state to the Bound state.

- In our current design, we use a single system VM, "kubernetes-helper-vm", as the means to create a persistent volume with the VSAN policy configured.

- Since all the operations are on a single system VM, all requests at scale get queued and executed serially on this system VM. Because of this, creating a large number of PVCs takes a very long time.

- Since it's a single system VM, parallel PVC requests most of the time tend to take the same SCSI adapter on the system VM and also the same unit number on that SCSI adapter. Therefore the error rate is high.

In order to overcome these issues and to optimize the time taken to create persistent volumes with VSAN storage capabilities at scale, we have slightly modified the design, as described below:

- In this model, we create a VM on the fly for every persistent volume that is being created. Since all the reconfigure operations to create a disk with the VSAN policy configured are on their individual VMs, all of these PVC requests execute in parallel, independently of one another.

- With this new design, there will be no error rate at all.

Also, we have overcome the problem of vpxd crashes and any other intermediate problems by checking the type of the errors.


@kerneltime  @tusharnt @divyenpatel @pdhamdhere

**Release note**:

Kubernetes Submit Queue 2017-04-27 16:14:56 -07:00 committed by GitHub
commit 684df6e421
1 changed files with 231 additions and 75 deletions

View File

@ -28,6 +28,7 @@ import (
@ -51,27 +52,33 @@ import (
const (
ProviderName = "vsphere"
ActivePowerState = "poweredOn"
SCSIControllerType = "scsi"
LSILogicControllerType = "lsiLogic"
BusLogicControllerType = "busLogic"
PVSCSIControllerType = "pvscsi"
LSILogicSASControllerType = "lsiLogic-sas"
SCSIControllerLimit = 4
SCSIControllerDeviceLimit = 15
SCSIDeviceSlots = 16
SCSIReservedSlot = 7
ThinDiskType = "thin"
PreallocatedDiskType = "preallocated"
EagerZeroedThickDiskType = "eagerZeroedThick"
ZeroedThickDiskType = "zeroedThick"
VolDir = "kubevols"
RoundTripperDefaultCount = 3
DummyVMName = "kubernetes-helper-vm"
VSANDatastoreType = "vsan"
MAC_OUI_VC = "00:50:56"
MAC_OUI_ESX = "00:0c:29"
ProviderName = "vsphere"
ActivePowerState = "poweredOn"
SCSIControllerType = "scsi"
LSILogicControllerType = "lsiLogic"
BusLogicControllerType = "busLogic"
PVSCSIControllerType = "pvscsi"
LSILogicSASControllerType = "lsiLogic-sas"
SCSIControllerLimit = 4
SCSIControllerDeviceLimit = 15
SCSIDeviceSlots = 16
SCSIReservedSlot = 7
ThinDiskType = "thin"
PreallocatedDiskType = "preallocated"
EagerZeroedThickDiskType = "eagerZeroedThick"
ZeroedThickDiskType = "zeroedThick"
VolDir = "kubevols"
RoundTripperDefaultCount = 3
DummyVMPrefixName = "vsphere-k8s"
VSANDatastoreType = "vsan"
MAC_OUI_VC = "00:50:56"
MAC_OUI_ESX = "00:0c:29"
DiskNotFoundErrMsg = "No vSphere disk ID found"
NoDiskUUIDFoundErrMsg = "No disk UUID found"
NoDevicesFoundErrMsg = "No devices found"
NonSupportedControllerTypeErrMsg = "Disk is attached to non-supported controller type"
FileAlreadyExistErrMsg = "File requested already exist"
CleanUpDummyVMRoutine_Interval = 5
// Controller types that are currently supported for hot attach of disks
@ -91,14 +98,17 @@ var diskFormatValidType = map[string]string{
var DiskformatValidOptions = generateDiskFormatValidOptions()
var cleanUpRoutineInitialized = false
var ErrNoDiskUUIDFound = errors.New("No disk UUID found")
var ErrNoDiskIDFound = errors.New("No vSphere disk ID found")
var ErrNoDevicesFound = errors.New("No devices found")
var ErrNonSupportedControllerType = errors.New("Disk is attached to non-supported controller type")
var ErrFileAlreadyExist = errors.New("File requested already exist")
var ErrNoDiskUUIDFound = errors.New(NoDiskUUIDFoundErrMsg)
var ErrNoDiskIDFound = errors.New(DiskNotFoundErrMsg)
var ErrNoDevicesFound = errors.New(NoDevicesFoundErrMsg)
var ErrNonSupportedControllerType = errors.New(NonSupportedControllerTypeErrMsg)
var ErrFileAlreadyExist = errors.New(FileAlreadyExistErrMsg)
var clientLock sync.Mutex
var cleanUpRoutineInitLock sync.Mutex
var cleanUpDummyVMLock sync.RWMutex
// VSphere is an implementation of cloud provider Interface for VSphere.
type VSphere struct {
@ -1239,6 +1249,7 @@ func (vs *VSphere) CreateVolume(volumeOptions *VolumeOptions) (volumePath string
// 1. Create dummy VM if not already present.
// 2. Add a new disk to the VM by performing VM reconfigure.
// 3. Detach the new disk from the dummy VM.
// 4. Delete the dummy VM.
if volumeOptions.StorageProfileData != "" {
// Check if the datastore is VSAN if any capability requirements are specified.
// VSphere cloud provider now only supports VSAN capabilities requirements
@ -1253,13 +1264,27 @@ func (vs *VSphere) CreateVolume(volumeOptions *VolumeOptions) (volumePath string
" So, please specify a valid VSAN datastore in Storage class definition.", datastore)
// Check if the DummyVM exists in kubernetes cluster folder.
// Acquire a read lock to ensure multiple PVC requests can be processed simultaneously.
defer cleanUpDummyVMLock.RUnlock()
// Create a new background routine that will delete any dummy VM's that are left stale.
// This routine runs every 5 minutes and is started only once.
if !cleanUpRoutineInitialized {
go vs.cleanUpDummyVMs(DummyVMPrefixName)
cleanUpRoutineInitialized = true
// Check if the VM exists in kubernetes cluster folder.
// The kubernetes cluster folder - vs.cfg.Global.WorkingDir is where all the nodes in the kubernetes cluster are created.
vmRegex := vs.cfg.Global.WorkingDir + DummyVMName
dummyVMFullName := DummyVMPrefixName + "-" + volumeOptions.Name
vmRegex := vs.cfg.Global.WorkingDir + dummyVMFullName
dummyVM, err := f.VirtualMachine(ctx, vmRegex)
if err != nil {
// 1. Create dummy VM and return the VM reference.
dummyVM, err = vs.createDummyVM(ctx, dc, ds)
// 1. Create a dummy VM and return the VM reference.
dummyVM, err = vs.createDummyVM(ctx, dc, ds, dummyVMFullName)
if err != nil {
return "", err
@ -1267,17 +1292,37 @@ func (vs *VSphere) CreateVolume(volumeOptions *VolumeOptions) (volumePath string
// 2. Reconfigure the VM to attach the disk with the VSAN policy configured.
vmDiskPath, err := vs.createVirtualDiskWithPolicy(ctx, dc, ds, dummyVM, volumeOptions)
fileAlreadyExist := false
if err != nil {
glog.Errorf("Failed to attach the disk to VM: %q with err: %+v", DummyVMName, err)
return "", err
vmDiskPath = filepath.Clean(ds.Path(VolDir)) + "/" + volumeOptions.Name + ".vmdk"
errorMessage := fmt.Sprintf("Cannot complete the operation because the file or folder %s already exists", vmDiskPath)
if errorMessage == err.Error() {
//Skip error and continue to detach the disk as the disk was already created on the datastore.
fileAlreadyExist = true
glog.V(1).Infof("File: %v already exists", vmDiskPath)
} else {
glog.Errorf("Failed to attach the disk to VM: %q with err: %+v", dummyVMFullName, err)
return "", err
dummyVMNodeName := vmNameToNodeName(DummyVMName)
dummyVMNodeName := vmNameToNodeName(dummyVMFullName)
// 3. Detach the disk from the dummy VM.
err = vs.DetachDisk(vmDiskPath, dummyVMNodeName)
if err != nil {
glog.Errorf("Failed to detach the disk: %q from VM: %q with err: %+v", vmDiskPath, DummyVMName, err)
return "", fmt.Errorf("Failed to create the volume: %q with err: %+v", volumeOptions.Name, err)
if DiskNotFoundErrMsg == err.Error() && fileAlreadyExist {
// Skip error if disk was already detached from the dummy VM but still present on the datastore.
glog.V(1).Infof("File: %v is already detached", vmDiskPath)
} else {
glog.Errorf("Failed to detach the disk: %q from VM: %q with err: %+v", vmDiskPath, dummyVMFullName, err)
return "", fmt.Errorf("Failed to create the volume: %q with err: %+v", volumeOptions.Name, err)
// 4. Delete the dummy VM
err = deleteVM(ctx, dummyVM)
if err != nil {
return "", fmt.Errorf("Failed to destroy the vm: %q with err: %+v", dummyVMFullName, err)
destVolPath = vmDiskPath
} else {
@ -1318,6 +1363,22 @@ func (vs *VSphere) DeleteVolume(vmDiskPath string) error {
if filepath.Ext(vmDiskPath) != ".vmdk" {
vmDiskPath += ".vmdk"
// Get the vmDisk Name
diskNameWithExt := path.Base(vmDiskPath)
diskName := strings.TrimSuffix(diskNameWithExt, filepath.Ext(diskNameWithExt))
// Search for the dummyVM if present and delete it.
dummyVMFullName := DummyVMPrefixName + "-" + diskName
vmRegex := vs.cfg.Global.WorkingDir + dummyVMFullName
dummyVM, err := f.VirtualMachine(ctx, vmRegex)
if err == nil {
err = deleteVM(ctx, dummyVM)
if err != nil {
return fmt.Errorf("Failed to destroy the vm: %q with err: %+v", dummyVMFullName, err)
// Delete virtual disk
task, err := virtualDiskManager.DeleteVirtualDisk(ctx, vmDiskPath, dc)
if err != nil {
@ -1356,26 +1417,144 @@ func (vs *VSphere) NodeExists(nodeName k8stypes.NodeName) (bool, error) {
return false, nil
func (vs *VSphere) createDummyVM(ctx context.Context, datacenter *object.Datacenter, datastore *object.Datastore) (*object.VirtualMachine, error) {
// A background routine which will be responsible for deleting stale dummy VM's.
func (vs *VSphere) cleanUpDummyVMs(dummyVMPrefix string) {
// Create context
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
for {
time.Sleep(CleanUpDummyVMRoutine_Interval * time.Minute)
// Ensure client is logged in and session is valid
err := vSphereLogin(ctx, vs)
if err != nil {
glog.V(4).Infof("[cleanUpDummyVMs] Unable to login to vSphere with err: %+v", err)
// Create a new finder
f := find.NewFinder(vs.client.Client, true)
// Fetch and set data center
dc, err := f.Datacenter(ctx, vs.cfg.Global.Datacenter)
if err != nil {
glog.V(4).Infof("[cleanUpDummyVMs] Unable to fetch the datacenter: %q with err: %+v", vs.cfg.Global.Datacenter, err)
// Get the folder reference for global working directory where the dummy VM needs to be created.
vmFolder, err := getFolder(ctx, vs.client, vs.cfg.Global.Datacenter, vs.cfg.Global.WorkingDir)
if err != nil {
glog.V(4).Infof("[cleanUpDummyVMs] Unable to get the kubernetes folder: %q reference with err: %+v", vs.cfg.Global.WorkingDir, err)
// A write lock is acquired to make sure the cleanUp routine doesn't delete any VM's created by ongoing PVC requests.
dummyVMRefList, err := getDummyVMList(ctx, vs.client, vmFolder, dummyVMPrefix)
if err != nil {
glog.V(4).Infof("[cleanUpDummyVMs] Unable to get dummy VM list in the kubernetes cluster: %q reference with err: %+v", vs.cfg.Global.WorkingDir, err)
for _, dummyVMRef := range dummyVMRefList {
err = deleteVM(ctx, dummyVMRef)
if err != nil {
glog.V(4).Infof("[cleanUpDummyVMs] Unable to delete dummy VM: %q with err: %+v", dummyVMRef.Name(), err)
// Get the dummy VM list from the kubernetes working directory.
func getDummyVMList(ctx context.Context, c *govmomi.Client, vmFolder *object.Folder, dummyVMPrefix string) ([]*object.VirtualMachine, error) {
vmFolders, err := vmFolder.Children(ctx)
if err != nil {
glog.V(4).Infof("Unable to retrieve the virtual machines from the kubernetes cluster: %+v", vmFolder)
return nil, err
var dummyVMRefList []*object.VirtualMachine
pc := property.DefaultCollector(c.Client)
for _, vmFolder := range vmFolders {
if vmFolder.Reference().Type == "VirtualMachine" {
var vmRefs []types.ManagedObjectReference
var vmMorefs []mo.VirtualMachine
vmRefs = append(vmRefs, vmFolder.Reference())
err = pc.Retrieve(ctx, vmRefs, []string{"name"}, &vmMorefs)
if err != nil {
return nil, err
if strings.HasPrefix(vmMorefs[0].Name, dummyVMPrefix) {
dummyVMRefList = append(dummyVMRefList, object.NewVirtualMachine(c.Client, vmRefs[0]))
return dummyVMRefList, nil
func (vs *VSphere) createDummyVM(ctx context.Context, datacenter *object.Datacenter, datastore *object.Datastore, vmName string) (*object.VirtualMachine, error) {
// Create a virtual machine config spec with 1 SCSI adapter.
virtualMachineConfigSpec := types.VirtualMachineConfigSpec{
Name: DummyVMName,
Name: vmName,
Files: &types.VirtualMachineFileInfo{
VmPathName: "[" + datastore.Name() + "]",
NumCPUs: 1,
MemoryMB: 4,
DeviceChange: []types.BaseVirtualDeviceConfigSpec{
Operation: types.VirtualDeviceConfigSpecOperationAdd,
Device: &types.ParaVirtualSCSIController{
VirtualSCSIController: types.VirtualSCSIController{
SharedBus: types.VirtualSCSISharingNoSharing,
VirtualController: types.VirtualController{
BusNumber: 0,
VirtualDevice: types.VirtualDevice{
Key: 1000,
// Create a new finder
f := find.NewFinder(vs.client.Client, true)
// Get the resource pool for current node. This is where dummy VM will be created.
resourcePool, err := vs.getCurrentNodeResourcePool(ctx, datacenter)
if err != nil {
return nil, err
// Get the folder reference for global working directory where the dummy VM needs to be created.
vmFolder, err := getFolder(ctx, vs.client, vs.cfg.Global.Datacenter, vs.cfg.Global.WorkingDir)
if err != nil {
return nil, fmt.Errorf("Failed to get the folder reference for %q", vs.cfg.Global.WorkingDir)
return nil, fmt.Errorf("Failed to get the folder reference for %q with err: %+v", vs.cfg.Global.WorkingDir, err)
task, err := vmFolder.CreateVM(ctx, virtualMachineConfigSpec, resourcePool, nil)
if err != nil {
return nil, err
dummyVMTaskInfo, err := task.WaitForResult(ctx, nil)
if err != nil {
return nil, err
vmRef := dummyVMTaskInfo.Result.(object.Reference)
dummyVM := object.NewVirtualMachine(vs.client.Client, vmRef.Reference())
return dummyVM, nil
func (vs *VSphere) getCurrentNodeResourcePool(ctx context.Context, datacenter *object.Datacenter) (*object.ResourcePool, error) {
// Create a new finder
f := find.NewFinder(vs.client.Client, true)
vmRegex := vs.cfg.Global.WorkingDir + vs.localInstanceID
currentVM, err := f.VirtualMachine(ctx, vmRegex)
if err != nil {
@ -1394,18 +1573,7 @@ func (vs *VSphere) createDummyVM(ctx context.Context, datacenter *object.Datacen
return nil, err
task, err := vmFolder.CreateVM(ctx, virtualMachineConfigSpec, resourcePool, nil)
if err != nil {
return nil, err
dummyVMTaskInfo, err := task.WaitForResult(ctx, nil)
if err != nil {
return nil, err
dummyVM := dummyVMTaskInfo.Result.(*object.VirtualMachine)
return dummyVM, nil
return resourcePool, nil
// Creates a virtual disk with the policy configured to the disk.
@ -1421,28 +1589,7 @@ func (vs *VSphere) createVirtualDiskWithPolicy(ctx context.Context, datacenter *
var diskControllerType = vs.cfg.Disk.SCSIControllerType
// find SCSI controller of particular type from VM devices
scsiControllersOfRequiredType := getSCSIControllersOfType(vmDevices, diskControllerType)
scsiController := getAvailableSCSIController(scsiControllersOfRequiredType)
var newSCSIController types.BaseVirtualDevice
if scsiController == nil {
newSCSIController, err = createAndAttachSCSIControllerToVM(ctx, virtualMachine, diskControllerType)
if err != nil {
glog.Errorf("Failed to create SCSI controller for VM :%q with err: %+v", virtualMachine.Name(), err)
return "", err
// verify scsi controller in virtual machine
vmDevices, err := virtualMachine.Device(ctx)
if err != nil {
return "", err
scsiController = getSCSIController(vmDevices, diskControllerType)
if scsiController == nil {
glog.Errorf("cannot find SCSI controller in VM")
// attempt clean up of scsi controller
cleanUpController(ctx, newSCSIController, vmDevices, virtualMachine)
return "", fmt.Errorf("cannot find SCSI controller in VM")
scsiController := scsiControllersOfRequiredType[0]
kubeVolsPath := filepath.Clean(datastore.Path(VolDir)) + "/"
// Create a kubevols directory in the datastore if one doesn't exist.
@ -1673,3 +1820,12 @@ func getFolder(ctx context.Context, c *govmomi.Client, datacenterName string, fo
return resultFolder, nil
// Delete the VM.
func deleteVM(ctx context.Context, vm *object.VirtualMachine) error {
destroyTask, err := vm.Destroy(ctx)
if err != nil {
return err
return destroyTask.Wait(ctx)