Get VirtualMachineScaleSets and VirtualMachineScaleSetVMs from cached resource groups

2018-08-20 14:15:02 +08:00 · 2018-08-20 14:15:02 +08:00 · 8aaeafa166
parent f83fb14452
commit 8aaeafa166
5 changed files with 164 additions and 57 deletions
--- a/pkg/cloudprovider/providers/azure/azure_backoff.go
+++ b/pkg/cloudprovider/providers/azure/azure_backoff.go
@ -69,20 +69,20 @@ func (az *Cloud) GetVirtualMachineWithRetry(name types.NodeName) (compute.Virtua
 }

 // VirtualMachineClientListWithRetry invokes az.VirtualMachinesClient.List with exponential backoff retry
-func (az *Cloud) VirtualMachineClientListWithRetry() ([]compute.VirtualMachine, error) {
+func (az *Cloud) VirtualMachineClientListWithRetry(resourceGroup string) ([]compute.VirtualMachine, error) {
 	allNodes := []compute.VirtualMachine{}
 	err := wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) {
 		var retryErr error
 		ctx, cancel := getContextWithCancel()
 		defer cancel()
-		allNodes, retryErr = az.VirtualMachinesClient.List(ctx, az.ResourceGroup)
+		allNodes, retryErr = az.VirtualMachinesClient.List(ctx, resourceGroup)
 		if retryErr != nil {
 			glog.Errorf("VirtualMachinesClient.List(%v) - backoff: failure, will retry,err=%v",
-				az.ResourceGroup,
+				resourceGroup,
 				retryErr)
 			return false, retryErr
 		}
-		glog.V(2).Infof("VirtualMachinesClient.List(%v) - backoff: success", az.ResourceGroup)
+		glog.V(2).Infof("VirtualMachinesClient.List(%v) - backoff: success", resourceGroup)
 		return true, nil
 	})
 	if err != nil {
--- a/pkg/cloudprovider/providers/azure/azure_controller_vmss.go
+++ b/pkg/cloudprovider/providers/azure/azure_controller_vmss.go
@ -29,7 +29,13 @@ import (
 // AttachDisk attaches a vhd to vm
 // the vhd must exist, can be identified by diskName, diskURI, and lun.
 func (ss *scaleSet) AttachDisk(isManagedDisk bool, diskName, diskURI string, nodeName types.NodeName, lun int32, cachingMode compute.CachingTypes) error {
-	ssName, instanceID, vm, err := ss.getVmssVM(string(nodeName))
+	vmName := mapNodeNameToVMName(nodeName)
+	ssName, instanceID, vm, err := ss.getVmssVM(vmName)
+	if err != nil {
+		return err
+	}
+
+	nodeResourceGroup, err := ss.GetNodeResourceGroup(vmName)
 	if err != nil {
 		return err
 	}
@ -65,14 +71,14 @@ func (ss *scaleSet) AttachDisk(isManagedDisk bool, diskName, diskURI string, nod

 	ctx, cancel := getContextWithCancel()
 	defer cancel()
-	glog.V(2).Infof("azureDisk - update(%s): vm(%s) - attach disk", ss.resourceGroup, nodeName)
-	resp, err := ss.VirtualMachineScaleSetVMsClient.Update(ctx, ss.resourceGroup, ssName, instanceID, vm)
+	glog.V(2).Infof("azureDisk - update(%s): vm(%s) - attach disk", nodeResourceGroup, nodeName)
+	resp, err := ss.VirtualMachineScaleSetVMsClient.Update(ctx, nodeResourceGroup, ssName, instanceID, vm)
 	if ss.CloudProviderBackoff && shouldRetryHTTPRequest(resp, err) {
-		glog.V(2).Infof("azureDisk - update(%s) backing off: vm(%s)", ss.resourceGroup, nodeName)
-		retryErr := ss.UpdateVmssVMWithRetry(ctx, ss.resourceGroup, ssName, instanceID, vm)
+		glog.V(2).Infof("azureDisk - update(%s) backing off: vm(%s)", nodeResourceGroup, nodeName)
+		retryErr := ss.UpdateVmssVMWithRetry(ctx, nodeResourceGroup, ssName, instanceID, vm)
 		if retryErr != nil {
 			err = retryErr
-			glog.V(2).Infof("azureDisk - update(%s) abort backoff: vm(%s)", ss.resourceGroup, nodeName)
+			glog.V(2).Infof("azureDisk - update(%s) abort backoff: vm(%s)", nodeResourceGroup, nodeName)
 		}
 	}
 	if err != nil {
@ -85,7 +91,8 @@ func (ss *scaleSet) AttachDisk(isManagedDisk bool, diskName, diskURI string, nod
 	} else {
 		glog.V(4).Info("azureDisk - azure attach succeeded")
 		// Invalidate the cache right after updating
-		ss.vmssVMCache.Delete(ss.makeVmssVMName(ssName, instanceID))
+		key := buildVmssCacheKey(nodeResourceGroup, ss.makeVmssVMName(ssName, instanceID))
+		ss.vmssVMCache.Delete(key)
 	}
 	return err
 }
@ -93,7 +100,13 @@ func (ss *scaleSet) AttachDisk(isManagedDisk bool, diskName, diskURI string, nod
 // DetachDiskByName detaches a vhd from host
 // the vhd can be identified by diskName or diskURI
 func (ss *scaleSet) DetachDiskByName(diskName, diskURI string, nodeName types.NodeName) error {
-	ssName, instanceID, vm, err := ss.getVmssVM(string(nodeName))
+	vmName := mapNodeNameToVMName(nodeName)
+	ssName, instanceID, vm, err := ss.getVmssVM(vmName)
+	if err != nil {
+		return err
+	}
+
+	nodeResourceGroup, err := ss.GetNodeResourceGroup(vmName)
 	if err != nil {
 		return err
 	}
@ -122,14 +135,14 @@ func (ss *scaleSet) DetachDiskByName(diskName, diskURI string, nodeName types.No
 	vm.StorageProfile.DataDisks = &disks
 	ctx, cancel := getContextWithCancel()
 	defer cancel()
-	glog.V(2).Infof("azureDisk - update(%s): vm(%s) - detach disk", ss.resourceGroup, nodeName)
-	resp, err := ss.VirtualMachineScaleSetVMsClient.Update(ctx, ss.resourceGroup, ssName, instanceID, vm)
+	glog.V(2).Infof("azureDisk - update(%s): vm(%s) - detach disk", nodeResourceGroup, nodeName)
+	resp, err := ss.VirtualMachineScaleSetVMsClient.Update(ctx, nodeResourceGroup, ssName, instanceID, vm)
 	if ss.CloudProviderBackoff && shouldRetryHTTPRequest(resp, err) {
-		glog.V(2).Infof("azureDisk - update(%s) backing off: vm(%s)", ss.resourceGroup, nodeName)
-		retryErr := ss.UpdateVmssVMWithRetry(ctx, ss.resourceGroup, ssName, instanceID, vm)
+		glog.V(2).Infof("azureDisk - update(%s) backing off: vm(%s)", nodeResourceGroup, nodeName)
+		retryErr := ss.UpdateVmssVMWithRetry(ctx, nodeResourceGroup, ssName, instanceID, vm)
 		if retryErr != nil {
 			err = retryErr
-			glog.V(2).Infof("azureDisk - update(%s) abort backoff: vm(%s)", ss.resourceGroup, nodeName)
+			glog.V(2).Infof("azureDisk - update(%s) abort backoff: vm(%s)", nodeResourceGroup, nodeName)
 		}
 	}
 	if err != nil {
@ -137,7 +150,8 @@ func (ss *scaleSet) DetachDiskByName(diskName, diskURI string, nodeName types.No
 	} else {
 		glog.V(4).Info("azureDisk - azure detach succeeded")
 		// Invalidate the cache right after updating
-		ss.vmssVMCache.Delete(ss.makeVmssVMName(ssName, instanceID))
+		key := buildVmssCacheKey(nodeResourceGroup, ss.makeVmssVMName(ssName, instanceID))
+		ss.vmssVMCache.Delete(key)
 	}

 	return err
--- a/pkg/cloudprovider/providers/azure/azure_standard.go
+++ b/pkg/cloudprovider/providers/azure/azure_standard.go
@ -440,7 +440,7 @@ func (as *availabilitySet) GetIPByNodeName(name string) (string, string, error)
 // getAgentPoolAvailabiliySets lists the virtual machines for the resource group and then builds
 // a list of availability sets that match the nodes available to k8s.
 func (as *availabilitySet) getAgentPoolAvailabiliySets(nodes []*v1.Node) (agentPoolAvailabilitySets *[]string, err error) {
-	vms, err := as.VirtualMachineClientListWithRetry()
+	vms, err := as.VirtualMachineClientListWithRetry(as.ResourceGroup)
 	if err != nil {
 		glog.Errorf("as.getNodeAvailabilitySet - VirtualMachineClientListWithRetry failed, err=%v", err)
 		return nil, err
--- a/pkg/cloudprovider/providers/azure/azure_vmss.go
+++ b/pkg/cloudprovider/providers/azure/azure_vmss.go
@ -40,8 +40,10 @@ var (
 	// ErrorNotVmssInstance indicates an instance is not belongint to any vmss.
 	ErrorNotVmssInstance = errors.New("not a vmss instance")

-	scaleSetNameRE        = regexp.MustCompile(`.*/subscriptions/(?:.*)/Microsoft.Compute/virtualMachineScaleSets/(.+)/virtualMachines(?:.*)`)
-	vmssMachineIDTemplate = "/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Compute/virtualMachineScaleSets/%s/virtualMachines/%s"
+	scaleSetNameRE         = regexp.MustCompile(`.*/subscriptions/(?:.*)/Microsoft.Compute/virtualMachineScaleSets/(.+)/virtualMachines(?:.*)`)
+	resourceGroupRE        = regexp.MustCompile(`.*/subscriptions/(?:.*)/resourceGroups/(.+)/providers/Microsoft.Compute/virtualMachineScaleSets/(?:.*)/virtualMachines(?:.*)`)
+	vmssNicResourceGroupRE = regexp.MustCompile(`.*/subscriptions/(?:.*)/resourceGroups/(.+)/providers/Microsoft.Compute/virtualMachineScaleSets/(?:.*)/virtualMachines/(?:.*)/networkInterfaces/(?:.*)`)
+	vmssMachineIDTemplate  = "/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Compute/virtualMachineScaleSets/%s/virtualMachines/%s"
 )

 // scaleSet implements VMSet interface for Azure scale set.
@ -106,8 +108,14 @@ func (ss *scaleSet) getVmssVM(nodeName string) (ssName, instanceID string, vm co
 		return "", "", vm, cloudprovider.InstanceNotFound
 	}

+	resourceGroup, err := ss.GetNodeResourceGroup(nodeName)
+	if err != nil {
+		return "", "", vm, err
+	}
+
 	glog.V(4).Infof("getVmssVM gets scaleSetName (%q) and instanceID (%q) for node %q", ssName, instanceID, nodeName)
-	cachedVM, err := ss.vmssVMCache.Get(ss.makeVmssVMName(ssName, instanceID))
+	key := buildVmssCacheKey(resourceGroup, ss.makeVmssVMName(ssName, instanceID))
+	cachedVM, err := ss.vmssVMCache.Get(key)
 	if err != nil {
 		return ssName, instanceID, vm, err
 	}
@ -122,9 +130,10 @@ func (ss *scaleSet) getVmssVM(nodeName string) (ssName, instanceID string, vm co

 // getCachedVirtualMachineByInstanceID gets scaleSetVMInfo from cache.
 // The node must belong to one of scale sets.
-func (ss *scaleSet) getVmssVMByInstanceID(scaleSetName, instanceID string) (vm compute.VirtualMachineScaleSetVM, err error) {
+func (ss *scaleSet) getVmssVMByInstanceID(resourceGroup, scaleSetName, instanceID string) (vm compute.VirtualMachineScaleSetVM, err error) {
 	vmName := ss.makeVmssVMName(scaleSetName, instanceID)
-	cachedVM, err := ss.vmssVMCache.Get(vmName)
+	key := buildVmssCacheKey(resourceGroup, vmName)
+	cachedVM, err := ss.vmssVMCache.Get(key)
 	if err != nil {
 		return vm, err
 	}
@ -168,13 +177,18 @@ func (ss *scaleSet) GetNodeNameByProviderID(providerID string) (types.NodeName,
 		return ss.availabilitySet.GetNodeNameByProviderID(providerID)
 	}

+	resourceGroup, err := extractResourceGroupByProviderID(providerID)
+	if err != nil {
+		return "", fmt.Errorf("error of extracting resource group for node %q", providerID)
+	}
+
 	instanceID, err := getLastSegment(providerID)
 	if err != nil {
 		glog.V(4).Infof("Can not extract instanceID from providerID (%s), assuming it is mananaged by availability set: %v", providerID, err)
 		return ss.availabilitySet.GetNodeNameByProviderID(providerID)
 	}

-	vm, err := ss.getVmssVMByInstanceID(scaleSetName, instanceID)
+	vm, err := ss.getVmssVMByInstanceID(resourceGroup, scaleSetName, instanceID)
 	if err != nil {
 		return "", err
 	}
@ -308,7 +322,7 @@ func getScaleSetVMInstanceID(machineName string) (string, error) {
 	return fmt.Sprintf("%d", instanceID), nil
 }

-// extractScaleSetNameByProviderID extracts the scaleset name by node's ProviderID.
+// extractScaleSetNameByProviderID extracts the scaleset name by vmss node's ProviderID.
 func extractScaleSetNameByProviderID(providerID string) (string, error) {
 	matches := scaleSetNameRE.FindStringSubmatch(providerID)
 	if len(matches) != 2 {
@ -318,13 +332,23 @@ func extractScaleSetNameByProviderID(providerID string) (string, error) {
 	return matches[1], nil
 }

+// extractResourceGroupByProviderID extracts the resource group name by vmss node's ProviderID.
+func extractResourceGroupByProviderID(providerID string) (string, error) {
+	matches := resourceGroupRE.FindStringSubmatch(providerID)
+	if len(matches) != 2 {
+		return "", ErrorNotVmssInstance
+	}
+
+	return matches[1], nil
+}
+
 // listScaleSets lists all scale sets.
-func (ss *scaleSet) listScaleSets() ([]string, error) {
+func (ss *scaleSet) listScaleSets(resourceGroup string) ([]string, error) {
 	var err error
 	ctx, cancel := getContextWithCancel()
 	defer cancel()

-	allScaleSets, err := ss.VirtualMachineScaleSetsClient.List(ctx, ss.ResourceGroup)
+	allScaleSets, err := ss.VirtualMachineScaleSetsClient.List(ctx, resourceGroup)
 	if err != nil {
 		glog.Errorf("VirtualMachineScaleSetsClient.List failed: %v", err)
 		return nil, err
@ -339,12 +363,12 @@ func (ss *scaleSet) listScaleSets() ([]string, error) {
 }

 // listScaleSetVMs lists VMs belonging to the specified scale set.
-func (ss *scaleSet) listScaleSetVMs(scaleSetName string) ([]compute.VirtualMachineScaleSetVM, error) {
+func (ss *scaleSet) listScaleSetVMs(scaleSetName, resourceGroup string) ([]compute.VirtualMachineScaleSetVM, error) {
 	var err error
 	ctx, cancel := getContextWithCancel()
 	defer cancel()

-	allVMs, err := ss.VirtualMachineScaleSetVMsClient.List(ctx, ss.ResourceGroup, scaleSetName, "", "", string(compute.InstanceView))
+	allVMs, err := ss.VirtualMachineScaleSetVMsClient.List(ctx, resourceGroup, scaleSetName, "", "", string(compute.InstanceView))
 	if err != nil {
 		glog.Errorf("VirtualMachineScaleSetVMsClient.List failed: %v", err)
 		return nil, err
@ -362,6 +386,10 @@ func (ss *scaleSet) getAgentPoolScaleSets(nodes []*v1.Node) (*[]string, error) {
 			continue
 		}

+		if ss.ShouldNodeExcludedFromLoadBalancer(nodes[nx]) {
+			continue
+		}
+
 		nodeName := nodes[nx].Name
 		ssName, err := ss.getScaleSetNameByNodeName(nodeName)
 		if err != nil {
@ -429,6 +457,16 @@ func (ss *scaleSet) GetVMSetNames(service *v1.Service, nodes []*v1.Node) (vmSetN
 	return vmSetNames, nil
 }

+// extractResourceGroupByVMSSNicID extracts the resource group name by vmss nicID.
+func extractResourceGroupByVMSSNicID(nicID string) (string, error) {
+	matches := vmssNicResourceGroupRE.FindStringSubmatch(nicID)
+	if len(matches) != 2 {
+		return "", fmt.Errorf("error of extracting resourceGroup from nicID %q", nicID)
+	}
+
+	return matches[1], nil
+}
+
 // GetPrimaryInterface gets machine primary network interface by node name and vmSet.
 func (ss *scaleSet) GetPrimaryInterface(nodeName string) (network.Interface, error) {
 	managedByAS, err := ss.isNodeManagedByAvailabilitySet(nodeName)
@ -443,6 +481,11 @@ func (ss *scaleSet) GetPrimaryInterface(nodeName string) (network.Interface, err

 	ssName, instanceID, vm, err := ss.getVmssVM(nodeName)
 	if err != nil {
+		// VM is availability set, but not cached yet in availabilitySetNodesCache.
+		if err == ErrorNotVmssInstance {
+			return ss.availabilitySet.GetPrimaryInterface(nodeName)
+		}
+
 		glog.Errorf("error: ss.GetPrimaryInterface(%s), ss.getVmssVM(%s), err=%v", nodeName, nodeName, err)
 		return network.Interface{}, err
 	}
@ -458,12 +501,16 @@ func (ss *scaleSet) GetPrimaryInterface(nodeName string) (network.Interface, err
 		glog.Errorf("error: ss.GetPrimaryInterface(%s), getLastSegment(%s), err=%v", nodeName, primaryInterfaceID, err)
 		return network.Interface{}, err
 	}
+	resourceGroup, err := extractResourceGroupByVMSSNicID(primaryInterfaceID)
+	if err != nil {
+		return network.Interface{}, err
+	}

 	ctx, cancel := getContextWithCancel()
 	defer cancel()
-	nic, err := ss.InterfacesClient.GetVirtualMachineScaleSetNetworkInterface(ctx, ss.ResourceGroup, ssName, instanceID, nicName, "")
+	nic, err := ss.InterfacesClient.GetVirtualMachineScaleSetNetworkInterface(ctx, resourceGroup, ssName, instanceID, nicName, "")
 	if err != nil {
-		glog.Errorf("error: ss.GetPrimaryInterface(%s), ss.GetVirtualMachineScaleSetNetworkInterface.Get(%s, %s, %s), err=%v", nodeName, ss.ResourceGroup, ssName, nicName, err)
+		glog.Errorf("error: ss.GetPrimaryInterface(%s), ss.GetVirtualMachineScaleSetNetworkInterface.Get(%s, %s, %s), err=%v", nodeName, resourceGroup, ssName, nicName, err)
 		return network.Interface{}, err
 	}

@ -566,6 +613,11 @@ func (ss *scaleSet) getNodesScaleSets(nodes []*v1.Node) (map[string]sets.String,
 			continue
 		}

+		if ss.ShouldNodeExcludedFromLoadBalancer(curNode) {
+			glog.V(4).Infof("Excluding unmanaged/external-resource-group node %q", curNode.Name)
+			continue
+		}
+
 		curScaleSetName, err := extractScaleSetNameByProviderID(curNode.Spec.ProviderID)
 		if err != nil {
 			glog.V(4).Infof("Node %q is not belonging to any scale sets, assuming it is belong to availability sets", curNode.Name)
@ -884,11 +936,11 @@ func (ss *scaleSet) EnsureBackendPoolDeleted(poolID, vmSetName string, backendAd
 }

 // getVmssMachineID returns the full identifier of a vmss virtual machine.
-func (az *Cloud) getVmssMachineID(scaleSetName, instanceID string) string {
+func (az *Cloud) getVmssMachineID(resourceGroup, scaleSetName, instanceID string) string {
 	return fmt.Sprintf(
 		vmssMachineIDTemplate,
 		az.SubscriptionID,
-		az.ResourceGroup,
+		resourceGroup,
 		scaleSetName,
 		instanceID)
 }
--- a/pkg/cloudprovider/providers/azure/azure_vmss_cache.go
+++ b/pkg/cloudprovider/providers/azure/azure_vmss_cache.go
@ -27,15 +27,16 @@ import (
 )

 var (
-	vmssNameSeparator = "_"
+	vmssNameSeparator  = "_"
+	vmssCacheSeparator = "#"

 	nodeNameToScaleSetMappingKey = "k8sNodeNameToScaleSetMappingKey"
 	availabilitySetNodesKey      = "k8sAvailabilitySetNodesKey"

 	vmssCacheTTL                      = time.Minute
 	vmssVMCacheTTL                    = time.Minute
-	availabilitySetNodesCacheTTL      = 15 * time.Minute
-	nodeNameToScaleSetMappingCacheTTL = 15 * time.Minute
+	availabilitySetNodesCacheTTL      = 5 * time.Minute
+	nodeNameToScaleSetMappingCacheTTL = 5 * time.Minute
 )

 // nodeNameToScaleSetMapping maps nodeName to scaleSet name.
@ -49,19 +50,19 @@ func (ss *scaleSet) makeVmssVMName(scaleSetName, instanceID string) string {
 func extractVmssVMName(name string) (string, string, error) {
 	split := strings.SplitAfter(name, vmssNameSeparator)
 	if len(split) < 2 {
-		glog.Errorf("Failed to extract vmssVMName %q", name)
+		glog.V(3).Infof("Failed to extract vmssVMName %q", name)
 		return "", "", ErrorNotVmssInstance
 	}

 	ssName := strings.Join(split[0:len(split)-1], "")
 	// removing the trailing `vmssNameSeparator` since we used SplitAfter
 	ssName = ssName[:len(ssName)-1]
-
 	instanceID := split[len(split)-1]
-
 	return ssName, instanceID, nil
 }

+// vmssCache only holds vmss from ss.ResourceGroup because nodes from other resourceGroups
+// will be excluded from LB backends.
 func (ss *scaleSet) newVmssCache() (*timedCache, error) {
 	getter := func(key string) (interface{}, error) {
 		ctx, cancel := getContextWithCancel()
@ -85,26 +86,34 @@ func (ss *scaleSet) newVmssCache() (*timedCache, error) {

 func (ss *scaleSet) newNodeNameToScaleSetMappingCache() (*timedCache, error) {
 	getter := func(key string) (interface{}, error) {
-		scaleSetNames, err := ss.listScaleSets()
+		localCache := make(nodeNameToScaleSetMapping)
+
+		allResourceGroups, err := ss.GetResourceGroups()
 		if err != nil {
 			return nil, err
 		}

-		localCache := make(nodeNameToScaleSetMapping)
-		for _, ssName := range scaleSetNames {
-			vms, err := ss.listScaleSetVMs(ssName)
+		for _, resourceGroup := range allResourceGroups.List() {
+			scaleSetNames, err := ss.listScaleSets(resourceGroup)
 			if err != nil {
 				return nil, err
 			}

-			for _, vm := range vms {
-				if vm.OsProfile == nil || vm.OsProfile.ComputerName == nil {
-					glog.Warningf("failed to get computerName for vmssVM (%q)", ssName)
-					continue
+			for _, ssName := range scaleSetNames {
+				vms, err := ss.listScaleSetVMs(ssName, resourceGroup)
+				if err != nil {
+					return nil, err
 				}

-				computerName := strings.ToLower(*vm.OsProfile.ComputerName)
-				localCache[computerName] = ssName
+				for _, vm := range vms {
+					if vm.OsProfile == nil || vm.OsProfile.ComputerName == nil {
+						glog.Warningf("failed to get computerName for vmssVM (%q)", ssName)
+						continue
+					}
+
+					computerName := strings.ToLower(*vm.OsProfile.ComputerName)
+					localCache[computerName] = ssName
+				}
 			}
 		}

@ -116,14 +125,23 @@ func (ss *scaleSet) newNodeNameToScaleSetMappingCache() (*timedCache, error) {

 func (ss *scaleSet) newAvailabilitySetNodesCache() (*timedCache, error) {
 	getter := func(key string) (interface{}, error) {
-		vmList, err := ss.Cloud.VirtualMachineClientListWithRetry()
+		localCache := sets.NewString()
+		resourceGroups, err := ss.GetResourceGroups()
 		if err != nil {
 			return nil, err
 		}

-		localCache := sets.NewString()
-		for _, vm := range vmList {
-			localCache.Insert(*vm.Name)
+		for _, resourceGroup := range resourceGroups.List() {
+			vmList, err := ss.Cloud.VirtualMachineClientListWithRetry(resourceGroup)
+			if err != nil {
+				return nil, err
+			}
+
+			for _, vm := range vmList {
+				if vm.Name != nil {
+					localCache.Insert(*vm.Name)
+				}
+			}
 		}

 		return localCache, nil
@ -132,10 +150,33 @@ func (ss *scaleSet) newAvailabilitySetNodesCache() (*timedCache, error) {
 	return newTimedcache(availabilitySetNodesCacheTTL, getter)
 }

+func buildVmssCacheKey(resourceGroup, name string) string {
+	// key is composed of <resourceGroup>#<vmName>
+	return fmt.Sprintf("%s%s%s", resourceGroup, vmssCacheSeparator, name)
+}
+
+func extractVmssCacheKey(key string) (string, string, error) {
+	// key is composed of <resourceGroup>#<vmName>
+	keyItems := strings.Split(key, vmssCacheSeparator)
+	if len(keyItems) != 2 {
+		return "", "", fmt.Errorf("key %q is not in format '<resouceGroup>#<vmName>'", key)
+	}
+
+	resourceGroup := keyItems[0]
+	vmName := keyItems[1]
+	return resourceGroup, vmName, nil
+}
+
 func (ss *scaleSet) newVmssVMCache() (*timedCache, error) {
 	getter := func(key string) (interface{}, error) {
-		// vmssVM name's format is 'scaleSetName_instanceID'
-		ssName, instanceID, err := extractVmssVMName(key)
+		// key is composed of <resourceGroup>#<vmName>
+		resourceGroup, vmName, err := extractVmssCacheKey(key)
+		if err != nil {
+			return nil, err
+		}
+
+		// vmName's format is 'scaleSetName_instanceID'
+		ssName, instanceID, err := extractVmssVMName(vmName)
 		if err != nil {
 			return nil, err
 		}
@ -147,7 +188,7 @@ func (ss *scaleSet) newVmssVMCache() (*timedCache, error) {

 		ctx, cancel := getContextWithCancel()
 		defer cancel()
-		result, err := ss.VirtualMachineScaleSetVMsClient.Get(ctx, ss.ResourceGroup, ssName, instanceID)
+		result, err := ss.VirtualMachineScaleSetVMsClient.Get(ctx, resourceGroup, ssName, instanceID)
 		exists, message, realErr := checkResourceExistsFromError(err)
 		if realErr != nil {
 			return nil, realErr