From 6e3f8f38908ec9c1d54825467403bbbc0a834e30 Mon Sep 17 00:00:00 2001 From: Martin Polednik Date: Thu, 16 Nov 2017 15:01:07 +0100 Subject: [PATCH 01/33] deviceplugin: fix race when multiple plugins are registered Signed-off-by: Martin Polednik --- pkg/kubelet/cm/deviceplugin/device_plugin_handler.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pkg/kubelet/cm/deviceplugin/device_plugin_handler.go b/pkg/kubelet/cm/deviceplugin/device_plugin_handler.go index 23d67c3fd6..38248733c9 100644 --- a/pkg/kubelet/cm/deviceplugin/device_plugin_handler.go +++ b/pkg/kubelet/cm/deviceplugin/device_plugin_handler.go @@ -77,6 +77,10 @@ func NewHandlerImpl(updateCapacityFunc func(v1.ResourceList)) (*HandlerImpl, err deviceManagerMonitorCallback := func(resourceName string, added, updated, deleted []pluginapi.Device) { var capacity = v1.ResourceList{} kept := append(updated, added...) + + handler.Lock() + defer handler.Unlock() + if _, ok := handler.allDevices[resourceName]; !ok { handler.allDevices[resourceName] = sets.NewString() } From 2006fe524bee74a4e7709c727e02dcd65ff7884a Mon Sep 17 00:00:00 2001 From: Mike Danese Date: Tue, 24 Oct 2017 09:28:17 -0700 Subject: [PATCH 02/33] certs: start deprecation of signing asset default paths --- .../app/certificates.go | 46 ++++++++++++++++++- .../app/options/options.go | 14 +++++- 2 files changed, 56 insertions(+), 4 deletions(-) diff --git a/cmd/kube-controller-manager/app/certificates.go b/cmd/kube-controller-manager/app/certificates.go index 79c9390b58..7ef2958217 100644 --- a/cmd/kube-controller-manager/app/certificates.go +++ b/cmd/kube-controller-manager/app/certificates.go @@ -21,9 +21,13 @@ limitations under the License. package app import ( + "fmt" + "os" + "github.com/golang/glog" "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/kubernetes/cmd/kube-controller-manager/app/options" "k8s.io/kubernetes/pkg/controller/certificates/approver" "k8s.io/kubernetes/pkg/controller/certificates/cleaner" "k8s.io/kubernetes/pkg/controller/certificates/signer" @@ -36,6 +40,45 @@ func startCSRSigningController(ctx ControllerContext) (bool, error) { if ctx.Options.ClusterSigningCertFile == "" || ctx.Options.ClusterSigningKeyFile == "" { return false, nil } + + // Deprecation warning for old defaults. + // + // * If the signing cert and key are the default paths but the files + // exist, warn that the paths need to be specified explicitly in a + // later release and the defaults will be removed. We don't expect this + // to be the case. + // + // * If the signing cert and key are default paths but the files don't exist, + // bail out of startController without logging. + var keyFileExists, keyUsesDefault, certFileExists, certUsesDefault bool + + _, err := os.Stat(ctx.Options.ClusterSigningCertFile) + certFileExists = !os.IsNotExist(err) + + certUsesDefault = (ctx.Options.ClusterSigningCertFile == options.DefaultClusterSigningCertFile) + + _, err = os.Stat(ctx.Options.ClusterSigningKeyFile) + keyFileExists = !os.IsNotExist(err) + + keyUsesDefault = (ctx.Options.ClusterSigningKeyFile == options.DefaultClusterSigningKeyFile) + + switch { + case (keyFileExists && keyUsesDefault) || (certFileExists && certUsesDefault): + glog.Warningf("You might be using flag defaulting for --cluster-signing-cert-file and" + + " --cluster-signing-key-file. These defaults are deprecated and will be removed" + + " in a subsequent release. 
Please pass these options explicitly.") + case (!keyFileExists && keyUsesDefault) && (!certFileExists && certUsesDefault): + // This is what we expect right now if people aren't + // setting up the signing controller. This isn't + // actually a problem since the signer is not a + // required controller. + return false, nil + default: + // Note that '!filesExist && !usesDefaults' is obviously + // operator error. We don't handle this case here and instead + // allow it to be handled by NewCSR... below. + } + c := ctx.ClientBuilder.ClientOrDie("certificate-controller") signer, err := signer.NewCSRSigningController( @@ -46,8 +89,7 @@ func startCSRSigningController(ctx ControllerContext) (bool, error) { ctx.Options.ClusterSigningDuration.Duration, ) if err != nil { - glog.Errorf("Failed to start certificate controller: %v", err) - return false, nil + return false, fmt.Errorf("failed to start certificate controller: %v", err) } go signer.Run(1, ctx.Stop) diff --git a/cmd/kube-controller-manager/app/options/options.go b/cmd/kube-controller-manager/app/options/options.go index d5765e88b7..d681bfd189 100644 --- a/cmd/kube-controller-manager/app/options/options.go +++ b/cmd/kube-controller-manager/app/options/options.go @@ -39,6 +39,16 @@ import ( "github.com/spf13/pflag" ) +const ( + // These defaults are deprecated and exported so that we can warn if + // they are being used. + + // DefaultClusterSigningCertFile is deprecated. Do not use. + DefaultClusterSigningCertFile = "/etc/kubernetes/ca/ca.pem" + // DefaultClusterSigningKeyFile is deprecated. Do not use. + DefaultClusterSigningKeyFile = "/etc/kubernetes/ca/ca.key" +) + // CMServer is the main context object for the controller manager. type CMServer struct { componentconfig.KubeControllerManagerConfiguration @@ -111,8 +121,8 @@ func NewCMServer() *CMServer { EnableGarbageCollector: true, ConcurrentGCSyncs: 20, GCIgnoredResources: gcIgnoredResources, - ClusterSigningCertFile: "/etc/kubernetes/ca/ca.pem", - ClusterSigningKeyFile: "/etc/kubernetes/ca/ca.key", + ClusterSigningCertFile: DefaultClusterSigningCertFile, + ClusterSigningKeyFile: DefaultClusterSigningKeyFile, ClusterSigningDuration: metav1.Duration{Duration: helpers.OneYear}, ReconcilerSyncLoopPeriod: metav1.Duration{Duration: 60 * time.Second}, EnableTaintManager: true, From f780eefd396cd801c4c4c30d2c4c0cc9e91db338 Mon Sep 17 00:00:00 2001 From: David Zhu Date: Wed, 4 Oct 2017 15:01:25 -0700 Subject: [PATCH 03/33] Set up alternate mount point for SCSI/NVMe local SSD by UUID in /mnt/disks/by-uuid/, set up ability to have unformatted disk symlinked in /dev/disk/by-uuid/. Added tests. Preserved backwards compatibility. 
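The patch below adds NODE_LOCAL_SSDS_EXT, whose value is a semicolon-separated list of "count,interface,format" groups (for example "2,scsi,fs;1,nvme,block" asks for two SCSI formatted-and-mounted SSDs plus one NVMe block-device SSD). The cluster scripts parse and validate this spec in bash (get-local-disk-num in configure-helper.sh and validate-node-local-ssds-ext in util.sh). As a rough, standalone illustration of the same grammar and limits — not part of the patch, and only mirroring the documented rules (interface scsi|nvme, format fs|block, total disks capped by GCE_MAX_LOCAL_SSD=8) — a Go sketch might look like this:

// Illustrative only: the actual implementation in this patch is in bash
// (get-local-disk-num in configure-helper.sh, validate-node-local-ssds-ext
// in util.sh); this sketch just parses the documented
// "count,interface,format[;...]" grammar.
package main

import (
	"fmt"
	"strconv"
	"strings"
)

type ssdGroup struct {
	Count     int
	Interface string // "scsi" or "nvme"
	Format    string // "fs" or "block"
}

const gceMaxLocalSSD = 8 // mirrors GCE_MAX_LOCAL_SSD in cluster/gce/util.sh

func parseNodeLocalSSDsExt(spec string) ([]ssdGroup, error) {
	var groups []ssdGroup
	total := 0
	for _, raw := range strings.Split(spec, ";") {
		parts := strings.Split(raw, ",")
		if len(parts) != 3 {
			return nil, fmt.Errorf("malformed group %q, want count,interface,format", raw)
		}
		count, err := strconv.Atoi(parts[0])
		if err != nil || count < 1 {
			return nil, fmt.Errorf("invalid count in group %q", raw)
		}
		iface, format := strings.ToLower(parts[1]), strings.ToLower(parts[2])
		if iface != "scsi" && iface != "nvme" {
			return nil, fmt.Errorf("interface must be scsi or nvme, got %q", iface)
		}
		if format != "fs" && format != "block" {
			return nil, fmt.Errorf("format must be fs or block, got %q", format)
		}
		total += count
		groups = append(groups, ssdGroup{count, iface, format})
	}
	if total < 1 || total > gceMaxLocalSSD {
		return nil, fmt.Errorf("total local SSD count must be between 1 and %d, got %d", gceMaxLocalSSD, total)
	}
	return groups, nil
}

func main() {
	groups, err := parseNodeLocalSSDsExt("2,scsi,fs;1,nvme,block")
	if err != nil {
		panic(err)
	}
	fmt.Printf("%+v\n", groups) // [{2 scsi fs} {1 nvme block}]
}
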
--- cluster/common.sh | 1 + cluster/gce/config-default.sh | 5 + cluster/gce/gci/configure-helper.sh | 216 ++++++++++++++- cluster/gce/util.sh | 44 +++ test/e2e/storage/persistent_volumes-local.go | 275 +++++++++++-------- 5 files changed, 422 insertions(+), 119 deletions(-) diff --git a/cluster/common.sh b/cluster/common.sh index c1c9261b59..200f7a4aae 100755 --- a/cluster/common.sh +++ b/cluster/common.sh @@ -668,6 +668,7 @@ ENABLE_PROMETHEUS_TO_SD: $(yaml-quote ${ENABLE_PROMETHEUS_TO_SD:-false}) ENABLE_POD_PRIORITY: $(yaml-quote ${ENABLE_POD_PRIORITY:-}) CONTAINER_RUNTIME: $(yaml-quote ${CONTAINER_RUNTIME:-}) CONTAINER_RUNTIME_ENDPOINT: $(yaml-quote ${CONTAINER_RUNTIME_ENDPOINT:-}) +NODE_LOCAL_SSDS_EXT: $(yaml-quote ${NODE_LOCAL_SSDS_EXT:-}) LOAD_IMAGE_COMMAND: $(yaml-quote ${LOAD_IMAGE_COMMAND:-}) EOF if [ -n "${KUBELET_PORT:-}" ]; then diff --git a/cluster/gce/config-default.sh b/cluster/gce/config-default.sh index 3c9cb99af9..6d592aa976 100755 --- a/cluster/gce/config-default.sh +++ b/cluster/gce/config-default.sh @@ -36,6 +36,11 @@ MASTER_ROOT_DISK_SIZE=${MASTER_ROOT_DISK_SIZE:-$(get-master-root-disk-size)} NODE_DISK_TYPE=${NODE_DISK_TYPE:-pd-standard} NODE_DISK_SIZE=${NODE_DISK_SIZE:-100GB} NODE_LOCAL_SSDS=${NODE_LOCAL_SSDS:-0} +# An extension to local SSDs allowing users to specify block/fs and SCSI/NVMe devices +# Format of this variable will be "#,scsi/nvme,block/fs" you can specify multiple +# configurations by seperating them by a semi-colon ex. "2,scsi,fs;1,nvme,block" +# is a request for 2 SCSI formatted and mounted SSDs and 1 NVMe block device SSD. +NODE_LOCAL_SSDS_EXT=${NODE_LOCAL_SSDS_EXT:-} # Accelerators to be attached to each node. Format "type=,count=" # More information on available GPUs here - https://cloud.google.com/compute/docs/gpus/ NODE_ACCELERATORS=${NODE_ACCELERATORS:-""} diff --git a/cluster/gce/gci/configure-helper.sh b/cluster/gce/gci/configure-helper.sh index 93b678da28..1ea3681e27 100644 --- a/cluster/gce/gci/configure-helper.sh +++ b/cluster/gce/gci/configure-helper.sh @@ -25,6 +25,9 @@ set -o errexit set -o nounset set -o pipefail +readonly UUID_MNT_PREFIX="/mnt/disks/by-uuid/google-local-ssds" +readonly UUID_BLOCK_PREFIX="/dev/disk/by-uuid/google-local-ssds" + function setup-os-params { # Reset core_pattern. On GCI, the default core_pattern pipes the core dumps to # /sbin/crash_reporter which is more restrictive in saving crash dumps. So for @@ -85,11 +88,85 @@ function create-dirs { fi } -# Formats the given device ($1) if needed and mounts it at given mount point +# Gets the total number of $(1) and $(2) type disks specified +# by the user in ${NODE_LOCAL_SSDS_EXT} +function get-local-disk-num() { + local interface="${1}" + local format="${2}" + + localdisknum=0 + if [[ ! 
-z "${NODE_LOCAL_SSDS_EXT:-}" ]]; then + IFS=";" read -r -a ssdgroups <<< "${NODE_LOCAL_SSDS_EXT:-}" + for ssdgroup in "${ssdgroups[@]}"; do + IFS="," read -r -a ssdopts <<< "${ssdgroup}" + local opnum="${ssdopts[0]}" + local opinterface="${ssdopts[1]}" + local opformat="${ssdopts[2]}" + + if [[ "${opformat,,}" == "${format,,}" && "${opinterface,,}" == "${interface,,}" ]]; then + localdisknum=$((localdisknum+opnum)) + fi + done + fi +} + +# Creates a symlink for a ($1) so that it may be used as block storage +function safe-block-symlink(){ + local device="${1}" + local symdir="${2}" + + mkdir -p "${symdir}" + + get-or-generate-uuid "${device}" + local myuuid="${retuuid}" + + local sym="${symdir}/local-ssd-${myuuid}" + # Do not "mkdir -p ${sym}" as that will cause unintended symlink behavior + ln -s "${device}" "${sym}" + echo "Created a symlink for SSD $ssd at ${sym}" + chmod a+w "${sym}" +} + +# Gets a pregenerated UUID from ${ssdmap} if it exists, otherwise generates a new +# UUID and places it inside ${ssdmap} +function get-or-generate-uuid(){ + local device="${1}" + + local ssdmap="/home/kubernetes/localssdmap.txt" + echo "Generating or getting UUID from ${ssdmap}" + + if [[ ! -e "${ssdmap}" ]]; then + touch "${ssdmap}" + chmod +w "${ssdmap}" + fi + + # each line of the ssdmap looks like "${device} persistent-uuid" + if [[ ! -z $(grep ${device} ${ssdmap}) ]]; then + #create symlink based on saved uuid + local myuuid=$(grep ${device} ${ssdmap} | cut -d ' ' -f 2) + else + # generate new uuid and add it to the map + local myuuid=$(uuidgen) + if [[ ! ${?} -eq 0 ]]; then + echo "Failed to generate valid UUID with uuidgen" >&2 + exit 2 + fi + echo "${device} ${myuuid}" >> "${ssdmap}" + fi + + if [[ -z "${myuuid}" ]]; then + echo "Failed to get a uuid for device ${device} when symlinking." >&2 + exit 2 + fi + + retuuid="${myuuid}" +} + +#Formats the given device ($1) if needed and mounts it at given mount point # ($2). function safe-format-and-mount() { - device=$1 - mountpoint=$2 + local device="${1}" + local mountpoint="${2}" # Format only if the disk is not already formatted. if ! tune2fs -l "${device}" ; then @@ -102,18 +179,135 @@ function safe-format-and-mount() { mount -o discard,defaults "${device}" "${mountpoint}" } -# Local ssds, if present, are mounted at /mnt/disks/ssdN. +# Gets a devices UUID and bind mounts the device to mount location in +# /mnt/disks/by-id/ +function unique-uuid-bind-mount(){ + local mountpoint="${1}" + local actual_device="${2}" + + # Trigger udev refresh so that newly formatted devices are propagated in by-uuid + udevadm control --reload-rules + udevadm trigger + udevadm settle + + # grep the exact match of actual device, prevents substring matching + local myuuid=$(ls -l /dev/disk/by-uuid/ | grep "/${actual_device}$" | tr -s ' ' | cut -d ' ' -f 9) + # myuuid should be the uuid of the device as found in /dev/disk/by-uuid/ + if [[ -z "${myuuid}" ]]; then + echo "Failed to get a uuid for device ${actual_device} when mounting." 
>&2 + exit 2 + fi + + # bindpoint should be the full path of the to-be-bound device + local bindpoint="${UUID_MNT_PREFIX}-${interface}-fs/local-ssd-${myuuid}" + + safe-bind-mount "${mountpoint}" "${bindpoint}" +} + +# Bind mounts device at mountpoint to bindpoint +function safe-bind-mount(){ + local mountpoint="${1}" + local bindpoint="${2}" + + # Mount device to the mountpoint + mkdir -p "${bindpoint}" + echo "Binding '${mountpoint}' at '${bindpoint}'" + mount --bind "${mountpoint}" "${bindpoint}" + chmod a+w "${bindpoint}" +} + + +# Mounts, bindmounts, or symlinks depending on the interface and format +# of the incoming device +function mount-ext(){ + local ssd="${1}" + local devicenum="${2}" + local interface="${3}" + local format="${4}" + + + if [[ -z "${devicenum}" ]]; then + echo "Failed to get the local disk number for device ${ssd}" >&2 + exit 2 + fi + + # TODO: Handle partitioned disks. Right now this code just ignores partitions + if [[ "${format}" == "fs" ]]; then + if [[ "${interface}" == "scsi" ]]; then + local actual_device=$(readlink -f "${ssd}" | cut -d '/' -f 3) + # Error checking + if [[ "${actual_device}" != sd* ]]; then + echo "'actual_device' is not of the correct format. It must be the kernel name of the device, got ${actual_device} instead" >&2 + exit 1 + fi + local mountpoint="/mnt/disks/ssd${devicenum}" + else + # This path is required because the existing Google images do not + # expose NVMe devices in /dev/disk/by-id so we are using the /dev/nvme instead + local actual_device=$(echo ${ssd} | cut -d '/' -f 3) + # Error checking + if [[ "${actual_device}" != nvme* ]]; then + echo "'actual_device' is not of the correct format. It must be the kernel name of the device, got ${actual_device} instead" >&2 + exit 1 + fi + local mountpoint="/mnt/disks/ssd-nvme${devicenum}" + fi + + safe-format-and-mount "${ssd}" "${mountpoint}" + # We only do the bindmount if users are using the new local ssd request method + # see https://github.com/kubernetes/kubernetes/pull/53466#discussion_r146431894 + if [[ ! -z "${NODE_LOCAL_SSDS_EXT:-}" ]]; then + unique-uuid-bind-mount "${mountpoint}" "${actual_device}" + fi + elif [[ "${format}" == "block" ]]; then + local symdir="${UUID_BLOCK_PREFIX}-${interface}-block" + safe-block-symlink "${ssd}" "${symdir}" + else + echo "Disk format must be either fs or block, got ${format}" + fi +} + +# Local ssds, if present, are mounted or symlinked to their appropriate +# locations function ensure-local-ssds() { + get-local-disk-num "scsi" "block" + local scsiblocknum="${localdisknum}" + local i=0 for ssd in /dev/disk/by-id/google-local-ssd-*; do if [ -e "${ssd}" ]; then - ssdnum=`echo ${ssd} | sed -e 's/\/dev\/disk\/by-id\/google-local-ssd-\([0-9]*\)/\1/'` - ssdmount="/mnt/disks/ssd${ssdnum}/" - mkdir -p ${ssdmount} - safe-format-and-mount "${ssd}" ${ssdmount} - echo "Mounted local SSD $ssd at ${ssdmount}" - chmod a+w ${ssdmount} + local devicenum=`echo ${ssd} | sed -e 's/\/dev\/disk\/by-id\/google-local-ssd-\([0-9]*\)/\1/'` + if [[ "${i}" -lt "${scsiblocknum}" ]]; then + mount-ext "${ssd}" "${devicenum}" "scsi" "block" + else + # GKE does not set NODE_LOCAL_SSDS so all non-block devices + # are assumed to be filesystem devices + mount-ext "${ssd}" "${devicenum}" "scsi" "fs" + fi + i=$((i+1)) else - echo "No local SSD disks found." + echo "No local SCSI SSD disks found." 
+ fi + done + + # The following mounts or symlinks NVMe devices + get-local-disk-num "nvme" "block" + local nvmeblocknum="${localdisknum}" + local i=0 + for ssd in /dev/nvme*; do + if [ -e "${ssd}" ]; then + # This workaround to find if the NVMe device is a disk is required because + # the existing Google images does not expose NVMe devices in /dev/disk/by-id + if [[ `udevadm info --query=property --name=${ssd} | grep DEVTYPE | sed "s/DEVTYPE=//"` == "disk" ]]; then + local devicenum=`echo ${ssd} | sed -e 's/\/dev\/nvme0n\([0-9]*\)/\1/'` + if [[ "${i}" -lt "${nvmeblocknum}" ]]; then + mount-ext "${ssd}" "${devicenum}" "nvme" "block" + else + mount-ext "${ssd}" "${devicenum}" "nvme" "fs" + fi + i=$((i+1)) + fi + else + echo "No local NVMe SSD disks found." fi done } diff --git a/cluster/gce/util.sh b/cluster/gce/util.sh index 8a761118f7..f329a2600e 100755 --- a/cluster/gce/util.sh +++ b/cluster/gce/util.sh @@ -18,6 +18,8 @@ # Use the config file specified in $KUBE_CONFIG_FILE, or default to # config-default.sh. +readonly GCE_MAX_LOCAL_SSD=8 + KUBE_ROOT=$(dirname "${BASH_SOURCE}")/../.. source "${KUBE_ROOT}/cluster/gce/${KUBE_CONFIG_FILE-"config-default.sh"}" source "${KUBE_ROOT}/cluster/common.sh" @@ -37,6 +39,11 @@ else exit 1 fi +if [[ ${NODE_LOCAL_SSDS:-} -ge 1 ]] && [[ ! -z ${NODE_LOCAL_SSDS_EXT:-} ]] ; then + echo -e "${color_red}Local SSD: Only one of NODE_LOCAL_SSDS and NODE_LOCAL_SSDS_EXT can be specified at once${color_norm}" >&2 + exit 2 +fi + if [[ "${MASTER_OS_DISTRIBUTION}" == "gci" ]]; then DEFAULT_GCI_PROJECT=google-containers if [[ "${GCI_VERSION}" == "cos"* ]]; then @@ -546,6 +553,29 @@ function get-template-name-from-version() { echo "${NODE_INSTANCE_PREFIX}-template-${1}" | cut -c 1-63 | sed 's/[\.\+]/-/g;s/-*$//g' } +# validates the NODE_LOCAL_SSDS_EXT variable +function validate-node-local-ssds-ext(){ + ssdopts="${1}" + + if [[ -z "${ssdopts[0]}" || -z "${ssdopts[1]}" || -z "${ssdopts[2]}" ]]; then + echo -e "${color_red}Local SSD: NODE_LOCAL_SSDS_EXT is malformed, found ${ssdopts[0]-_},${ssdopts[1]-_},${ssdopts[2]-_} ${color_norm}" >&2 + exit 2 + fi + if [[ "${ssdopts[1]}" != "scsi" && "${ssdopts[1]}" != "nvme" ]]; then + echo -e "${color_red}Local SSD: Interface must be scsi or nvme, found: ${ssdopts[1]} ${color_norm}" >&2 + exit 2 + fi + if [[ "${ssdopts[2]}" != "fs" && "${ssdopts[2]}" != "block" ]]; then + echo -e "${color_red}Local SSD: Filesystem type must be fs or block, found: ${ssdopts[2]} ${color_norm}" >&2 + exit 2 + fi + local_ssd_ext_count=$((local_ssd_ext_count+ssdopts[0])) + if [[ "${local_ssd_ext_count}" -gt "${GCE_MAX_LOCAL_SSD}" || "${local_ssd_ext_count}" -lt 1 ]]; then + echo -e "${color_red}Local SSD: Total number of local ssds must range from 1 to 8, found: ${local_ssd_ext_count} ${color_norm}" >&2 + exit 2 + fi +} + # Robustly try to create an instance template. # $1: The name of the instance template. # $2: The scopes flag. @@ -587,6 +617,19 @@ function create-node-template() { fi local local_ssds="" + local_ssd_ext_count=0 + if [[ ! -z ${NODE_LOCAL_SSDS_EXT:-} ]]; then + IFS=";" read -r -a ssdgroups <<< "${NODE_LOCAL_SSDS_EXT:-}" + for ssdgroup in "${ssdgroups[@]}" + do + IFS="," read -r -a ssdopts <<< "${ssdgroup}" + validate-node-local-ssds-ext "${ssdopts}" + for i in $(seq ${ssdopts[0]}); do + local_ssds="$local_ssds--local-ssd=interface=${ssdopts[1]} " + done + done + fi + if [[ ! 
-z ${NODE_LOCAL_SSDS+x} ]]; then # The NODE_LOCAL_SSDS check below fixes issue #49171 # Some versions of seq will count down from 1 if "seq 0" is specified @@ -596,6 +639,7 @@ function create-node-template() { done fi fi + local network=$(make-gcloud-network-argument \ "${NETWORK_PROJECT}" \ diff --git a/test/e2e/storage/persistent_volumes-local.go b/test/e2e/storage/persistent_volumes-local.go index 6744fc78cb..49b3dbe91d 100644 --- a/test/e2e/storage/persistent_volumes-local.go +++ b/test/e2e/storage/persistent_volumes-local.go @@ -21,6 +21,8 @@ import ( "fmt" "path" "path/filepath" + "strconv" + "strings" "time" . "github.com/onsi/ginkgo" @@ -54,8 +56,22 @@ const ( DirectoryLocalVolumeType LocalVolumeType = "dir" // creates a tmpfs and mounts it TmpfsLocalVolumeType LocalVolumeType = "tmpfs" + // tests based on local ssd at /mnt/disks/by-uuid/ + GCELocalSSDVolumeType LocalVolumeType = "gce-localssd-scsi-fs" ) +var setupLocalVolumeMap = map[LocalVolumeType]func(*localTestConfig) *localTestVolume{ + GCELocalSSDVolumeType: setupLocalVolumeGCELocalSSD, + TmpfsLocalVolumeType: setupLocalVolumeTmpfs, + DirectoryLocalVolumeType: setupLocalVolumeDirectory, +} + +var cleanupLocalVolumeMap = map[LocalVolumeType]func(*localTestConfig, *localTestVolume){ + GCELocalSSDVolumeType: cleanupLocalVolumeGCELocalSSD, + TmpfsLocalVolumeType: cleanupLocalVolumeTmpfs, + DirectoryLocalVolumeType: cleanupLocalVolumeDirectory, +} + type localTestVolume struct { // Node that the volume is on node *v1.Node @@ -199,105 +215,100 @@ var _ = SIGDescribe("PersistentVolumes-local [Feature:LocalPersistentVolumes] [S }) }) - LocalVolumeTypes := []LocalVolumeType{DirectoryLocalVolumeType, TmpfsLocalVolumeType} - - Context("when two pods mount a local volume at the same time", func() { - It("should be able to write from pod1 and read from pod2", func() { - for _, testVolType := range LocalVolumeTypes { - var testVol *localTestVolume - By(fmt.Sprintf("local-volume-type: %s", testVolType)) - testVol = setupLocalVolumePVCPV(config, testVolType) - twoPodsReadWriteTest(config, testVol) - cleanupLocalVolume(config, testVol) - } - }) - }) - - Context("when two pods mount a local volume one after the other", func() { - It("should be able to write from pod1 and read from pod2", func() { - for _, testVolType := range LocalVolumeTypes { - var testVol *localTestVolume - By(fmt.Sprintf("local-volume-type: %s", testVolType)) - testVol = setupLocalVolumePVCPV(config, testVolType) - twoPodsReadWriteSerialTest(config, testVol) - cleanupLocalVolume(config, testVol) - } - }) - }) - - Context("when pod using local volume with non-existant path", func() { - ep := &eventPatterns{ - reason: "FailedMount", - pattern: make([]string, 2)} - ep.pattern = append(ep.pattern, "MountVolume.SetUp failed") - ep.pattern = append(ep.pattern, "does not exist") - - It("should not be able to mount", func() { - for _, testVolType := range LocalVolumeTypes { - By(fmt.Sprintf("local-volume-type: %s", testVolType)) - testVol := &localTestVolume{ - node: config.node0, - hostDir: "/non-existent/location/nowhere", - localVolumeType: testVolType, + LocalVolumeTypes := []LocalVolumeType{DirectoryLocalVolumeType, TmpfsLocalVolumeType, GCELocalSSDVolumeType} + for _, tempTestVolType := range LocalVolumeTypes { + // New variable required for gingko test closures + testVolType := tempTestVolType + ctxString := fmt.Sprintf("when using volume type %s", testVolType) + Context(ctxString, func() { + BeforeEach(func() { + if testVolType == GCELocalSSDVolumeType { + 
SkipUnlessLocalSSDExists("scsi", "fs", config.node0) } - By("Creating local PVC and PV") - createLocalPVCPV(config, testVol) - pod, err := createLocalPod(config, testVol) - Expect(err).To(HaveOccurred()) - checkPodEvents(config, pod.Name, ep) - } - }) - }) - - Context("when pod's node is different from PV's NodeAffinity", func() { - - BeforeEach(func() { - if len(config.nodes.Items) < 2 { - framework.Skipf("Runs only when number of nodes >= 2") - } - }) - - ep := &eventPatterns{ - reason: "FailedScheduling", - pattern: make([]string, 2)} - ep.pattern = append(ep.pattern, "MatchNodeSelector") - ep.pattern = append(ep.pattern, "NoVolumeNodeConflict") - for _, testVolType := range LocalVolumeTypes { - - It("should not be able to mount due to different NodeAffinity", func() { - - testPodWithNodeName(config, testVolType, ep, config.nodes.Items[1].Name, makeLocalPodWithNodeAffinity) }) - It("should not be able to mount due to different NodeSelector", func() { + Context("when two pods mount a local volume at the same time", func() { + It("should be able to write from pod1 and read from pod2", func() { + var testVol *localTestVolume + testVol = setupLocalVolumePVCPV(config, testVolType) + twoPodsReadWriteTest(config, testVol) + cleanupLocalVolume(config, testVol) + }) + + }) + Context("when two pods mount a local volume one after the other", func() { + It("should be able to write from pod1 and read from pod2", func() { + var testVol *localTestVolume + testVol = setupLocalVolumePVCPV(config, testVolType) + twoPodsReadWriteSerialTest(config, testVol) + cleanupLocalVolume(config, testVol) + }) + }) + Context("when pod using local volume with non-existant path", func() { + + ep := &eventPatterns{ + reason: "FailedMount", + pattern: make([]string, 2)} + ep.pattern = append(ep.pattern, "MountVolume.SetUp failed") + ep.pattern = append(ep.pattern, "does not exist") + + It("should not be able to mount", func() { + testVol := &localTestVolume{ + node: config.node0, + hostDir: "/non-existent/location/nowhere", + localVolumeType: testVolType, + } + By("Creating local PVC and PV") + createLocalPVCPV(config, testVol) + pod, err := createLocalPod(config, testVol) + Expect(err).To(HaveOccurred()) + checkPodEvents(config, pod.Name, ep) + }) + + }) + Context("when pod's node is different from PV's NodeAffinity", func() { + + BeforeEach(func() { + if len(config.nodes.Items) < 2 { + framework.Skipf("Runs only when number of nodes >= 2") + } + }) + + ep := &eventPatterns{ + reason: "FailedScheduling", + pattern: make([]string, 2)} + ep.pattern = append(ep.pattern, "MatchNodeSelector") + ep.pattern = append(ep.pattern, "NoVolumeNodeConflict") + + It("should not be able to mount due to different NodeAffinity", func() { + testPodWithNodeName(config, testVolType, ep, config.nodes.Items[1].Name, makeLocalPodWithNodeAffinity) + }) + It("should not be able to mount due to different NodeSelector", func() { + testPodWithNodeName(config, testVolType, ep, config.nodes.Items[1].Name, makeLocalPodWithNodeSelector) + }) + + }) + Context("when pod's node is different from PV's NodeName", func() { + + BeforeEach(func() { + if len(config.nodes.Items) < 2 { + framework.Skipf("Runs only when number of nodes >= 2") + } + }) + + ep := &eventPatterns{ + reason: "FailedMount", + pattern: make([]string, 2)} + ep.pattern = append(ep.pattern, "NodeSelectorTerm") + ep.pattern = append(ep.pattern, "Storage node affinity check failed") + It("should not be able to mount due to different NodeName", func() { + testPodWithNodeName(config, 
testVolType, ep, config.nodes.Items[1].Name, makeLocalPodWithNodeName) + }) - testPodWithNodeName(config, testVolType, ep, config.nodes.Items[1].Name, makeLocalPodWithNodeSelector) }) - } - }) - - Context("when pod's node is different from PV's NodeName", func() { - - BeforeEach(func() { - if len(config.nodes.Items) < 2 { - framework.Skipf("Runs only when number of nodes >= 2") - } }) - - ep := &eventPatterns{ - reason: "FailedMount", - pattern: make([]string, 2)} - ep.pattern = append(ep.pattern, "NodeSelectorTerm") - ep.pattern = append(ep.pattern, "Storage node affinity check failed") - for _, testVolType := range LocalVolumeTypes { - - It("should not be able to mount due to different NodeName", func() { - - testPodWithNodeName(config, testVolType, ep, config.nodes.Items[1].Name, makeLocalPodWithNodeName) - }) - } - }) + } Context("when using local volume provisioner", func() { var volumePath string @@ -362,7 +373,6 @@ type makeLocalPodWith func(config *localTestConfig, volume *localTestVolume, nod func testPodWithNodeName(config *localTestConfig, testVolType LocalVolumeType, ep *eventPatterns, nodeName string, makeLocalPodFunc makeLocalPodWith) { var testVol *localTestVolume - By(fmt.Sprintf("local-volume-type: %s", testVolType)) testVol = setupLocalVolumePVCPV(config, testVolType) pod := makeLocalPodFunc(config, testVol, nodeName) @@ -486,16 +496,7 @@ func podNodeName(config *localTestConfig, pod *v1.Pod) (string, error) { return runtimePod.Spec.NodeName, runtimePodErr } -// setupLocalVolume setups a directory to user for local PV -func setupLocalVolume(config *localTestConfig, localVolumeType LocalVolumeType) *localTestVolume { - testDirName := "local-volume-test-" + string(uuid.NewUUID()) - hostDir := filepath.Join(hostBase, testDirName) - - if localVolumeType == TmpfsLocalVolumeType { - createAndMountTmpfsLocalVolume(config, hostDir) - } - - // populate volume with testFile containing testFileContent +func setupWriteTestFile(hostDir string, config *localTestConfig, localVolumeType LocalVolumeType) *localTestVolume { writeCmd, _ := createWriteAndReadCmds(hostDir, testFile, testFileContent) By(fmt.Sprintf("Creating local volume on node %q at path %q", config.node0.Name, hostDir)) err := framework.IssueSSHCommand(writeCmd, framework.TestContext.Provider, config.node0) @@ -507,7 +508,30 @@ func setupLocalVolume(config *localTestConfig, localVolumeType LocalVolumeType) } } -// Deletes the PVC/PV, and launches a pod with hostpath volume to remove the test directory +func setupLocalVolumeTmpfs(config *localTestConfig) *localTestVolume { + testDirName := "local-volume-test-" + string(uuid.NewUUID()) + hostDir := filepath.Join(hostBase, testDirName) + createAndMountTmpfsLocalVolume(config, hostDir) + // populate volume with testFile containing testFileContent + return setupWriteTestFile(hostDir, config, TmpfsLocalVolumeType) +} + +func setupLocalVolumeGCELocalSSD(config *localTestConfig) *localTestVolume { + res, err := framework.IssueSSHCommandWithResult("ls /mnt/disks/by-uuid/google-local-ssds-scsi-fs/", framework.TestContext.Provider, config.node0) + Expect(err).NotTo(HaveOccurred()) + dirName := strings.Fields(res.Stdout)[0] + hostDir := "/mnt/disks/by-uuid/google-local-ssds-scsi-fs/" + dirName + // populate volume with testFile containing testFileContent + return setupWriteTestFile(hostDir, config, GCELocalSSDVolumeType) +} + +func setupLocalVolumeDirectory(config *localTestConfig) *localTestVolume { + testDirName := "local-volume-test-" + string(uuid.NewUUID()) + hostDir := 
filepath.Join(hostBase, testDirName) + // populate volume with testFile containing testFileContent + return setupWriteTestFile(hostDir, config, DirectoryLocalVolumeType) +} + func cleanupLocalVolume(config *localTestConfig, volume *localTestVolume) { if volume == nil { return @@ -519,10 +543,30 @@ func cleanupLocalVolume(config *localTestConfig, volume *localTestVolume) { framework.Failf("Failed to delete PV and/or PVC: %v", utilerrors.NewAggregate(errs)) } - if volume.localVolumeType == TmpfsLocalVolumeType { - unmountTmpfsLocalVolume(config, volume.hostDir) - } + cleanup := cleanupLocalVolumeMap[volume.localVolumeType] + cleanup(config, volume) +} +// Deletes the PVC/PV, and launches a pod with hostpath volume to remove the test directory +func cleanupLocalVolumeGCELocalSSD(config *localTestConfig, volume *localTestVolume) { + By("Removing the test directory") + removeCmd := fmt.Sprintf("rm %s", volume.hostDir+"/"+testFile) + err := framework.IssueSSHCommand(removeCmd, framework.TestContext.Provider, config.node0) + Expect(err).NotTo(HaveOccurred()) +} + +// Deletes the PVC/PV, and launches a pod with hostpath volume to remove the test directory +func cleanupLocalVolumeTmpfs(config *localTestConfig, volume *localTestVolume) { + unmountTmpfsLocalVolume(config, volume.hostDir) + + By("Removing the test directory") + removeCmd := fmt.Sprintf("rm -r %s", volume.hostDir) + err := framework.IssueSSHCommand(removeCmd, framework.TestContext.Provider, config.node0) + Expect(err).NotTo(HaveOccurred()) +} + +// Deletes the PVC/PV, and launches a pod with hostpath volume to remove the test directory +func cleanupLocalVolumeDirectory(config *localTestConfig, volume *localTestVolume) { By("Removing the test directory") removeCmd := fmt.Sprintf("rm -r %s", volume.hostDir) err := framework.IssueSSHCommand(removeCmd, framework.TestContext.Provider, config.node0) @@ -703,7 +747,9 @@ func podRWCmdExec(pod *v1.Pod, cmd string) string { // and create local PVC and PV func setupLocalVolumePVCPV(config *localTestConfig, localVolumeType LocalVolumeType) *localTestVolume { By("Initializing test volume") - testVol := setupLocalVolume(config, localVolumeType) + setupLocalVolume, ok := setupLocalVolumeMap[localVolumeType] + Expect(ok).To(BeTrue()) + testVol := setupLocalVolume(config) By("Creating local PVC and PV") createLocalPVCPV(config, testVol) @@ -921,3 +967,16 @@ func findLocalPersistentVolume(c clientset.Interface, volumePath string) (*v1.Pe } return nil, fmt.Errorf("Unable to find local persistent volume with path %v", volumePath) } + +// SkipUnlessLocalSSDExists takes in an ssdInterface (scsi/nvme) and a filesystemType (fs/block) +// and skips if a disk of that type does not exist on the node +func SkipUnlessLocalSSDExists(ssdInterface, filesystemType string, node *v1.Node) { + ssdCmd := fmt.Sprintf("ls -1 /mnt/disks/by-uuid/google-local-ssds-%s-%s/ | wc -l", ssdInterface, filesystemType) + res, err := framework.IssueSSHCommandWithResult(ssdCmd, framework.TestContext.Provider, node) + Expect(err).NotTo(HaveOccurred()) + num, err := strconv.Atoi(strings.TrimSpace(res.Stdout)) + Expect(err).NotTo(HaveOccurred()) + if num < 1 { + framework.Skipf("Requires at least 1 %s %s localSSD ", ssdInterface, filesystemType) + } +} From 79e1da68d2f93f8becc903f84f4da515989eaa69 Mon Sep 17 00:00:00 2001 From: rohitjogvmw Date: Wed, 15 Nov 2017 22:24:23 -0800 Subject: [PATCH 04/33] Updating vSphere Cloud Provider (VCP) to support k8s cluster spead across multiple ESXi clusters, datacenters or even vSphere vCenters - 
vsphere.conf (cloud-config) is now needed only on master node - VCP uses OS hostname and not vSphere inventory name - VCP is now resilient to VM inventory name change and VM migration --- pkg/cloudprovider/providers/vsphere/BUILD | 5 +- .../providers/vsphere/nodemanager.go | 295 +++++++ .../providers/vsphere/vclib/custom_errors.go | 2 + .../providers/vsphere/vclib/datacenter.go | 70 +- .../providers/vsphere/vclib/datastore.go | 11 + .../vsphere/vclib/diskmanagers/vdm.go | 8 +- .../vsphere/vclib/diskmanagers/virtualdisk.go | 6 +- .../vsphere/vclib/diskmanagers/vmdm.go | 2 +- .../providers/vsphere/vclib/pbm.go | 10 +- .../providers/vsphere/vclib/utils.go | 43 +- .../providers/vsphere/vclib/virtualmachine.go | 44 +- .../providers/vsphere/vsphere.go | 768 +++++++++++++----- .../providers/vsphere/vsphere_test.go | 19 +- .../providers/vsphere/vsphere_util.go | 341 ++++++-- pkg/volume/vsphere_volume/attacher.go | 2 +- .../e2e/storage/persistent_volumes-vsphere.go | 12 +- test/e2e/storage/pv_reclaimpolicy.go | 14 +- test/e2e/storage/pvc_label_selector.go | 5 +- test/e2e/storage/volumes.go | 7 +- test/e2e/storage/vsphere_scale.go | 6 +- test/e2e/storage/vsphere_statefulsets.go | 7 +- test/e2e/storage/vsphere_stress.go | 10 +- test/e2e/storage/vsphere_utils.go | 44 +- test/e2e/storage/vsphere_volume_cluster_ds.go | 7 +- test/e2e/storage/vsphere_volume_datastore.go | 2 +- test/e2e/storage/vsphere_volume_diskformat.go | 4 +- test/e2e/storage/vsphere_volume_fstype.go | 10 +- .../storage/vsphere_volume_master_restart.go | 9 +- .../storage/vsphere_volume_node_poweroff.go | 8 +- test/e2e/storage/vsphere_volume_ops_storm.go | 6 +- test/e2e/storage/vsphere_volume_perf.go | 7 +- test/e2e/storage/vsphere_volume_placement.go | 8 +- .../e2e/storage/vsphere_volume_vsan_policy.go | 6 +- 33 files changed, 1412 insertions(+), 386 deletions(-) create mode 100644 pkg/cloudprovider/providers/vsphere/nodemanager.go diff --git a/pkg/cloudprovider/providers/vsphere/BUILD b/pkg/cloudprovider/providers/vsphere/BUILD index f5b75b0c9e..91de27e7ad 100644 --- a/pkg/cloudprovider/providers/vsphere/BUILD +++ b/pkg/cloudprovider/providers/vsphere/BUILD @@ -9,6 +9,7 @@ load( go_library( name = "go_default_library", srcs = [ + "nodemanager.go", "vsphere.go", "vsphere_util.go", ], @@ -21,13 +22,15 @@ go_library( "//pkg/controller:go_default_library", "//vendor/github.com/golang/glog:go_default_library", "//vendor/github.com/vmware/govmomi:go_default_library", - "//vendor/github.com/vmware/govmomi/object:go_default_library", "//vendor/github.com/vmware/govmomi/vim25:go_default_library", "//vendor/github.com/vmware/govmomi/vim25/mo:go_default_library", "//vendor/golang.org/x/net/context:go_default_library", "//vendor/gopkg.in/gcfg.v1:go_default_library", "//vendor/k8s.io/api/core/v1:go_default_library", "//vendor/k8s.io/apimachinery/pkg/types:go_default_library", + "//vendor/k8s.io/apimachinery/pkg/util/wait:go_default_library", + "//vendor/k8s.io/client-go/informers:go_default_library", + "//vendor/k8s.io/client-go/tools/cache:go_default_library", ], ) diff --git a/pkg/cloudprovider/providers/vsphere/nodemanager.go b/pkg/cloudprovider/providers/vsphere/nodemanager.go new file mode 100644 index 0000000000..493ea61045 --- /dev/null +++ b/pkg/cloudprovider/providers/vsphere/nodemanager.go @@ -0,0 +1,295 @@ +/* +Copyright 2016 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package vsphere + +import ( + "fmt" + "github.com/golang/glog" + "golang.org/x/net/context" + "k8s.io/api/core/v1" + k8stypes "k8s.io/apimachinery/pkg/types" + "k8s.io/kubernetes/pkg/cloudprovider/providers/vsphere/vclib" + "strings" + "sync" +) + +// Stores info about the kubernetes node +type NodeInfo struct { + dataCenter *vclib.Datacenter + vm *vclib.VirtualMachine + vcServer string +} + +type NodeManager struct { + // TODO: replace map with concurrent map when k8s supports go v1.9 + + // Maps the VC server to VSphereInstance + vsphereInstanceMap map[string]*VSphereInstance + // Maps node name to node info. + nodeInfoMap map[string]*NodeInfo + // Maps node name to node structure + registeredNodes map[string]*v1.Node + + // Mutexes + registeredNodesLock sync.RWMutex + nodeInfoLock sync.RWMutex +} + +type NodeDetails struct { + NodeName string + vm *vclib.VirtualMachine +} + +// TODO: Make it configurable in vsphere.conf +const ( + POOL_SIZE = 8 + QUEUE_SIZE = POOL_SIZE * 10 +) + +func (nm *NodeManager) DiscoverNode(node *v1.Node) error { + type VmSearch struct { + vc string + datacenter *vclib.Datacenter + } + + var mutex = &sync.Mutex{} + var globalErrMutex = &sync.Mutex{} + var queueChannel chan *VmSearch + var wg sync.WaitGroup + var globalErr *error + + queueChannel = make(chan *VmSearch, QUEUE_SIZE) + nodeUUID := node.Status.NodeInfo.SystemUUID + vmFound := false + globalErr = nil + + setGlobalErr := func(err error) { + globalErrMutex.Lock() + globalErr = &err + globalErrMutex.Unlock() + } + + setVMFound := func(found bool) { + mutex.Lock() + vmFound = found + mutex.Unlock() + } + + getVMFound := func() bool { + mutex.Lock() + found := vmFound + mutex.Unlock() + return found + } + + go func() { + var datacenterObjs []*vclib.Datacenter + for vc, vsi := range nm.vsphereInstanceMap { + + found := getVMFound() + if found == true { + break + } + + // Create context + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + err := vsi.conn.Connect(ctx) + if err != nil { + glog.V(4).Info("Discovering node error vc:", err) + setGlobalErr(err) + continue + } + + if vsi.cfg.Datacenters == "" { + datacenterObjs, err = vclib.GetAllDatacenter(ctx, vsi.conn) + if err != nil { + glog.V(4).Info("Discovering node error dc:", err) + setGlobalErr(err) + continue + } + } else { + datacenters := strings.Split(vsi.cfg.Datacenters, ",") + for _, dc := range datacenters { + dc = strings.TrimSpace(dc) + if dc == "" { + continue + } + datacenterObj, err := vclib.GetDatacenter(ctx, vsi.conn, dc) + if err != nil { + glog.V(4).Info("Discovering node error dc:", err) + setGlobalErr(err) + continue + } + datacenterObjs = append(datacenterObjs, datacenterObj) + } + } + + for _, datacenterObj := range datacenterObjs { + found := getVMFound() + if found == true { + break + } + + glog.V(4).Infof("Finding node %s in vc=%s and datacenter=%s", node.Name, vc, datacenterObj.Name()) + queueChannel <- &VmSearch{ + vc: vc, + datacenter: datacenterObj, + } + } + } + close(queueChannel) + }() + + for i := 0; i < POOL_SIZE; i++ { + go func() { + for res := range queueChannel { + ctx, cancel := 
context.WithCancel(context.Background()) + defer cancel() + vm, err := res.datacenter.GetVMByUUID(ctx, nodeUUID) + if err != nil { + glog.V(4).Infof("Error %q while looking for vm=%+v in vc=%s and datacenter=%s", + err, node.Name, vm, res.vc, res.datacenter.Name()) + if err != vclib.ErrNoVMFound { + setGlobalErr(err) + } else { + glog.V(4).Infof("Did not find node %s in vc=%s and datacenter=%s", + node.Name, res.vc, res.datacenter.Name(), err) + } + continue + } + if vm != nil { + glog.V(4).Infof("Found node %s as vm=%+v in vc=%s and datacenter=%s", + node.Name, vm, res.vc, res.datacenter.Name()) + + nodeInfo := &NodeInfo{dataCenter: res.datacenter, vm: vm, vcServer: res.vc} + nm.addNodeInfo(node.ObjectMeta.Name, nodeInfo) + for range queueChannel { + } + setVMFound(true) + break + } + } + wg.Done() + }() + wg.Add(1) + } + wg.Wait() + if vmFound { + return nil + } + if globalErr != nil { + return *globalErr + } + + glog.V(4).Infof("Discovery Node: %q vm not found", node.Name) + return vclib.ErrNoVMFound +} + +func (nm *NodeManager) RegisterNode(node *v1.Node) error { + nm.addNode(node) + nm.DiscoverNode(node) + return nil +} + +func (nm *NodeManager) UnRegisterNode(node *v1.Node) error { + nm.removeNode(node) + return nil +} + +func (nm *NodeManager) RediscoverNode(nodeName k8stypes.NodeName) error { + node, err := nm.GetNode(nodeName) + + if err != nil { + return err + } + return nm.DiscoverNode(&node) +} + +func (nm *NodeManager) GetNode(nodeName k8stypes.NodeName) (v1.Node, error) { + nm.registeredNodesLock.RLock() + node := nm.registeredNodes[convertToString(nodeName)] + nm.registeredNodesLock.RUnlock() + if node == nil { + return v1.Node{}, vclib.ErrNoVMFound + } + return *node, nil +} + +func (nm *NodeManager) addNode(node *v1.Node) { + nm.registeredNodesLock.Lock() + nm.registeredNodes[node.ObjectMeta.Name] = node + nm.registeredNodesLock.Unlock() +} + +func (nm *NodeManager) removeNode(node *v1.Node) { + nm.registeredNodesLock.Lock() + delete(nm.registeredNodes, node.ObjectMeta.Name) + nm.registeredNodesLock.Unlock() +} + +// GetNodeInfo returns a NodeInfo which datacenter, vm and vc server ip address. +// This method returns an error if it is unable find node VCs and DCs listed in vSphere.conf +// NodeInfo returned may not be updated to reflect current VM location. 
+func (nm *NodeManager) GetNodeInfo(nodeName k8stypes.NodeName) (NodeInfo, error) { + getNodeInfo := func(nodeName k8stypes.NodeName) *NodeInfo { + nm.nodeInfoLock.RLock() + nodeInfo := nm.nodeInfoMap[convertToString(nodeName)] + nm.nodeInfoLock.RUnlock() + return nodeInfo + } + nodeInfo := getNodeInfo(nodeName) + if nodeInfo == nil { + err := nm.RediscoverNode(nodeName) + if err != nil { + glog.V(4).Infof("error %q node info for node %q not found", err, convertToString(nodeName)) + return NodeInfo{}, err + } + nodeInfo = getNodeInfo(nodeName) + } + return *nodeInfo, nil +} + +func (nm *NodeManager) GetNodeDetails() []NodeDetails { + nm.nodeInfoLock.RLock() + defer nm.nodeInfoLock.RUnlock() + var nodeDetails []NodeDetails + for nodeName, nodeInfo := range nm.nodeInfoMap { + nodeDetails = append(nodeDetails, NodeDetails{nodeName, nodeInfo.vm}) + } + return nodeDetails +} + +func (nm *NodeManager) addNodeInfo(nodeName string, nodeInfo *NodeInfo) { + nm.nodeInfoLock.Lock() + nm.nodeInfoMap[nodeName] = nodeInfo + nm.nodeInfoLock.Unlock() +} + +func (nm *NodeManager) GetVSphereInstance(nodeName k8stypes.NodeName) (VSphereInstance, error) { + nodeInfo, err := nm.GetNodeInfo(nodeName) + if err != nil { + glog.V(4).Infof("node info for node %q not found", convertToString(nodeName)) + return VSphereInstance{}, err + } + vsphereInstance := nm.vsphereInstanceMap[nodeInfo.vcServer] + if vsphereInstance == nil { + return VSphereInstance{}, fmt.Errorf("vSphereInstance for vc server %q not found while looking for node %q", nodeInfo.vcServer, convertToString(nodeName)) + } + return *vsphereInstance, nil +} diff --git a/pkg/cloudprovider/providers/vsphere/vclib/custom_errors.go b/pkg/cloudprovider/providers/vsphere/vclib/custom_errors.go index 391f328f42..6709c4cf21 100644 --- a/pkg/cloudprovider/providers/vsphere/vclib/custom_errors.go +++ b/pkg/cloudprovider/providers/vsphere/vclib/custom_errors.go @@ -25,6 +25,7 @@ const ( NoDevicesFoundErrMsg = "No devices found" DiskNotFoundErrMsg = "No vSphere disk ID found" InvalidVolumeOptionsErrMsg = "VolumeOptions verification failed" + NoVMFoundErrMsg = "No VM found" ) // Error constants @@ -34,4 +35,5 @@ var ( ErrNoDevicesFound = errors.New(NoDevicesFoundErrMsg) ErrNoDiskIDFound = errors.New(DiskNotFoundErrMsg) ErrInvalidVolumeOptions = errors.New(InvalidVolumeOptionsErrMsg) + ErrNoVMFound = errors.New(NoVMFoundErrMsg) ) diff --git a/pkg/cloudprovider/providers/vsphere/vclib/datacenter.go b/pkg/cloudprovider/providers/vsphere/vclib/datacenter.go index ebb54b9431..d325c72dfe 100644 --- a/pkg/cloudprovider/providers/vsphere/vclib/datacenter.go +++ b/pkg/cloudprovider/providers/vsphere/vclib/datacenter.go @@ -49,6 +49,22 @@ func GetDatacenter(ctx context.Context, connection *VSphereConnection, datacente return &dc, nil } +// GetAllDatacenter returns all the DataCenter Objects +func GetAllDatacenter(ctx context.Context, connection *VSphereConnection) ([]*Datacenter, error) { + var dc []*Datacenter + finder := find.NewFinder(connection.GoVmomiClient.Client, true) + datacenters, err := finder.DatacenterList(ctx, "*") + if err != nil { + glog.Errorf("Failed to find the datacenter. 
err: %+v", err) + return nil, err + } + for _, datacenter := range datacenters { + dc = append(dc, &(Datacenter{datacenter})) + } + + return dc, nil +} + // GetVMByUUID gets the VM object from the given vmUUID func (dc *Datacenter) GetVMByUUID(ctx context.Context, vmUUID string) (*VirtualMachine, error) { s := object.NewSearchIndex(dc.Client()) @@ -60,7 +76,7 @@ func (dc *Datacenter) GetVMByUUID(ctx context.Context, vmUUID string) (*VirtualM } if svm == nil { glog.Errorf("Unable to find VM by UUID. VM UUID: %s", vmUUID) - return nil, fmt.Errorf("Failed to find VM by UUID: %s", vmUUID) + return nil, ErrNoVMFound } virtualMachine := VirtualMachine{object.NewVirtualMachine(dc.Client(), svm.Reference()), dc} return &virtualMachine, nil @@ -79,6 +95,41 @@ func (dc *Datacenter) GetVMByPath(ctx context.Context, vmPath string) (*VirtualM return &virtualMachine, nil } +// GetAllDatastores gets the datastore URL to DatastoreInfo map for all the datastores in +// the datacenter. +func (dc *Datacenter) GetAllDatastores(ctx context.Context) (map[string]*DatastoreInfo, error) { + finder := getFinder(dc) + datastores, err := finder.DatastoreList(ctx, "*") + if err != nil { + glog.Errorf("Failed to get all the datastores. err: %+v", err) + return nil, err + } + var dsList []types.ManagedObjectReference + for _, ds := range datastores { + dsList = append(dsList, ds.Reference()) + } + + var dsMoList []mo.Datastore + pc := property.DefaultCollector(dc.Client()) + properties := []string{DatastoreInfoProperty} + err = pc.Retrieve(ctx, dsList, properties, &dsMoList) + if err != nil { + glog.Errorf("Failed to get Datastore managed objects from datastore objects."+ + " dsObjList: %+v, properties: %+v, err: %v", dsList, properties, err) + return nil, err + } + + dsURLInfoMap := make(map[string]*DatastoreInfo) + for _, dsMo := range dsMoList { + dsURLInfoMap[dsMo.Info.GetDatastoreInfo().Url] = &DatastoreInfo{ + &Datastore{object.NewDatastore(dc.Client(), dsMo.Reference()), + dc}, + dsMo.Info.GetDatastoreInfo()} + } + glog.V(9).Infof("dsURLInfoMap : %+v", dsURLInfoMap) + return dsURLInfoMap, nil +} + // GetDatastoreByPath gets the Datastore object from the given vmDiskPath func (dc *Datacenter) GetDatastoreByPath(ctx context.Context, vmDiskPath string) (*Datastore, error) { datastorePathObj := new(object.DatastorePath) @@ -109,6 +160,23 @@ func (dc *Datacenter) GetDatastoreByName(ctx context.Context, name string) (*Dat return &datastore, nil } +// GetResourcePool gets the resource pool for the given path +func (dc *Datacenter) GetResourcePool(ctx context.Context, computePath string) (*object.ResourcePool, error) { + finder := getFinder(dc) + var computeResource *object.ComputeResource + var err error + if computePath == "" { + computeResource, err = finder.DefaultComputeResource(ctx) + } else { + computeResource, err = finder.ComputeResource(ctx, computePath) + } + if err != nil { + glog.Errorf("Failed to get the ResourcePool for computePath '%s'. 
err: %+v", computePath, err) + return nil, err + } + return computeResource.ResourcePool(ctx) +} + // GetFolderByPath gets the Folder Object from the given folder path // folderPath should be the full path to folder func (dc *Datacenter) GetFolderByPath(ctx context.Context, folderPath string) (*Folder, error) { diff --git a/pkg/cloudprovider/providers/vsphere/vclib/datastore.go b/pkg/cloudprovider/providers/vsphere/vclib/datastore.go index 1901af1890..8fba424bbd 100644 --- a/pkg/cloudprovider/providers/vsphere/vclib/datastore.go +++ b/pkg/cloudprovider/providers/vsphere/vclib/datastore.go @@ -17,6 +17,7 @@ limitations under the License. package vclib import ( + "fmt" "github.com/golang/glog" "github.com/vmware/govmomi/object" "github.com/vmware/govmomi/property" @@ -32,6 +33,16 @@ type Datastore struct { Datacenter *Datacenter } +// DatastoreInfo is a structure to store the Datastore and it's Info. +type DatastoreInfo struct { + *Datastore + Info *types.DatastoreInfo +} + +func (di DatastoreInfo) String() string { + return fmt.Sprintf("Datastore: %+v, datastore URL: %s", di.Datastore, di.Info.Url) +} + // CreateDirectory creates the directory at location specified by directoryPath. // If the intermediate level folders do not exist, and the parameter createParents is true, all the non-existent folders are created. // directoryPath must be in the format "[vsanDatastore] kubevols" diff --git a/pkg/cloudprovider/providers/vsphere/vclib/diskmanagers/vdm.go b/pkg/cloudprovider/providers/vsphere/vclib/diskmanagers/vdm.go index 8d860b9d54..3e6d9b2ecd 100644 --- a/pkg/cloudprovider/providers/vsphere/vclib/diskmanagers/vdm.go +++ b/pkg/cloudprovider/providers/vsphere/vclib/diskmanagers/vdm.go @@ -70,13 +70,13 @@ func (diskManager virtualDiskManager) Create(ctx context.Context, datastore *vcl } // Delete implements Disk's Delete interface -func (diskManager virtualDiskManager) Delete(ctx context.Context, datastore *vclib.Datastore) error { +func (diskManager virtualDiskManager) Delete(ctx context.Context, datacenter *vclib.Datacenter) error { // Create a virtual disk manager - virtualDiskManager := object.NewVirtualDiskManager(datastore.Client()) - diskPath := vclib.RemoveClusterFromVDiskPath(diskManager.diskPath) + virtualDiskManager := object.NewVirtualDiskManager(datacenter.Client()) + diskPath := vclib.RemoveStorageClusterORFolderNameFromVDiskPath(diskManager.diskPath) requestTime := time.Now() // Delete virtual disk - task, err := virtualDiskManager.DeleteVirtualDisk(ctx, diskPath, datastore.Datacenter.Datacenter) + task, err := virtualDiskManager.DeleteVirtualDisk(ctx, diskPath, datacenter.Datacenter) if err != nil { glog.Errorf("Failed to delete virtual disk. 
err: %v", err) vclib.RecordvSphereMetric(vclib.APIDeleteVolume, requestTime, err) diff --git a/pkg/cloudprovider/providers/vsphere/vclib/diskmanagers/virtualdisk.go b/pkg/cloudprovider/providers/vsphere/vclib/diskmanagers/virtualdisk.go index fbe14b5fbb..533f49ece3 100644 --- a/pkg/cloudprovider/providers/vsphere/vclib/diskmanagers/virtualdisk.go +++ b/pkg/cloudprovider/providers/vsphere/vclib/diskmanagers/virtualdisk.go @@ -40,7 +40,7 @@ const ( // VirtualDiskProvider defines interfaces for creating disk type VirtualDiskProvider interface { Create(ctx context.Context, datastore *vclib.Datastore) (string, error) - Delete(ctx context.Context, datastore *vclib.Datastore) error + Delete(ctx context.Context, datacenter *vclib.Datacenter) error } // getDiskManager returns vmDiskManager or vdmDiskManager based on given volumeoptions @@ -75,6 +75,6 @@ func (virtualDisk *VirtualDisk) Create(ctx context.Context, datastore *vclib.Dat } // Delete gets appropriate disk manager and calls respective delete method -func (virtualDisk *VirtualDisk) Delete(ctx context.Context, datastore *vclib.Datastore) error { - return getDiskManager(virtualDisk, VirtualDiskDeleteOperation).Delete(ctx, datastore) +func (virtualDisk *VirtualDisk) Delete(ctx context.Context, datacenter *vclib.Datacenter) error { + return getDiskManager(virtualDisk, VirtualDiskDeleteOperation).Delete(ctx, datacenter) } diff --git a/pkg/cloudprovider/providers/vsphere/vclib/diskmanagers/vmdm.go b/pkg/cloudprovider/providers/vsphere/vclib/diskmanagers/vmdm.go index 62c7018c5c..6942dffb7f 100644 --- a/pkg/cloudprovider/providers/vsphere/vclib/diskmanagers/vmdm.go +++ b/pkg/cloudprovider/providers/vsphere/vclib/diskmanagers/vmdm.go @@ -157,7 +157,7 @@ func (vmdisk vmDiskManager) Create(ctx context.Context, datastore *vclib.Datasto return vmdisk.diskPath, nil } -func (vmdisk vmDiskManager) Delete(ctx context.Context, datastore *vclib.Datastore) error { +func (vmdisk vmDiskManager) Delete(ctx context.Context, datacenter *vclib.Datacenter) error { return fmt.Errorf("vmDiskManager.Delete is not supported") } diff --git a/pkg/cloudprovider/providers/vsphere/vclib/pbm.go b/pkg/cloudprovider/providers/vsphere/vclib/pbm.go index df749fb896..5ec83c62b3 100644 --- a/pkg/cloudprovider/providers/vsphere/vclib/pbm.go +++ b/pkg/cloudprovider/providers/vsphere/vclib/pbm.go @@ -85,7 +85,7 @@ func (pbmClient *PbmClient) IsDatastoreCompatible(ctx context.Context, storagePo // GetCompatibleDatastores filters and returns compatible list of datastores for given storage policy id // For Non Compatible Datastores, fault message with the Datastore Name is also returned -func (pbmClient *PbmClient) GetCompatibleDatastores(ctx context.Context, storagePolicyID string, datastores []*Datastore) ([]*Datastore, string, error) { +func (pbmClient *PbmClient) GetCompatibleDatastores(ctx context.Context, dc *Datacenter, storagePolicyID string, datastores []*DatastoreInfo) ([]*DatastoreInfo, string, error) { var ( dsMorNameMap = getDsMorNameMap(ctx, datastores) localizedMessagesForNotCompatibleDatastores = "" @@ -96,7 +96,7 @@ func (pbmClient *PbmClient) GetCompatibleDatastores(ctx context.Context, storage return nil, "", err } compatibleHubs := compatibilityResult.CompatibleDatastores() - var compatibleDatastoreList []*Datastore + var compatibleDatastoreList []*DatastoreInfo for _, hub := range compatibleHubs { compatibleDatastoreList = append(compatibleDatastoreList, getDatastoreFromPlacementHub(datastores, hub)) } @@ -121,7 +121,7 @@ func (pbmClient *PbmClient) 
GetCompatibleDatastores(ctx context.Context, storage } // GetPlacementCompatibilityResult gets placement compatibility result based on storage policy requirements. -func (pbmClient *PbmClient) GetPlacementCompatibilityResult(ctx context.Context, storagePolicyID string, datastore []*Datastore) (pbm.PlacementCompatibilityResult, error) { +func (pbmClient *PbmClient) GetPlacementCompatibilityResult(ctx context.Context, storagePolicyID string, datastore []*DatastoreInfo) (pbm.PlacementCompatibilityResult, error) { var hubs []pbmtypes.PbmPlacementHub for _, ds := range datastore { hubs = append(hubs, pbmtypes.PbmPlacementHub{ @@ -145,7 +145,7 @@ func (pbmClient *PbmClient) GetPlacementCompatibilityResult(ctx context.Context, } // getDataStoreForPlacementHub returns matching datastore associated with given pbmPlacementHub -func getDatastoreFromPlacementHub(datastore []*Datastore, pbmPlacementHub pbmtypes.PbmPlacementHub) *Datastore { +func getDatastoreFromPlacementHub(datastore []*DatastoreInfo, pbmPlacementHub pbmtypes.PbmPlacementHub) *DatastoreInfo { for _, ds := range datastore { if ds.Reference().Type == pbmPlacementHub.HubType && ds.Reference().Value == pbmPlacementHub.HubId { return ds @@ -155,7 +155,7 @@ func getDatastoreFromPlacementHub(datastore []*Datastore, pbmPlacementHub pbmtyp } // getDsMorNameMap returns map of ds Mor and Datastore Object Name -func getDsMorNameMap(ctx context.Context, datastores []*Datastore) map[string]string { +func getDsMorNameMap(ctx context.Context, datastores []*DatastoreInfo) map[string]string { dsMorNameMap := make(map[string]string) for _, ds := range datastores { dsObjectName, err := ds.ObjectName(ctx) diff --git a/pkg/cloudprovider/providers/vsphere/vclib/utils.go b/pkg/cloudprovider/providers/vsphere/vclib/utils.go index 791d05d33d..bac429d6de 100644 --- a/pkg/cloudprovider/providers/vsphere/vclib/utils.go +++ b/pkg/cloudprovider/providers/vsphere/vclib/utils.go @@ -25,6 +25,8 @@ import ( "github.com/golang/glog" "github.com/vmware/govmomi/find" "github.com/vmware/govmomi/object" + "github.com/vmware/govmomi/vim25/mo" + "github.com/vmware/govmomi/vim25/soap" "github.com/vmware/govmomi/vim25/types" ) @@ -121,10 +123,10 @@ func getSCSIControllers(vmDevices object.VirtualDeviceList) []*types.VirtualCont return scsiControllers } -// RemoveClusterFromVDiskPath removes the cluster or folder path from the vDiskPath +// RemoveStorageClusterORFolderNameFromVDiskPath removes the cluster or folder path from the vDiskPath // for vDiskPath [DatastoreCluster/sharedVmfs-0] kubevols/e2e-vmdk-1234.vmdk, return value is [sharedVmfs-0] kubevols/e2e-vmdk-1234.vmdk // for vDiskPath [sharedVmfs-0] kubevols/e2e-vmdk-1234.vmdk, return value remains same [sharedVmfs-0] kubevols/e2e-vmdk-1234.vmdk -func RemoveClusterFromVDiskPath(vDiskPath string) string { +func RemoveStorageClusterORFolderNameFromVDiskPath(vDiskPath string) string { datastore := regexp.MustCompile("\\[(.*?)\\]").FindStringSubmatch(vDiskPath)[1] if filepath.Base(datastore) != datastore { vDiskPath = strings.Replace(vDiskPath, datastore, filepath.Base(datastore), 1) @@ -172,3 +174,40 @@ func IsValidUUID(uuid string) bool { r := regexp.MustCompile("^[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[[a-fA-F0-9]{4}-[a-fA-F0-9]{12}$") return r.MatchString(uuid) } + +// IsManagedObjectNotFoundError returns true if error is of type ManagedObjectNotFound +func IsManagedObjectNotFoundError(err error) bool { + isManagedObjectNotFoundError := false + if soap.IsSoapFault(err) { + _, isManagedObjectNotFoundError = 
soap.ToSoapFault(err).VimFault().(types.ManagedObjectNotFound) + } + return isManagedObjectNotFoundError +} + +// VerifyVolumePathsForVM verifies if the volume paths (volPaths) are attached to VM. +func VerifyVolumePathsForVM(vmMo mo.VirtualMachine, volPaths []string, nodeName string, nodeVolumeMap map[string]map[string]bool) { + // Verify if the volume paths are present on the VM backing virtual disk devices + vmDevices := object.VirtualDeviceList(vmMo.Config.Hardware.Device) + VerifyVolumePathsForVMDevices(vmDevices, volPaths, nodeName, nodeVolumeMap) + +} + +// VerifyVolumePathsForVMDevices verifies if the volume paths (volPaths) are attached to VM. +func VerifyVolumePathsForVMDevices(vmDevices object.VirtualDeviceList, volPaths []string, nodeName string, nodeVolumeMap map[string]map[string]bool) { + volPathsMap := make(map[string]bool) + for _, volPath := range volPaths { + volPathsMap[volPath] = true + } + // Verify if the volume paths are present on the VM backing virtual disk devices + for _, device := range vmDevices { + if vmDevices.TypeName(device) == "VirtualDisk" { + virtualDevice := device.GetVirtualDevice() + if backing, ok := virtualDevice.Backing.(*types.VirtualDiskFlatVer2BackingInfo); ok { + if volPathsMap[backing.FileName] { + setNodeVolumeMap(nodeVolumeMap, backing.FileName, nodeName, true) + } + } + } + } + +} diff --git a/pkg/cloudprovider/providers/vsphere/vclib/virtualmachine.go b/pkg/cloudprovider/providers/vsphere/vclib/virtualmachine.go index 2796f6b687..8077b5583e 100644 --- a/pkg/cloudprovider/providers/vsphere/vclib/virtualmachine.go +++ b/pkg/cloudprovider/providers/vsphere/vclib/virtualmachine.go @@ -23,6 +23,7 @@ import ( "github.com/golang/glog" "github.com/vmware/govmomi/object" + "github.com/vmware/govmomi/property" "github.com/vmware/govmomi/vim25/mo" "github.com/vmware/govmomi/vim25/types" ) @@ -63,7 +64,7 @@ func (vm *VirtualMachine) AttachDisk(ctx context.Context, vmDiskPath string, vol return "", fmt.Errorf("Not a valid SCSI Controller Type. Valid options are %q", SCSIControllerTypeValidOptions()) } vmDiskPathCopy := vmDiskPath - vmDiskPath = RemoveClusterFromVDiskPath(vmDiskPath) + vmDiskPath = RemoveStorageClusterORFolderNameFromVDiskPath(vmDiskPath) attached, err := vm.IsDiskAttached(ctx, vmDiskPath) if err != nil { glog.Errorf("Error occurred while checking if disk is attached on VM: %q. vmDiskPath: %q, err: %+v", vm.InventoryPath, vmDiskPath, err) @@ -75,6 +76,20 @@ func (vm *VirtualMachine) AttachDisk(ctx context.Context, vmDiskPath string, vol return diskUUID, nil } + if volumeOptions.StoragePolicyName != "" { + pbmClient, err := NewPbmClient(ctx, vm.Client()) + if err != nil { + glog.Errorf("Error occurred while creating new pbmClient. err: %+v", err) + return "", err + } + + volumeOptions.StoragePolicyID, err = pbmClient.ProfileIDByName(ctx, volumeOptions.StoragePolicyName) + if err != nil { + glog.Errorf("Failed to get Profile ID by name: %s. err: %+v", volumeOptions.StoragePolicyName, err) + return "", err + } + } + dsObj, err := vm.Datacenter.GetDatastoreByPath(ctx, vmDiskPathCopy) if err != nil { glog.Errorf("Failed to get datastore from vmDiskPath: %q. 
err: %+v", vmDiskPath, err) @@ -139,7 +154,7 @@ func (vm *VirtualMachine) AttachDisk(ctx context.Context, vmDiskPath string, vol // DetachDisk detaches the disk specified by vmDiskPath func (vm *VirtualMachine) DetachDisk(ctx context.Context, vmDiskPath string) error { - vmDiskPath = RemoveClusterFromVDiskPath(vmDiskPath) + vmDiskPath = RemoveStorageClusterORFolderNameFromVDiskPath(vmDiskPath) device, err := vm.getVirtualDeviceByPath(ctx, vmDiskPath) if err != nil { glog.Errorf("Disk ID not found for VM: %q with diskPath: %q", vm.InventoryPath, vmDiskPath) @@ -186,7 +201,7 @@ func (vm *VirtualMachine) IsActive(ctx context.Context) (bool, error) { } // GetAllAccessibleDatastores gets the list of accessible Datastores for the given Virtual Machine -func (vm *VirtualMachine) GetAllAccessibleDatastores(ctx context.Context) ([]*Datastore, error) { +func (vm *VirtualMachine) GetAllAccessibleDatastores(ctx context.Context) ([]*DatastoreInfo, error) { host, err := vm.HostSystem(ctx) if err != nil { glog.Errorf("Failed to get host system for VM: %q. err: %+v", vm.InventoryPath, err) @@ -199,9 +214,28 @@ func (vm *VirtualMachine) GetAllAccessibleDatastores(ctx context.Context) ([]*Da glog.Errorf("Failed to retrieve datastores for host: %+v. err: %+v", host, err) return nil, err } - var dsObjList []*Datastore + var dsRefList []types.ManagedObjectReference for _, dsRef := range hostSystemMo.Datastore { - dsObjList = append(dsObjList, &Datastore{object.NewDatastore(vm.Client(), dsRef), vm.Datacenter}) + dsRefList = append(dsRefList, dsRef) + } + + var dsMoList []mo.Datastore + pc := property.DefaultCollector(vm.Client()) + properties := []string{DatastoreInfoProperty} + err = pc.Retrieve(ctx, dsRefList, properties, &dsMoList) + if err != nil { + glog.Errorf("Failed to get Datastore managed objects from datastore objects."+ + " dsObjList: %+v, properties: %+v, err: %v", dsRefList, properties, err) + return nil, err + } + glog.V(9).Infof("Result dsMoList: %+v", dsMoList) + var dsObjList []*DatastoreInfo + for _, dsMo := range dsMoList { + dsObjList = append(dsObjList, + &DatastoreInfo{ + &Datastore{object.NewDatastore(vm.Client(), dsMo.Reference()), + vm.Datacenter}, + dsMo.Info.GetDatastoreInfo()}) } return dsObjList, nil } diff --git a/pkg/cloudprovider/providers/vsphere/vsphere.go b/pkg/cloudprovider/providers/vsphere/vsphere.go index 31d2b64ec0..77f80e2354 100644 --- a/pkg/cloudprovider/providers/vsphere/vsphere.go +++ b/pkg/cloudprovider/providers/vsphere/vsphere.go @@ -21,6 +21,7 @@ import ( "fmt" "io" "net" + "os" "path" "path/filepath" "runtime" @@ -34,6 +35,9 @@ import ( "golang.org/x/net/context" "k8s.io/api/core/v1" k8stypes "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/informers" + "k8s.io/client-go/tools/cache" v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper" "k8s.io/kubernetes/pkg/cloudprovider" "k8s.io/kubernetes/pkg/cloudprovider/providers/vsphere/vclib" @@ -47,7 +51,6 @@ const ( VolDir = "kubevols" RoundTripperDefaultCount = 3 DummyVMPrefixName = "vsphere-k8s" - VSANDatastoreType = "vsan" MacOuiVC = "00:50:56" MacOuiEsx = "00:0c:29" CleanUpDummyVMRoutineInterval = 5 @@ -58,25 +61,49 @@ const ( var cleanUpRoutineInitialized = false var datastoreFolderIDMap = make(map[string]map[string]string) -var clientLock sync.Mutex var cleanUpRoutineInitLock sync.Mutex var cleanUpDummyVMLock sync.RWMutex // VSphere is an implementation of cloud provider Interface for VSphere. 
type VSphere struct { - conn *vclib.VSphereConnection - cfg *VSphereConfig - // InstanceID of the server where this VSphere object is instantiated. - localInstanceID string + cfg *VSphereConfig + hostName string + // Maps the VSphere IP address to VSphereInstance + vsphereInstanceMap map[string]*VSphereInstance + // Responsible for managing discovery of k8s node, their location etc. + nodeManager *NodeManager } -// VSphereConfig information that is used by vSphere Cloud Provider to connect to VC +// Represents a vSphere instance where one or more kubernetes nodes are running. +type VSphereInstance struct { + conn *vclib.VSphereConnection + cfg *VirtualCenterConfig +} + +// Structure that represents Virtual Center configuration +type VirtualCenterConfig struct { + // vCenter username. + User string `gcfg:"user"` + // vCenter password in clear text. + Password string `gcfg:"password"` + // vCenter port. + VCenterPort string `gcfg:"port"` + // Datacenter in which VMs are located. + Datacenters string `gcfg:"datacenters"` + // Soap round tripper count (retries = RoundTripper - 1) + RoundTripperCount uint `gcfg:"soap-roundtrip-count"` +} + +// Structure that represents the content of vsphere.conf file. +// Users specify the configuration of one or more Virtual Centers in vsphere.conf where +// the Kubernetes master and worker nodes are running. type VSphereConfig struct { Global struct { // vCenter username. User string `gcfg:"user"` // vCenter password in clear text. Password string `gcfg:"password"` + // Deprecated. Use VirtualCenter to specify multiple vCenter Servers. // vCenter IP. VCenterIP string `gcfg:"server"` // vCenter port. @@ -84,23 +111,32 @@ type VSphereConfig struct { // True if vCenter uses self-signed cert. InsecureFlag bool `gcfg:"insecure-flag"` // Datacenter in which VMs are located. + // Deprecated. Use "datacenters" instead. Datacenter string `gcfg:"datacenter"` + // Datacenter in which VMs are located. + Datacenters string `gcfg:"datacenters"` // Datastore in which vmdks are stored. - Datastore string `gcfg:"datastore"` - // WorkingDir is path where VMs can be found. + // Deprecated. See Workspace.DefaultDatastore + DefaultDatastore string `gcfg:"datastore"` + // WorkingDir is path where VMs can be found. Also used to create dummy VMs. + // Deprecated. WorkingDir string `gcfg:"working-dir"` // Soap round tripper count (retries = RoundTripper - 1) RoundTripperCount uint `gcfg:"soap-roundtrip-count"` + // Deprecated as the virtual machines will be automatically discovered. // VMUUID is the VM Instance UUID of virtual machine which can be retrieved from instanceUuid // property in VmConfigInfo, or also set as vc.uuid in VMX file. // If not set, will be fetched from the machine via sysfs (requires root) VMUUID string `gcfg:"vm-uuid"` + // Deprecated as virtual machine will be automatically discovered. // VMName is the VM name of virtual machine // Combining the WorkingDir and VMName can form a unique InstanceID. // When vm-name is set, no username/password is required on worker nodes. VMName string `gcfg:"vm-name"` } + VirtualCenter map[string]*VirtualCenterConfig + Network struct { // PublicNetwork is name of the network the VMs are joined to. PublicNetwork string `gcfg:"public-network"` @@ -110,12 +146,21 @@ type VSphereConfig struct { // SCSIControllerType defines SCSI controller to be used. 
SCSIControllerType string `dcfg:"scsicontrollertype"` } + + // Endpoint used to create volumes + Workspace struct { + VCenterIP string `gcfg:"server"` + Datacenter string `gcfg:"datacenter"` + Folder string `gcfg:"folder"` + DefaultDatastore string `gcfg:"default-datastore"` + ResourcePoolPath string `gcfg:"resourcepool-path"` + } } type Volumes interface { // AttachDisk attaches given disk to given node. Current node // is used when nodeName is empty string. - AttachDisk(vmDiskPath string, storagePolicyID string, nodeName k8stypes.NodeName) (diskUUID string, err error) + AttachDisk(vmDiskPath string, storagePolicyName string, nodeName k8stypes.NodeName) (diskUUID string, err error) // DetachDisk detaches given disk to given node. Current node // is used when nodeName is empty string. @@ -152,19 +197,169 @@ func readConfig(config io.Reader) (VSphereConfig, error) { func init() { vclib.RegisterMetrics() cloudprovider.RegisterCloudProvider(ProviderName, func(config io.Reader) (cloudprovider.Interface, error) { + // If vSphere.conf file is not present then it is worker node. + if config == nil { + return newWorkerNode() + } cfg, err := readConfig(config) if err != nil { return nil, err } - return newVSphere(cfg) + return newControllerNode(cfg) }) } // Initialize passes a Kubernetes clientBuilder interface to the cloud provider -func (vs *VSphere) Initialize(clientBuilder controller.ControllerClientBuilder) {} +func (vs *VSphere) Initialize(clientBuilder controller.ControllerClientBuilder) { + if vs.cfg == nil { + return + } -func newVSphere(cfg VSphereConfig) (*VSphere, error) { + // Only on controller node it is required to register listeners. + // Register callbacks for node updates + client := clientBuilder.ClientOrDie("vSphere-cloud-provider") + factory := informers.NewSharedInformerFactory(client, 5*time.Minute) + nodeInformer := factory.Core().V1().Nodes() + nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: vs.NodeAdded, + DeleteFunc: vs.NodeDeleted, + }) + go nodeInformer.Informer().Run(wait.NeverStop) + glog.V(4).Infof("vSphere cloud provider initialized") +} + +// Creates new worker node interface and returns +func newWorkerNode() (*VSphere, error) { var err error + vs := VSphere{} + vs.hostName, err = os.Hostname() + if err != nil { + glog.Errorf("Failed to get hostname. err: %+v", err) + return nil, err + } + + return &vs, nil +} + +func populateVsphereInstanceMap(cfg *VSphereConfig) (map[string]*VSphereInstance, error) { + vsphereInstanceMap := make(map[string]*VSphereInstance) + + // Check if the vsphere.conf is in old format. In this + // format the cfg.VirtualCenter will be nil or empty. 
+ if cfg.VirtualCenter == nil || len(cfg.VirtualCenter) == 0 { + glog.V(4).Infof("Config is not per virtual center and is in old format.") + if cfg.Global.User == "" { + glog.Error("Global.User is empty!") + return nil, errors.New("Global.User is empty!") + } + if cfg.Global.Password == "" { + glog.Error("Global.Password is empty!") + return nil, errors.New("Global.Password is empty!") + } + if cfg.Global.WorkingDir == "" { + glog.Error("Global.WorkingDir is empty!") + return nil, errors.New("Global.WorkingDir is empty!") + } + if cfg.Global.VCenterIP == "" { + glog.Error("Global.VCenterIP is empty!") + return nil, errors.New("Global.VCenterIP is empty!") + } + if cfg.Global.Datacenter == "" { + glog.Error("Global.Datacenter is empty!") + return nil, errors.New("Global.Datacenter is empty!") + } + cfg.Workspace.VCenterIP = cfg.Global.VCenterIP + cfg.Workspace.Datacenter = cfg.Global.Datacenter + cfg.Workspace.Folder = cfg.Global.WorkingDir + cfg.Workspace.DefaultDatastore = cfg.Global.DefaultDatastore + + vcConfig := VirtualCenterConfig{ + User: cfg.Global.User, + Password: cfg.Global.Password, + VCenterPort: cfg.Global.VCenterPort, + Datacenters: cfg.Global.Datacenter, + RoundTripperCount: cfg.Global.RoundTripperCount, + } + + vSphereConn := vclib.VSphereConnection{ + Username: vcConfig.User, + Password: vcConfig.Password, + Hostname: cfg.Global.VCenterIP, + Insecure: cfg.Global.InsecureFlag, + RoundTripperCount: vcConfig.RoundTripperCount, + Port: vcConfig.VCenterPort, + } + vsphereIns := VSphereInstance{ + conn: &vSphereConn, + cfg: &vcConfig, + } + vsphereInstanceMap[cfg.Global.VCenterIP] = &vsphereIns + } else { + if cfg.Workspace.VCenterIP == "" || cfg.Workspace.Folder == "" || cfg.Workspace.Datacenter == "" { + msg := fmt.Sprintf("All fields in workspace are mandatory."+ + " vsphere.conf does not have the workspace specified correctly. cfg.Workspace: %+v", cfg.Workspace) + glog.Error(msg) + return nil, errors.New(msg) + } + for vcServer, vcConfig := range cfg.VirtualCenter { + glog.V(4).Infof("Initializing vc server %s", vcServer) + if vcServer == "" { + glog.Error("vsphere.conf does not have the VirtualCenter IP address specified") + return nil, errors.New("vsphere.conf does not have the VirtualCenter IP address specified") + } + if vcConfig.User == "" { + vcConfig.User = cfg.Global.User + } + if vcConfig.Password == "" { + vcConfig.Password = cfg.Global.Password + } + if vcConfig.User == "" { + msg := fmt.Sprintf("vcConfig.User is empty for vc %s!", vcServer) + glog.Error(msg) + return nil, errors.New(msg) + } + if vcConfig.Password == "" { + msg := fmt.Sprintf("vcConfig.Password is empty for vc %s!", vcServer) + glog.Error(msg) + return nil, errors.New(msg) + } + if vcConfig.VCenterPort == "" { + vcConfig.VCenterPort = cfg.Global.VCenterPort + } + if vcConfig.Datacenters == "" { + if cfg.Global.Datacenters != "" { + vcConfig.Datacenters = cfg.Global.Datacenters + } else { + // cfg.Global.Datacenter is deprecated, so giving it the last preference. 
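// (For reference, an illustrative multi-vCenter vsphere.conf that this branch parses --
//  every address, name, and credential below is made up -- looks roughly like:
//
//      [Global]
//      user = "admin"
//      password = "secret"
//      port = "443"
//      insecure-flag = true
//
//      [VirtualCenter "10.192.1.10"]
//      datacenters = "dc-east"
//
//      [VirtualCenter "10.192.2.10"]
//      datacenters = "dc-west"
//
//      [Workspace]
//      server = "10.192.1.10"
//      datacenter = "dc-east"
//      folder = "kubernetes"
//      default-datastore = "sharedVmfs-0"
//
//  Per-VC user, password, port and round-tripper count fall back to the [Global] values,
//  and when no datacenters are configured anywhere, the deprecated Global datacenter is
//  used as the last resort below.)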
+ vcConfig.Datacenters = cfg.Global.Datacenter + } + } + if vcConfig.RoundTripperCount == 0 { + vcConfig.RoundTripperCount = cfg.Global.RoundTripperCount + } + + vSphereConn := vclib.VSphereConnection{ + Username: vcConfig.User, + Password: vcConfig.Password, + Hostname: vcServer, + Insecure: cfg.Global.InsecureFlag, + RoundTripperCount: vcConfig.RoundTripperCount, + Port: vcConfig.VCenterPort, + } + vsphereIns := VSphereInstance{ + conn: &vSphereConn, + cfg: vcConfig, + } + vsphereInstanceMap[vcServer] = &vsphereIns + } + } + return vsphereInstanceMap, nil +} + +// Creates new Contreoller node interface and returns +func newControllerNode(cfg VSphereConfig) (*VSphere, error) { + var err error + if cfg.Disk.SCSIControllerType == "" { cfg.Disk.SCSIControllerType = vclib.PVSCSIControllerType } else if !vclib.CheckControllerSupported(cfg.Disk.SCSIControllerType) { @@ -188,56 +383,37 @@ func newVSphere(cfg VSphereConfig) (*VSphere, error) { return nil, err } } - vSphereConn := vclib.VSphereConnection{ - Username: cfg.Global.User, - Password: cfg.Global.Password, - Hostname: cfg.Global.VCenterIP, - Insecure: cfg.Global.InsecureFlag, - RoundTripperCount: cfg.Global.RoundTripperCount, - Port: cfg.Global.VCenterPort, + vsphereInstanceMap, err := populateVsphereInstanceMap(&cfg) + if err != nil { + return nil, err } - var instanceID string - if cfg.Global.VMName == "" { - // if VMName is not set in the cloud config file, each nodes (including worker nodes) need credentials to obtain VMName from vCenter - glog.V(4).Infof("Cannot find VMName from cloud config file, start obtaining it from vCenter") - // Create context - ctx, cancel := context.WithCancel(context.TODO()) - defer cancel() - err = vSphereConn.Connect(ctx) - if err != nil { - glog.Errorf("Failed to connect to vSphere") - return nil, err - } - dc, err := vclib.GetDatacenter(ctx, &vSphereConn, cfg.Global.Datacenter) - if err != nil { - return nil, err - } - vm, err := dc.GetVMByUUID(ctx, cfg.Global.VMUUID) - if err != nil { - return nil, err - } - vmName, err := vm.ObjectName(ctx) - if err != nil { - return nil, err - } - instanceID = vmName - } else { - instanceID = cfg.Global.VMName - } vs := VSphere{ - conn: &vSphereConn, - cfg: &cfg, - localInstanceID: instanceID, + vsphereInstanceMap: vsphereInstanceMap, + nodeManager: &NodeManager{ + vsphereInstanceMap: vsphereInstanceMap, + nodeInfoMap: make(map[string]*NodeInfo), + registeredNodes: make(map[string]*v1.Node), + }, + cfg: &cfg, + } + + vs.hostName, err = os.Hostname() + if err != nil { + glog.Errorf("Failed to get hostname. err: %+v", err) + return nil, err } runtime.SetFinalizer(&vs, logout) return &vs, nil } func logout(vs *VSphere) { - if vs.conn.GoVmomiClient != nil { - vs.conn.GoVmomiClient.Logout(context.TODO()) + for _, vsphereIns := range vs.vsphereInstanceMap { + if vsphereIns.conn.GoVmomiClient != nil { + vsphereIns.conn.GoVmomiClient.Logout(context.TODO()) + } } + } // Instances returns an implementation of Instances for vSphere. @@ -284,43 +460,74 @@ func getLocalIP() ([]v1.NodeAddress, error) { return addrs, nil } +func (vs *VSphere) getVSphereInstance(nodeName k8stypes.NodeName) (*VSphereInstance, error) { + vsphereIns, err := vs.nodeManager.GetVSphereInstance(nodeName) + if err != nil { + glog.Errorf("Cannot find node %q in cache. 
Node not found!!!", nodeName) + return nil, err + } + return &vsphereIns, nil +} + +func (vs *VSphere) getVSphereInstanceForServer(vcServer string, ctx context.Context) (*VSphereInstance, error) { + vsphereIns, ok := vs.vsphereInstanceMap[vcServer] + if !ok { + glog.Errorf("cannot find vcServer %q in cache. VC not found!!!", vcServer) + return nil, errors.New(fmt.Sprintf("Cannot find node %q in vsphere configuration map", vcServer)) + } + // Ensure client is logged in and session is valid + err := vsphereIns.conn.Connect(ctx) + if err != nil { + glog.Errorf("failed connecting to vcServer %q with error %+v", vcServer, err) + return nil, err + } + + return vsphereIns, nil +} + // Get the VM Managed Object instance by from the node -func (vs *VSphere) getVMByName(ctx context.Context, nodeName k8stypes.NodeName) (*vclib.VirtualMachine, error) { - dc, err := vclib.GetDatacenter(ctx, vs.conn, vs.cfg.Global.Datacenter) +func (vs *VSphere) getVMFromNodeName(ctx context.Context, nodeName k8stypes.NodeName) (*vclib.VirtualMachine, error) { + nodeInfo, err := vs.nodeManager.GetNodeInfo(nodeName) if err != nil { return nil, err } - vmPath := vs.cfg.Global.WorkingDir + "/" + nodeNameToVMName(nodeName) - vm, err := dc.GetVMByPath(ctx, vmPath) - if err != nil { - return nil, err - } - return vm, nil + return nodeInfo.vm, nil } // NodeAddresses is an implementation of Instances.NodeAddresses. func (vs *VSphere) NodeAddresses(nodeName k8stypes.NodeName) ([]v1.NodeAddress, error) { // Get local IP addresses if node is local node - if vs.localInstanceID == nodeNameToVMName(nodeName) { + if vs.hostName == convertToString(nodeName) { return getLocalIP() } + + if vs.cfg == nil { + return nil, cloudprovider.InstanceNotFound + } + + // Below logic can be executed only on master as VC details are present. addrs := []v1.NodeAddress{} // Create context ctx, cancel := context.WithCancel(context.Background()) defer cancel() - // Ensure client is logged in and session is valid - err := vs.conn.Connect(ctx) + vsi, err := vs.getVSphereInstance(nodeName) if err != nil { return nil, err } - vm, err := vs.getVMByName(ctx, nodeName) + // Ensure client is logged in and session is valid + err = vsi.conn.Connect(ctx) if err != nil { - glog.Errorf("Failed to get VM object for node: %q. err: +%v", nodeNameToVMName(nodeName), err) + return nil, err + } + + vm, err := vs.getVMFromNodeName(ctx, nodeName) + if err != nil { + glog.Errorf("Failed to get VM object for node: %q. err: +%v", convertToString(nodeName), err) return nil, err } vmMoList, err := vm.Datacenter.GetVMMoList(ctx, []*vclib.VirtualMachine{vm}, []string{"guest.net"}) if err != nil { - glog.Errorf("Failed to get VM Managed object with property guest.net for node: %q. err: +%v", nodeNameToVMName(nodeName), err) + glog.Errorf("Failed to get VM Managed object with property guest.net for node: %q. err: +%v", convertToString(nodeName), err) return nil, err } // retrieve VM's ip(s) @@ -348,8 +555,7 @@ func (vs *VSphere) NodeAddresses(nodeName k8stypes.NodeName) ([]v1.NodeAddress, // This method will not be called from the node that is requesting this ID. i.e. 
metadata service // and other local methods cannot be used here func (vs *VSphere) NodeAddressesByProviderID(providerID string) ([]v1.NodeAddress, error) { - vmName := path.Base(providerID) - return vs.NodeAddresses(vmNameToNodeName(vmName)) + return vs.NodeAddresses(convertToK8sType(providerID)) } // AddSSHKeyToAllInstances add SSH key to all instances @@ -359,16 +565,14 @@ func (vs *VSphere) AddSSHKeyToAllInstances(user string, keyData []byte) error { // CurrentNodeName gives the current node name func (vs *VSphere) CurrentNodeName(hostname string) (k8stypes.NodeName, error) { - return vmNameToNodeName(vs.localInstanceID), nil + return convertToK8sType(vs.hostName), nil } -// nodeNameToVMName maps a NodeName to the vmware infrastructure name -func nodeNameToVMName(nodeName k8stypes.NodeName) string { +func convertToString(nodeName k8stypes.NodeName) string { return string(nodeName) } -// nodeNameToVMName maps a vmware infrastructure name to a NodeName -func vmNameToNodeName(vmName string) k8stypes.NodeName { +func convertToK8sType(vmName string) k8stypes.NodeName { return k8stypes.NodeName(vmName) } @@ -380,68 +584,73 @@ func (vs *VSphere) ExternalID(nodeName k8stypes.NodeName) (string, error) { // InstanceExistsByProviderID returns true if the instance with the given provider id still exists and is running. // If false is returned with no error, the instance will be immediately deleted by the cloud controller manager. func (vs *VSphere) InstanceExistsByProviderID(providerID string) (bool, error) { - vmName := path.Base(providerID) - nodeName := vmNameToNodeName(vmName) - // Create context - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - // Ensure client is logged in and session is valid - err := vs.conn.Connect(ctx) - if err != nil { - return false, err - } - vm, err := vs.getVMByName(ctx, nodeName) - if err != nil { - if vclib.IsNotFound(err) { - return false, nil - } - glog.Errorf("Failed to get VM object for node: %q. err: +%v", nodeNameToVMName(nodeName), err) - return false, err + _, err := vs.InstanceID(convertToK8sType(providerID)) + if err == nil { + return true, nil } - isActive, err := vm.IsActive(ctx) - if err != nil { - glog.Errorf("Failed to check whether node %q is active. err: %+v.", nodeNameToVMName(nodeName), err) - return false, err - } - if !isActive { - return false, nil - } - - return true, nil + return false, err } // InstanceID returns the cloud provider ID of the node with the specified Name. func (vs *VSphere) InstanceID(nodeName k8stypes.NodeName) (string, error) { - if vs.localInstanceID == nodeNameToVMName(nodeName) { - return vs.cfg.Global.WorkingDir + "/" + vs.localInstanceID, nil - } - // Create context - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - // Ensure client is logged in and session is valid - err := vs.conn.Connect(ctx) - if err != nil { - return "", err - } - vm, err := vs.getVMByName(ctx, nodeName) - if err != nil { - if vclib.IsNotFound(err) { - return "", cloudprovider.InstanceNotFound + + instanceIDInternal := func() (string, error) { + if vs.hostName == convertToString(nodeName) { + return vs.hostName, nil } - glog.Errorf("Failed to get VM object for node: %q. err: +%v", nodeNameToVMName(nodeName), err) - return "", err + + // Below logic can be performed only on master node where VC details are preset. 
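// (vs.cfg is nil only when the provider was built by newWorkerNode, i.e. when no
//  vsphere.conf was supplied; such worker nodes can only answer for their own hostname,
//  since they have no vCenter credentials with which to look up other node VMs.)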
+ if vs.cfg == nil { + return "", fmt.Errorf("The current node can't detremine InstanceID for %q", convertToString(nodeName)) + } + + // Create context + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + vsi, err := vs.getVSphereInstance(nodeName) + if err != nil { + return "", err + } + // Ensure client is logged in and session is valid + err = vsi.conn.Connect(ctx) + if err != nil { + return "", err + } + vm, err := vs.getVMFromNodeName(ctx, nodeName) + if err != nil { + if err == vclib.ErrNoVMFound { + return "", cloudprovider.InstanceNotFound + } + glog.Errorf("Failed to get VM object for node: %q. err: +%v", convertToString(nodeName), err) + return "", err + } + isActive, err := vm.IsActive(ctx) + if err != nil { + glog.Errorf("Failed to check whether node %q is active. err: %+v.", convertToString(nodeName), err) + return "", err + } + if isActive { + return convertToString(nodeName), nil + } + glog.Warningf("The VM: %s is not in %s state", convertToString(nodeName), vclib.ActivePowerState) + return "", cloudprovider.InstanceNotFound } - isActive, err := vm.IsActive(ctx) + + instanceID, err := instanceIDInternal() if err != nil { - glog.Errorf("Failed to check whether node %q is active. err: %+v.", nodeNameToVMName(nodeName), err) - return "", err + isManagedObjectNotFoundError, err := vs.retry(nodeName, err) + if isManagedObjectNotFoundError { + if err == nil { + glog.V(4).Infof("InstanceID: Found node %q", convertToString(nodeName)) + instanceID, err = instanceIDInternal() + } else if err == vclib.ErrNoVMFound { + return "", cloudprovider.InstanceNotFound + } + } } - if isActive { - return "/" + vm.InventoryPath, nil - } - glog.Warningf("The VM: %s is not in %s state", nodeNameToVMName(nodeName), vclib.ActivePowerState) - return "", cloudprovider.InstanceNotFound + + return instanceID, err } // InstanceTypeByProviderID returns the cloudprovider instance type of the node with the specified unique providerID @@ -486,72 +695,111 @@ func (vs *VSphere) ScrubDNS(nameservers, searches []string) (nsOut, srchOut []st } // AttachDisk attaches given virtual disk volume to the compute running kubelet. -func (vs *VSphere) AttachDisk(vmDiskPath string, storagePolicyID string, nodeName k8stypes.NodeName) (diskUUID string, err error) { - attachDiskInternal := func(vmDiskPath string, storagePolicyID string, nodeName k8stypes.NodeName) (diskUUID string, err error) { +func (vs *VSphere) AttachDisk(vmDiskPath string, storagePolicyName string, nodeName k8stypes.NodeName) (diskUUID string, err error) { + attachDiskInternal := func(vmDiskPath string, storagePolicyName string, nodeName k8stypes.NodeName) (diskUUID string, err error) { if nodeName == "" { - nodeName = vmNameToNodeName(vs.localInstanceID) + nodeName = convertToK8sType(vs.hostName) } // Create context ctx, cancel := context.WithCancel(context.Background()) defer cancel() + vsi, err := vs.getVSphereInstance(nodeName) + if err != nil { + return "", err + } // Ensure client is logged in and session is valid - err = vs.conn.Connect(ctx) + err = vsi.conn.Connect(ctx) if err != nil { return "", err } - vm, err := vs.getVMByName(ctx, nodeName) + + vm, err := vs.getVMFromNodeName(ctx, nodeName) if err != nil { - glog.Errorf("Failed to get VM object for node: %q. err: +%v", nodeNameToVMName(nodeName), err) + glog.Errorf("Failed to get VM object for node: %q. 
err: +%v", convertToString(nodeName), err) return "", err } - diskUUID, err = vm.AttachDisk(ctx, vmDiskPath, &vclib.VolumeOptions{SCSIControllerType: vclib.PVSCSIControllerType, StoragePolicyID: storagePolicyID}) + + diskUUID, err = vm.AttachDisk(ctx, vmDiskPath, &vclib.VolumeOptions{SCSIControllerType: vclib.PVSCSIControllerType, StoragePolicyName: storagePolicyName}) if err != nil { - glog.Errorf("Failed to attach disk: %s for node: %s. err: +%v", vmDiskPath, nodeNameToVMName(nodeName), err) + glog.Errorf("Failed to attach disk: %s for node: %s. err: +%v", vmDiskPath, convertToString(nodeName), err) return "", err } return diskUUID, nil } requestTime := time.Now() - diskUUID, err = attachDiskInternal(vmDiskPath, storagePolicyID, nodeName) + diskUUID, err = attachDiskInternal(vmDiskPath, storagePolicyName, nodeName) + if err != nil { + isManagedObjectNotFoundError, err := vs.retry(nodeName, err) + if isManagedObjectNotFoundError { + if err == nil { + glog.V(4).Infof("AttachDisk: Found node %q", convertToString(nodeName)) + diskUUID, err = attachDiskInternal(vmDiskPath, storagePolicyName, nodeName) + } + } + } vclib.RecordvSphereMetric(vclib.OperationAttachVolume, requestTime, err) return diskUUID, err } +func (vs *VSphere) retry(nodeName k8stypes.NodeName, err error) (bool, error) { + isManagedObjectNotFoundError := false + if err != nil { + if vclib.IsManagedObjectNotFoundError(err) { + isManagedObjectNotFoundError = true + glog.V(4).Infof("error %q ManagedObjectNotFound for node %q", err, convertToString(nodeName)) + err = vs.nodeManager.RediscoverNode(nodeName) + } + } + return isManagedObjectNotFoundError, err +} + // DetachDisk detaches given virtual disk volume from the compute running kubelet. func (vs *VSphere) DetachDisk(volPath string, nodeName k8stypes.NodeName) error { detachDiskInternal := func(volPath string, nodeName k8stypes.NodeName) error { if nodeName == "" { - nodeName = vmNameToNodeName(vs.localInstanceID) + nodeName = convertToK8sType(vs.hostName) } // Create context ctx, cancel := context.WithCancel(context.Background()) defer cancel() - // Ensure client is logged in and session is valid - err := vs.conn.Connect(ctx) + vsi, err := vs.getVSphereInstance(nodeName) if err != nil { return err } - vm, err := vs.getVMByName(ctx, nodeName) + // Ensure client is logged in and session is valid + err = vsi.conn.Connect(ctx) + if err != nil { + return err + } + vm, err := vs.getVMFromNodeName(ctx, nodeName) if err != nil { // If node doesn't exist, disk is already detached from node. - if vclib.IsNotFound(err) { - glog.Infof("Node %q does not exist, disk %s is already detached from node.", nodeNameToVMName(nodeName), volPath) + if err == vclib.ErrNoVMFound { + glog.Infof("Node %q does not exist, disk %s is already detached from node.", convertToString(nodeName), volPath) return nil } - glog.Errorf("Failed to get VM object for node: %q. err: +%v", nodeNameToVMName(nodeName), err) + glog.Errorf("Failed to get VM object for node: %q. err: +%v", convertToString(nodeName), err) return err } err = vm.DetachDisk(ctx, volPath) if err != nil { - glog.Errorf("Failed to detach disk: %s for node: %s. err: +%v", volPath, nodeNameToVMName(nodeName), err) + glog.Errorf("Failed to detach disk: %s for node: %s. 
err: +%v", volPath, convertToString(nodeName), err) return err } return nil } requestTime := time.Now() err := detachDiskInternal(volPath, nodeName) - vclib.RecordvSphereMetric(vclib.OperationDetachVolume, requestTime, nil) + if err != nil { + isManagedObjectNotFoundError, err := vs.retry(nodeName, err) + if isManagedObjectNotFoundError { + if err == nil { + err = detachDiskInternal(volPath, nodeName) + } + } + } + vclib.RecordvSphereMetric(vclib.OperationDetachVolume, requestTime, err) return err } @@ -560,22 +808,26 @@ func (vs *VSphere) DiskIsAttached(volPath string, nodeName k8stypes.NodeName) (b diskIsAttachedInternal := func(volPath string, nodeName k8stypes.NodeName) (bool, error) { var vSphereInstance string if nodeName == "" { - vSphereInstance = vs.localInstanceID - nodeName = vmNameToNodeName(vSphereInstance) + vSphereInstance = vs.hostName + nodeName = convertToK8sType(vSphereInstance) } else { - vSphereInstance = nodeNameToVMName(nodeName) + vSphereInstance = convertToString(nodeName) } // Create context ctx, cancel := context.WithCancel(context.Background()) defer cancel() - // Ensure client is logged in and session is valid - err := vs.conn.Connect(ctx) + vsi, err := vs.getVSphereInstance(nodeName) if err != nil { return false, err } - vm, err := vs.getVMByName(ctx, nodeName) + // Ensure client is logged in and session is valid + err = vsi.conn.Connect(ctx) if err != nil { - if vclib.IsNotFound(err) { + return false, err + } + vm, err := vs.getVMFromNodeName(ctx, nodeName) + if err != nil { + if err == vclib.ErrNoVMFound { glog.Warningf("Node %q does not exist, vsphere CP will assume disk %v is not attached to it.", nodeName, volPath) // make the disk as detached and return false without error. return false, nil @@ -583,7 +835,7 @@ func (vs *VSphere) DiskIsAttached(volPath string, nodeName k8stypes.NodeName) (b glog.Errorf("Failed to get VM object for node: %q. err: +%v", vSphereInstance, err) return false, err } - volPath = vclib.RemoveClusterFromVDiskPath(volPath) + volPath = vclib.RemoveStorageClusterORFolderNameFromVDiskPath(volPath) attached, err := vm.IsDiskAttached(ctx, volPath) if err != nil { glog.Errorf("DiskIsAttached failed to determine whether disk %q is still attached on node %q", @@ -594,57 +846,144 @@ func (vs *VSphere) DiskIsAttached(volPath string, nodeName k8stypes.NodeName) (b } requestTime := time.Now() isAttached, err := diskIsAttachedInternal(volPath, nodeName) + if err != nil { + isManagedObjectNotFoundError, err := vs.retry(nodeName, err) + if isManagedObjectNotFoundError { + if err == vclib.ErrNoVMFound { + isAttached, err = false, nil + } else if err == nil { + isAttached, err = diskIsAttachedInternal(volPath, nodeName) + } + } + } vclib.RecordvSphereMetric(vclib.OperationDiskIsAttached, requestTime, err) return isAttached, err } // DisksAreAttached returns if disks are attached to the VM using controllers supported by the plugin. +// 1. Converts volPaths into canonical form so that it can be compared with the VM device path. +// 2. Segregates nodes by vCenter and Datacenter they are present in. This reduces calls to VC. +// 3. Creates go routines per VC-DC to find whether disks are attached to the nodes. +// 4. If the some of the VMs are not found or migrated then they are added to a list. +// 5. After successful execution of goroutines, +// 5a. If there are any VMs which needs to be retried, they are rediscovered and the whole operation is initiated again for only rediscovered VMs. +// 5b. 
If VMs are removed from vSphere inventory they are ignored. func (vs *VSphere) DisksAreAttached(nodeVolumes map[k8stypes.NodeName][]string) (map[k8stypes.NodeName]map[string]bool, error) { disksAreAttachedInternal := func(nodeVolumes map[k8stypes.NodeName][]string) (map[k8stypes.NodeName]map[string]bool, error) { - attached := make(map[k8stypes.NodeName]map[string]bool) - if len(nodeVolumes) == 0 { - return attached, nil + + // disksAreAttach checks whether disks are attached to the nodes. + // Returns nodes that need to be retried if retry is true + // Segregates nodes per VC and DC + // Creates go routines per VC-DC to find whether disks are attached to the nodes. + disksAreAttach := func(ctx context.Context, nodeVolumes map[k8stypes.NodeName][]string, attached map[string]map[string]bool, retry bool) ([]k8stypes.NodeName, error) { + + var wg sync.WaitGroup + var localAttachedMaps []map[string]map[string]bool + var nodesToRetry []k8stypes.NodeName + var globalErr error + globalErr = nil + globalErrMutex := &sync.Mutex{} + nodesToRetryMutex := &sync.Mutex{} + + // Segregate nodes according to VC-DC + dcNodes := make(map[string][]k8stypes.NodeName) + for nodeName := range nodeVolumes { + nodeInfo, err := vs.nodeManager.GetNodeInfo(nodeName) + if err != nil { + glog.Errorf("Failed to get node info: %+v. err: %+v", nodeInfo.vm, err) + return nodesToRetry, err + } + VC_DC := nodeInfo.vcServer + nodeInfo.dataCenter.String() + dcNodes[VC_DC] = append(dcNodes[VC_DC], nodeName) + } + + for _, nodes := range dcNodes { + localAttachedMap := make(map[string]map[string]bool) + localAttachedMaps = append(localAttachedMaps, localAttachedMap) + // Start go routines per VC-DC to check disks are attached + go func() { + nodesToRetryLocal, err := vs.checkDiskAttached(ctx, nodes, nodeVolumes, localAttachedMap, retry) + if err != nil { + if !vclib.IsManagedObjectNotFoundError(err) { + globalErrMutex.Lock() + globalErr = err + globalErrMutex.Unlock() + glog.Errorf("Failed to check disk attached for nodes: %+v. err: %+v", nodes, err) + } + } + nodesToRetryMutex.Lock() + nodesToRetry = append(nodesToRetry, nodesToRetryLocal...) + nodesToRetryMutex.Unlock() + wg.Done() + }() + wg.Add(1) + } + wg.Wait() + if globalErr != nil { + return nodesToRetry, globalErr + } + for _, localAttachedMap := range localAttachedMaps { + for key, value := range localAttachedMap { + attached[key] = value + } + } + + return nodesToRetry, nil } + + glog.V(4).Info("Starting DisksAreAttached API for vSphere with nodeVolumes: %+v", nodeVolumes) // Create context ctx, cancel := context.WithCancel(context.Background()) defer cancel() - // Ensure client is logged in and session is valid - err := vs.conn.Connect(ctx) + + disksAttached := make(map[k8stypes.NodeName]map[string]bool) + if len(nodeVolumes) == 0 { + return disksAttached, nil + } + + // Convert VolPaths into canonical form so that it can be compared with the VM device path. + vmVolumes, err := vs.convertVolPathsToDevicePaths(ctx, nodeVolumes) if err != nil { + glog.Errorf("Failed to convert volPaths to devicePaths: %+v. 
err: %+v", nodeVolumes, err) return nil, err } - dc, err := vclib.GetDatacenter(ctx, vs.conn, vs.cfg.Global.Datacenter) + attached := make(map[string]map[string]bool) + nodesToRetry, err := disksAreAttach(ctx, vmVolumes, attached, false) if err != nil { return nil, err } - vmVolumes := make(map[string][]string) - for nodeName, volPaths := range nodeVolumes { - for i, volPath := range volPaths { - volPath = vclib.RemoveClusterFromVDiskPath(volPath) - // Get the canonical volume path for volPath. - canonicalVolumePath, err := getcanonicalVolumePath(ctx, dc, volPath) + if len(nodesToRetry) != 0 { + // Rediscover nodes which are need to be retried + remainingNodesVolumes := make(map[k8stypes.NodeName][]string) + for _, nodeName := range nodesToRetry { + err = vs.nodeManager.RediscoverNode(nodeName) if err != nil { - glog.Errorf("Failed to get canonical vsphere volume path for volume: %s. err: %+v", volPath, err) + if err == vclib.ErrNoVMFound { + glog.V(4).Infof("node %s not found. err: %+v", nodeName, err) + continue + } + glog.Errorf("Failed to rediscover node %s. err: %+v", nodeName, err) return nil, err } - // Check if the volume path contains .vmdk extension. If not, add the extension and update the nodeVolumes Map - if len(canonicalVolumePath) > 0 && filepath.Ext(canonicalVolumePath) != ".vmdk" { - canonicalVolumePath += ".vmdk" - } - volPaths[i] = canonicalVolumePath + remainingNodesVolumes[nodeName] = nodeVolumes[nodeName] + } + + // If some remaining nodes are still registered + if len(remainingNodesVolumes) != 0 { + nodesToRetry, err = disksAreAttach(ctx, remainingNodesVolumes, attached, true) + if err != nil || len(nodesToRetry) != 0 { + glog.Errorf("Failed to retry disksAreAttach for nodes %+v. err: %+v", remainingNodesVolumes, err) + return nil, err + } + } + + for nodeName, volPaths := range attached { + disksAttached[convertToK8sType(nodeName)] = volPaths } - vmVolumes[nodeNameToVMName(nodeName)] = volPaths } - // Check if the disks are attached to their respective nodes - disksAttachedList, err := dc.CheckDisksAttached(ctx, vmVolumes) - if err != nil { - return nil, err - } - for vmName, volPaths := range disksAttachedList { - attached[vmNameToNodeName(vmName)] = volPaths - } - return attached, nil + glog.V(4).Infof("DisksAreAttach successfully executed. result: %+v", attached) + return disksAttached, nil } requestTime := time.Now() attached, err := disksAreAttachedInternal(nodeVolumes) @@ -660,9 +999,9 @@ func (vs *VSphere) CreateVolume(volumeOptions *vclib.VolumeOptions) (canonicalVo glog.V(1).Infof("Starting to create a vSphere volume with volumeOptions: %+v", volumeOptions) createVolumeInternal := func(volumeOptions *vclib.VolumeOptions) (canonicalVolumePath string, err error) { var datastore string - // Default datastore is the datastore in the vSphere config file that is used to initialize vSphere cloud provider. 
+ // If datastore not specified, then use default datastore if volumeOptions.Datastore == "" { - datastore = vs.cfg.Global.Datastore + datastore = vs.cfg.Workspace.DefaultDatastore } else { datastore = volumeOptions.Datastore } @@ -670,12 +1009,11 @@ func (vs *VSphere) CreateVolume(volumeOptions *vclib.VolumeOptions) (canonicalVo // Create context ctx, cancel := context.WithCancel(context.Background()) defer cancel() - // Ensure client is logged in and session is valid - err = vs.conn.Connect(ctx) + vsi, err := vs.getVSphereInstanceForServer(vs.cfg.Workspace.VCenterIP, ctx) if err != nil { return "", err } - dc, err := vclib.GetDatacenter(ctx, vs.conn, vs.cfg.Global.Datacenter) + dc, err := vclib.GetDatacenter(ctx, vsi.conn, vs.cfg.Workspace.Datacenter) if err != nil { return "", err } @@ -693,18 +1031,37 @@ func (vs *VSphere) CreateVolume(volumeOptions *vclib.VolumeOptions) (canonicalVo cleanUpRoutineInitialized = true } cleanUpRoutineInitLock.Unlock() - vmOptions, err = vs.setVMOptions(ctx, dc) + vmOptions, err = vs.setVMOptions(ctx, dc, vs.cfg.Workspace.ResourcePoolPath) if err != nil { glog.Errorf("Failed to set VM options requires to create a vsphere volume. err: %+v", err) return "", err } } if volumeOptions.StoragePolicyName != "" && volumeOptions.Datastore == "" { - datastore, err = getPbmCompatibleDatastore(ctx, dc.Client(), volumeOptions.StoragePolicyName, vmOptions.VMFolder) + datastore, err = getPbmCompatibleDatastore(ctx, dc, volumeOptions.StoragePolicyName, vs.nodeManager) if err != nil { glog.Errorf("Failed to get pbm compatible datastore with storagePolicy: %s. err: %+v", volumeOptions.StoragePolicyName, err) return "", err } + } else { + // Since no storage policy is specified but datastore is specified, check + // if the given datastore is a shared datastore across all node VMs. 
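// (A datastore reachable from only a subset of nodes would yield a volume that cannot be
//  attached wherever the pod is later scheduled, so volume creation fails early instead.)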
+ sharedDsList, err := getSharedDatastoresInK8SCluster(ctx, dc, vs.nodeManager) + if err != nil { + glog.Errorf("Failed to get shared datastore: %+v", err) + return "", err + } + found := false + for _, sharedDs := range sharedDsList { + if datastore == sharedDs.Info.Name { + found = true + break + } + } + if !found { + msg := fmt.Sprintf("The specified datastore %s is not a shared datastore across node VMs", datastore) + return "", errors.New(msg) + } } ds, err := dc.GetDatastoreByName(ctx, datastore) if err != nil { @@ -743,7 +1100,7 @@ func (vs *VSphere) CreateVolume(volumeOptions *vclib.VolumeOptions) (canonicalVo requestTime := time.Now() canonicalVolumePath, err = createVolumeInternal(volumeOptions) vclib.RecordCreateVolumeMetric(volumeOptions, requestTime, err) - glog.V(1).Infof("The canonical volume path for the newly created vSphere volume is %q", canonicalVolumePath) + glog.V(4).Infof("The canonical volume path for the newly created vSphere volume is %q", canonicalVolumePath) return canonicalVolumePath, err } @@ -754,16 +1111,11 @@ func (vs *VSphere) DeleteVolume(vmDiskPath string) error { // Create context ctx, cancel := context.WithCancel(context.Background()) defer cancel() - // Ensure client is logged in and session is valid - err := vs.conn.Connect(ctx) + vsi, err := vs.getVSphereInstanceForServer(vs.cfg.Workspace.VCenterIP, ctx) if err != nil { return err } - dc, err := vclib.GetDatacenter(ctx, vs.conn, vs.cfg.Global.Datacenter) - if err != nil { - return err - } - ds, err := dc.GetDatastoreByName(ctx, vs.cfg.Global.Datastore) + dc, err := vclib.GetDatacenter(ctx, vsi.conn, vs.cfg.Workspace.Datacenter) if err != nil { return err } @@ -772,7 +1124,7 @@ func (vs *VSphere) DeleteVolume(vmDiskPath string) error { VolumeOptions: &vclib.VolumeOptions{}, VMOptions: &vclib.VMOptions{}, } - err = disk.Delete(ctx, ds) + err = disk.Delete(ctx, dc) if err != nil { glog.Errorf("Failed to delete vsphere volume with vmDiskPath: %s. err: %+v", vmDiskPath, err) } @@ -788,3 +1140,27 @@ func (vs *VSphere) DeleteVolume(vmDiskPath string) error { func (vs *VSphere) HasClusterID() bool { return true } + +// Notification handler when node is added into k8s cluster. +func (vs *VSphere) NodeAdded(obj interface{}) { + node, ok := obj.(*v1.Node) + if node == nil || !ok { + glog.Warningf("NodeAdded: unrecognized object %+v", obj) + return + } + + glog.V(4).Infof("Node added: %+v", node) + vs.nodeManager.RegisterNode(node) +} + +// Notification handler when node is removed from k8s cluster. 
+func (vs *VSphere) NodeDeleted(obj interface{}) { + node, ok := obj.(*v1.Node) + if node == nil || !ok { + glog.Warningf("NodeDeleted: unrecognized object %+v", obj) + return + } + + glog.V(4).Infof("Node deleted: %+v", node) + vs.nodeManager.UnRegisterNode(node) +} diff --git a/pkg/cloudprovider/providers/vsphere/vsphere_test.go b/pkg/cloudprovider/providers/vsphere/vsphere_test.go index b8b54e99ae..b49224124e 100644 --- a/pkg/cloudprovider/providers/vsphere/vsphere_test.go +++ b/pkg/cloudprovider/providers/vsphere/vsphere_test.go @@ -39,7 +39,7 @@ func configFromEnv() (cfg VSphereConfig, ok bool) { cfg.Global.Password = os.Getenv("VSPHERE_PASSWORD") cfg.Global.Datacenter = os.Getenv("VSPHERE_DATACENTER") cfg.Network.PublicNetwork = os.Getenv("VSPHERE_PUBLIC_NETWORK") - cfg.Global.Datastore = os.Getenv("VSPHERE_DATASTORE") + cfg.Global.DefaultDatastore = os.Getenv("VSPHERE_DATASTORE") cfg.Disk.SCSIControllerType = os.Getenv("VSPHERE_SCSICONTROLLER_TYPE") cfg.Global.WorkingDir = os.Getenv("VSPHERE_WORKING_DIR") cfg.Global.VMName = os.Getenv("VSPHERE_VM_NAME") @@ -103,7 +103,7 @@ func TestNewVSphere(t *testing.T) { t.Skipf("No config found in environment") } - _, err := newVSphere(cfg) + _, err := newControllerNode(cfg) if err != nil { t.Fatalf("Failed to construct/authenticate vSphere: %s", err) } @@ -116,7 +116,7 @@ func TestVSphereLogin(t *testing.T) { } // Create vSphere configuration object - vs, err := newVSphere(cfg) + vs, err := newControllerNode(cfg) if err != nil { t.Fatalf("Failed to construct/authenticate vSphere: %s", err) } @@ -126,11 +126,16 @@ func TestVSphereLogin(t *testing.T) { defer cancel() // Create vSphere client - err = vs.conn.Connect(ctx) + var vcInstance *VSphereInstance + if vcInstance, ok = vs.vsphereInstanceMap[cfg.Global.VCenterIP]; !ok { + t.Fatalf("Couldn't get vSphere instance: %s", cfg.Global.VCenterIP) + } + + err = vcInstance.conn.Connect(ctx) if err != nil { t.Errorf("Failed to connect to vSphere: %s", err) } - defer vs.conn.GoVmomiClient.Logout(ctx) + defer vcInstance.conn.GoVmomiClient.Logout(ctx) } func TestZones(t *testing.T) { @@ -154,7 +159,7 @@ func TestInstances(t *testing.T) { t.Skipf("No config found in environment") } - vs, err := newVSphere(cfg) + vs, err := newControllerNode(cfg) if err != nil { t.Fatalf("Failed to construct/authenticate vSphere: %s", err) } @@ -213,7 +218,7 @@ func TestVolumes(t *testing.T) { t.Skipf("No config found in environment") } - vs, err := newVSphere(cfg) + vs, err := newControllerNode(cfg) if err != nil { t.Fatalf("Failed to construct/authenticate vSphere: %s", err) } diff --git a/pkg/cloudprovider/providers/vsphere/vsphere_util.go b/pkg/cloudprovider/providers/vsphere/vsphere_util.go index 3fbe2b621f..efedb06213 100644 --- a/pkg/cloudprovider/providers/vsphere/vsphere_util.go +++ b/pkg/cloudprovider/providers/vsphere/vsphere_util.go @@ -28,14 +28,16 @@ import ( "github.com/golang/glog" "github.com/vmware/govmomi" - "github.com/vmware/govmomi/object" "github.com/vmware/govmomi/vim25" - "github.com/vmware/govmomi/vim25/mo" "fmt" + "github.com/vmware/govmomi/vim25/mo" + "k8s.io/api/core/v1" + k8stypes "k8s.io/apimachinery/pkg/types" "k8s.io/kubernetes/pkg/cloudprovider/providers/vsphere/vclib" "k8s.io/kubernetes/pkg/cloudprovider/providers/vsphere/vclib/diskmanagers" + "path/filepath" ) const ( @@ -55,10 +57,28 @@ func GetVSphere() (*VSphere, error) { return nil, err } vSphereConn.GoVmomiClient = client + vsphereIns := &VSphereInstance{ + conn: vSphereConn, + cfg: &VirtualCenterConfig{ + User: cfg.Global.User, + 
Password: cfg.Global.Password, + VCenterPort: cfg.Global.VCenterPort, + Datacenters: cfg.Global.Datacenters, + RoundTripperCount: cfg.Global.RoundTripperCount, + }, + } + vsphereInsMap := make(map[string]*VSphereInstance) + vsphereInsMap[cfg.Global.VCenterIP] = vsphereIns + // TODO: Initialize nodeManager and set it in VSphere. vs := &VSphere{ - conn: vSphereConn, - cfg: cfg, - localInstanceID: "", + vsphereInstanceMap: vsphereInsMap, + hostName: "", + cfg: cfg, + nodeManager: &NodeManager{ + vsphereInstanceMap: vsphereInsMap, + nodeInfoMap: make(map[string]*NodeInfo), + registeredNodes: make(map[string]*v1.Node), + }, } runtime.SetFinalizer(vs, logout) return vs, nil @@ -70,14 +90,18 @@ func getVSphereConfig() *VSphereConfig { cfg.Global.VCenterPort = os.Getenv("VSPHERE_VCENTER_PORT") cfg.Global.User = os.Getenv("VSPHERE_USER") cfg.Global.Password = os.Getenv("VSPHERE_PASSWORD") - cfg.Global.Datacenter = os.Getenv("VSPHERE_DATACENTER") - cfg.Global.Datastore = os.Getenv("VSPHERE_DATASTORE") + cfg.Global.Datacenters = os.Getenv("VSPHERE_DATACENTER") + cfg.Global.DefaultDatastore = os.Getenv("VSPHERE_DATASTORE") cfg.Global.WorkingDir = os.Getenv("VSPHERE_WORKING_DIR") cfg.Global.VMName = os.Getenv("VSPHERE_VM_NAME") cfg.Global.InsecureFlag = false if strings.ToLower(os.Getenv("VSPHERE_INSECURE")) == "true" { cfg.Global.InsecureFlag = true } + cfg.Workspace.VCenterIP = cfg.Global.VCenterIP + cfg.Workspace.Datacenter = cfg.Global.Datacenters + cfg.Workspace.DefaultDatastore = cfg.Global.DefaultDatastore + cfg.Workspace.Folder = cfg.Global.WorkingDir return &cfg } @@ -127,49 +151,83 @@ func getvmUUID() (string, error) { return uuid, nil } -// Get all datastores accessible for the virtual machine object. -func getSharedDatastoresInK8SCluster(ctx context.Context, folder *vclib.Folder) ([]*vclib.Datastore, error) { - vmList, err := folder.GetVirtualMachines(ctx) +// Returns the accessible datastores for the given node VM. +func getAccessibleDatastores(ctx context.Context, nodeVmDetail *NodeDetails, nodeManager *NodeManager) ([]*vclib.DatastoreInfo, error) { + accessibleDatastores, err := nodeVmDetail.vm.GetAllAccessibleDatastores(ctx) if err != nil { - glog.Errorf("Failed to get virtual machines in the kubernetes cluster: %s, err: %+v", folder.InventoryPath, err) - return nil, err + // Check if the node VM is not found which indicates that the node info in the node manager is stale. + // If so, rediscover the node and retry. + if vclib.IsManagedObjectNotFoundError(err) { + glog.V(4).Infof("error %q ManagedObjectNotFound for node %q. 
Rediscovering...", err, nodeVmDetail.NodeName) + err = nodeManager.RediscoverNode(convertToK8sType(nodeVmDetail.NodeName)) + if err == nil { + glog.V(4).Infof("Discovered node %s successfully", nodeVmDetail.NodeName) + nodeInfo, err := nodeManager.GetNodeInfo(convertToK8sType(nodeVmDetail.NodeName)) + if err != nil { + glog.V(4).Infof("error %q getting node info for node %+v", err, nodeVmDetail) + return nil, err + } + + accessibleDatastores, err = nodeInfo.vm.GetAllAccessibleDatastores(ctx) + if err != nil { + glog.V(4).Infof("error %q getting accessible datastores for node %+v", err, nodeVmDetail) + return nil, err + } + } else { + glog.V(4).Infof("error %q rediscovering node %+v", err, nodeVmDetail) + return nil, err + } + } else { + glog.V(4).Infof("error %q getting accessible datastores for node %+v", err, nodeVmDetail) + return nil, err + } } - if vmList == nil || len(vmList) == 0 { - glog.Errorf("No virtual machines found in the kubernetes cluster: %s", folder.InventoryPath) - return nil, fmt.Errorf("No virtual machines found in the kubernetes cluster: %s", folder.InventoryPath) + return accessibleDatastores, nil +} + +// Get all datastores accessible for the virtual machine object. +func getSharedDatastoresInK8SCluster(ctx context.Context, dc *vclib.Datacenter, nodeManager *NodeManager) ([]*vclib.DatastoreInfo, error) { + nodeVmDetails := nodeManager.GetNodeDetails() + if nodeVmDetails == nil || len(nodeVmDetails) == 0 { + msg := fmt.Sprintf("Kubernetes node nodeVmDetail details is empty. nodeVmDetails : %+v", nodeVmDetails) + glog.Error(msg) + return nil, fmt.Errorf(msg) } - index := 0 - var sharedDatastores []*vclib.Datastore - for _, vm := range vmList { - vmName, err := vm.ObjectName(ctx) + var sharedDatastores []*vclib.DatastoreInfo + for index, nodeVmDetail := range nodeVmDetails { + glog.V(9).Infof("Getting accessible datastores for node %s", nodeVmDetail.NodeName) + accessibleDatastores, err := getAccessibleDatastores(ctx, &nodeVmDetail, nodeManager) if err != nil { return nil, err } - if !strings.HasPrefix(vmName, DummyVMPrefixName) { - accessibleDatastores, err := vm.GetAllAccessibleDatastores(ctx) - if err != nil { - return nil, err + if index == 0 { + sharedDatastores = accessibleDatastores + } else { + sharedDatastores = intersect(sharedDatastores, accessibleDatastores) + if len(sharedDatastores) == 0 { + return nil, fmt.Errorf("No shared datastores found in the Kubernetes cluster for nodeVmDetails: %+v", nodeVmDetails) } - if index == 0 { - sharedDatastores = accessibleDatastores - } else { - sharedDatastores = intersect(sharedDatastores, accessibleDatastores) - if len(sharedDatastores) == 0 { - return nil, fmt.Errorf("No shared datastores found in the Kubernetes cluster: %s", folder.InventoryPath) - } - } - index++ } } + glog.V(9).Infof("sharedDatastores : %+v", sharedDatastores) + sharedDatastores, err := getDatastoresForEndpointVC(ctx, dc, sharedDatastores) + if err != nil { + glog.Errorf("Failed to get shared datastores from endpoint VC. 
err: %+v", err) + return nil, err + } + glog.V(9).Infof("sharedDatastores at endpoint VC: %+v", sharedDatastores) return sharedDatastores, nil } -func intersect(list1 []*vclib.Datastore, list2 []*vclib.Datastore) []*vclib.Datastore { - var sharedDs []*vclib.Datastore +func intersect(list1 []*vclib.DatastoreInfo, list2 []*vclib.DatastoreInfo) []*vclib.DatastoreInfo { + glog.V(9).Infof("list1: %+v", list1) + glog.V(9).Infof("list2: %+v", list2) + var sharedDs []*vclib.DatastoreInfo for _, val1 := range list1 { // Check if val1 is found in list2 for _, val2 := range list2 { - if val1.Reference().Value == val2.Reference().Value { + // Intersection is performed based on the datastoreUrl as this uniquely identifies the datastore. + if val1.Info.Url == val2.Info.Url { sharedDs = append(sharedDs, val1) break } @@ -178,46 +236,42 @@ func intersect(list1 []*vclib.Datastore, list2 []*vclib.Datastore) []*vclib.Data return sharedDs } -// Get the datastores accessible for the virtual machine object. -func getAllAccessibleDatastores(ctx context.Context, client *vim25.Client, vmMo mo.VirtualMachine) ([]string, error) { - host := vmMo.Summary.Runtime.Host - if host == nil { - return nil, errors.New("VM doesn't have a HostSystem") - } - var hostSystemMo mo.HostSystem - s := object.NewSearchIndex(client) - err := s.Properties(ctx, host.Reference(), []string{DatastoreProperty}, &hostSystemMo) - if err != nil { - return nil, err - } - var dsRefValues []string - for _, dsRef := range hostSystemMo.Datastore { - dsRefValues = append(dsRefValues, dsRef.Value) - } - return dsRefValues, nil -} - // getMostFreeDatastore gets the best fit compatible datastore by free space. -func getMostFreeDatastoreName(ctx context.Context, client *vim25.Client, dsObjList []*vclib.Datastore) (string, error) { - dsMoList, err := dsObjList[0].Datacenter.GetDatastoreMoList(ctx, dsObjList, []string{DatastoreInfoProperty}) - if err != nil { - return "", err - } +func getMostFreeDatastoreName(ctx context.Context, client *vim25.Client, dsInfoList []*vclib.DatastoreInfo) (string, error) { var curMax int64 curMax = -1 var index int - for i, dsMo := range dsMoList { - dsFreeSpace := dsMo.Info.GetDatastoreInfo().FreeSpace + for i, dsInfo := range dsInfoList { + dsFreeSpace := dsInfo.Info.GetDatastoreInfo().FreeSpace if dsFreeSpace > curMax { curMax = dsFreeSpace index = i } } - return dsMoList[index].Info.GetDatastoreInfo().Name, nil + return dsInfoList[index].Info.GetDatastoreInfo().Name, nil } -func getPbmCompatibleDatastore(ctx context.Context, client *vim25.Client, storagePolicyName string, folder *vclib.Folder) (string, error) { - pbmClient, err := vclib.NewPbmClient(ctx, client) +// Returns the datastores in the given datacenter by performing lookup based on datastore URL. 
+func getDatastoresForEndpointVC(ctx context.Context, dc *vclib.Datacenter, sharedDsInfos []*vclib.DatastoreInfo) ([]*vclib.DatastoreInfo, error) { + var datastores []*vclib.DatastoreInfo + allDsInfoMap, err := dc.GetAllDatastores(ctx) + if err != nil { + return nil, err + } + for _, sharedDsInfo := range sharedDsInfos { + dsInfo, ok := allDsInfoMap[sharedDsInfo.Info.Url] + if ok { + datastores = append(datastores, dsInfo) + } else { + glog.V(4).Infof("Warning: Shared datastore with URL %s does not exist in endpoint VC", sharedDsInfo.Info.Url) + } + } + glog.V(9).Infof("Datastore from endpoint VC: %+v", datastores) + return datastores, nil +} + +func getPbmCompatibleDatastore(ctx context.Context, dc *vclib.Datacenter, storagePolicyName string, nodeManager *NodeManager) (string, error) { + pbmClient, err := vclib.NewPbmClient(ctx, dc.Client()) if err != nil { return "", err } @@ -226,35 +280,40 @@ func getPbmCompatibleDatastore(ctx context.Context, client *vim25.Client, storag glog.Errorf("Failed to get Profile ID by name: %s. err: %+v", storagePolicyName, err) return "", err } - sharedDsList, err := getSharedDatastoresInK8SCluster(ctx, folder) + sharedDs, err := getSharedDatastoresInK8SCluster(ctx, dc, nodeManager) if err != nil { - glog.Errorf("Failed to get shared datastores from kubernetes cluster: %s. err: %+v", folder.InventoryPath, err) + glog.Errorf("Failed to get shared datastores. err: %+v", err) return "", err } - compatibleDatastores, _, err := pbmClient.GetCompatibleDatastores(ctx, storagePolicyID, sharedDsList) + if len(sharedDs) == 0 { + msg := "No shared datastores found in the endpoint virtual center" + glog.Errorf(msg) + return "", errors.New(msg) + } + compatibleDatastores, _, err := pbmClient.GetCompatibleDatastores(ctx, dc, storagePolicyID, sharedDs) if err != nil { - glog.Errorf("Failed to get compatible datastores from datastores : %+v with storagePolicy: %s. err: %+v", sharedDsList, storagePolicyID, err) + glog.Errorf("Failed to get compatible datastores from datastores : %+v with storagePolicy: %s. err: %+v", + sharedDs, storagePolicyID, err) return "", err } - datastore, err := getMostFreeDatastoreName(ctx, client, compatibleDatastores) + glog.V(9).Infof("compatibleDatastores : %+v", compatibleDatastores) + datastore, err := getMostFreeDatastoreName(ctx, dc.Client(), compatibleDatastores) if err != nil { glog.Errorf("Failed to get most free datastore from compatible datastores: %+v. 
err: %+v", compatibleDatastores, err) return "", err } + glog.V(4).Infof("Most free datastore : %+s", datastore) return datastore, err } -func (vs *VSphere) setVMOptions(ctx context.Context, dc *vclib.Datacenter) (*vclib.VMOptions, error) { +func (vs *VSphere) setVMOptions(ctx context.Context, dc *vclib.Datacenter, resourcePoolPath string) (*vclib.VMOptions, error) { var vmOptions vclib.VMOptions - vm, err := dc.GetVMByPath(ctx, vs.cfg.Global.WorkingDir+"/"+vs.localInstanceID) + resourcePool, err := dc.GetResourcePool(ctx, resourcePoolPath) if err != nil { return nil, err } - resourcePool, err := vm.GetResourcePool(ctx) - if err != nil { - return nil, err - } - folder, err := dc.GetFolderByPath(ctx, vs.cfg.Global.WorkingDir) + glog.V(9).Infof("Resource pool path %s, resourcePool %+v", resourcePoolPath, resourcePool) + folder, err := dc.GetFolderByPath(ctx, vs.cfg.Workspace.Folder) if err != nil { return nil, err } @@ -270,28 +329,27 @@ func (vs *VSphere) cleanUpDummyVMs(dummyVMPrefix string) { defer cancel() for { time.Sleep(CleanUpDummyVMRoutineInterval * time.Minute) - // Ensure client is logged in and session is valid - err := vs.conn.Connect(ctx) + vsi, err := vs.getVSphereInstanceForServer(vs.cfg.Workspace.VCenterIP, ctx) if err != nil { - glog.V(4).Infof("Failed to connect to VC with err: %+v. Retrying again...", err) + glog.V(4).Infof("Failed to get VSphere instance with err: %+v. Retrying again...", err) continue } - dc, err := vclib.GetDatacenter(ctx, vs.conn, vs.cfg.Global.Datacenter) + dc, err := vclib.GetDatacenter(ctx, vsi.conn, vs.cfg.Workspace.Datacenter) if err != nil { - glog.V(4).Infof("Failed to get the datacenter: %s from VC. err: %+v", vs.cfg.Global.Datacenter, err) + glog.V(4).Infof("Failed to get the datacenter: %s from VC. err: %+v", vs.cfg.Workspace.Datacenter, err) continue } // Get the folder reference for global working directory where the dummy VM needs to be created. - vmFolder, err := dc.GetFolderByPath(ctx, vs.cfg.Global.WorkingDir) + vmFolder, err := dc.GetFolderByPath(ctx, vs.cfg.Workspace.Folder) if err != nil { - glog.V(4).Infof("Unable to get the kubernetes folder: %q reference. err: %+v", vs.cfg.Global.WorkingDir, err) + glog.V(4).Infof("Unable to get the kubernetes folder: %q reference. err: %+v", vs.cfg.Workspace.Folder, err) continue } // A write lock is acquired to make sure the cleanUp routine doesn't delete any VM's created by ongoing PVC requests. defer cleanUpDummyVMLock.Lock() err = diskmanagers.CleanUpDummyVMs(ctx, vmFolder, dc) if err != nil { - glog.V(4).Infof("Unable to clean up dummy VM's in the kubernetes cluster: %q. err: %+v", vs.cfg.Global.WorkingDir, err) + glog.V(4).Infof("Unable to clean up dummy VM's in the kubernetes cluster: %q. err: %+v", vs.cfg.Workspace.Folder, err) } } } @@ -353,3 +411,118 @@ func setdatastoreFolderIDMap( } folderNameIDMap[folderName] = folderID } + +func convertVolPathToDevicePath(ctx context.Context, dc *vclib.Datacenter, volPath string) (string, error) { + volPath = vclib.RemoveStorageClusterORFolderNameFromVDiskPath(volPath) + // Get the canonical volume path for volPath. + canonicalVolumePath, err := getcanonicalVolumePath(ctx, dc, volPath) + if err != nil { + glog.Errorf("Failed to get canonical vsphere volume path for volume: %s. err: %+v", volPath, err) + return "", err + } + // Check if the volume path contains .vmdk extension. 
If not, add the extension and update the nodeVolumes Map + if len(canonicalVolumePath) > 0 && filepath.Ext(canonicalVolumePath) != ".vmdk" { + canonicalVolumePath += ".vmdk" + } + return canonicalVolumePath, nil +} + +// convertVolPathsToDevicePaths removes cluster or folder path from volPaths and convert to canonicalPath +func (vs *VSphere) convertVolPathsToDevicePaths(ctx context.Context, nodeVolumes map[k8stypes.NodeName][]string) (map[k8stypes.NodeName][]string, error) { + vmVolumes := make(map[k8stypes.NodeName][]string) + for nodeName, volPaths := range nodeVolumes { + nodeInfo, err := vs.nodeManager.GetNodeInfo(nodeName) + if err != nil { + return nil, err + } + + _, err = vs.getVSphereInstanceForServer(nodeInfo.vcServer, ctx) + if err != nil { + return nil, err + } + + for i, volPath := range volPaths { + deviceVolPath, err := convertVolPathToDevicePath(ctx, nodeInfo.dataCenter, volPath) + if err != nil { + glog.Errorf("Failed to convert vsphere volume path %s to device path for volume %s. err: %+v", volPath, deviceVolPath, err) + return nil, err + } + volPaths[i] = deviceVolPath + } + vmVolumes[nodeName] = volPaths + } + return vmVolumes, nil +} + +// checkDiskAttached verifies volumes are attached to the VMs which are in same vCenter and Datacenter +// Returns nodes if exist any for which VM is not found in that vCenter and Datacenter +func (vs *VSphere) checkDiskAttached(ctx context.Context, nodes []k8stypes.NodeName, nodeVolumes map[k8stypes.NodeName][]string, attached map[string]map[string]bool, retry bool) ([]k8stypes.NodeName, error) { + var nodesToRetry []k8stypes.NodeName + var vmList []*vclib.VirtualMachine + var nodeInfo NodeInfo + var err error + + for _, nodeName := range nodes { + nodeInfo, err = vs.nodeManager.GetNodeInfo(nodeName) + if err != nil { + return nodesToRetry, err + } + vmList = append(vmList, nodeInfo.vm) + } + + // Making sure session is valid + _, err = vs.getVSphereInstanceForServer(nodeInfo.vcServer, ctx) + if err != nil { + return nodesToRetry, err + } + + // If any of the nodes are not present property collector query will fail for entire operation + vmMoList, err := nodeInfo.dataCenter.GetVMMoList(ctx, vmList, []string{"config.hardware.device", "name", "config.uuid"}) + if err != nil { + if vclib.IsManagedObjectNotFoundError(err) && !retry { + glog.V(4).Infof("checkDiskAttached: ManagedObjectNotFound for property collector query for nodes: %+v vms: %+v", nodes, vmList) + // Property Collector Query failed + // VerifyVolumePaths per VM + for _, nodeName := range nodes { + nodeInfo, err := vs.nodeManager.GetNodeInfo(nodeName) + if err != nil { + return nodesToRetry, err + } + devices, err := nodeInfo.vm.VirtualMachine.Device(ctx) + if err != nil { + if vclib.IsManagedObjectNotFoundError(err) { + glog.V(4).Infof("checkDiskAttached: ManagedObjectNotFound for Kubernetes node: %s with vSphere Virtual Machine reference: %v", nodeName, nodeInfo.vm) + nodesToRetry = append(nodesToRetry, nodeName) + continue + } + return nodesToRetry, err + } + glog.V(4).Infof("Verifying Volume Paths by devices for node %s and VM %s", nodeName, nodeInfo.vm) + vclib.VerifyVolumePathsForVMDevices(devices, nodeVolumes[nodeName], convertToString(nodeName), attached) + } + } + return nodesToRetry, err + } + + vmMoMap := make(map[string]mo.VirtualMachine) + for _, vmMo := range vmMoList { + if vmMo.Config == nil { + glog.Errorf("Config is not available for VM: %q", vmMo.Name) + continue + } + glog.V(9).Infof("vmMoMap vmname: %q vmuuid: %s", vmMo.Name, 
strings.ToLower(vmMo.Config.Uuid)) + vmMoMap[strings.ToLower(vmMo.Config.Uuid)] = vmMo + } + + glog.V(9).Infof("vmMoMap: +%v", vmMoMap) + + for _, nodeName := range nodes { + node, err := vs.nodeManager.GetNode(nodeName) + if err != nil { + return nodesToRetry, err + } + glog.V(9).Infof("Verifying volume for nodeName: %q with nodeuuid: %s", nodeName, node.Status.NodeInfo.SystemUUID, vmMoMap) + vclib.VerifyVolumePathsForVM(vmMoMap[strings.ToLower(node.Status.NodeInfo.SystemUUID)], nodeVolumes[nodeName], convertToString(nodeName), attached) + } + return nodesToRetry, nil +} diff --git a/pkg/volume/vsphere_volume/attacher.go b/pkg/volume/vsphere_volume/attacher.go index 5b1879def4..fb9886beba 100644 --- a/pkg/volume/vsphere_volume/attacher.go +++ b/pkg/volume/vsphere_volume/attacher.go @@ -76,7 +76,7 @@ func (attacher *vsphereVMDKAttacher) Attach(spec *volume.Spec, nodeName types.No // vsphereCloud.AttachDisk checks if disk is already attached to host and // succeeds in that case, so no need to do that separately. - diskUUID, err := attacher.vsphereVolumes.AttachDisk(volumeSource.VolumePath, volumeSource.StoragePolicyID, nodeName) + diskUUID, err := attacher.vsphereVolumes.AttachDisk(volumeSource.VolumePath, volumeSource.StoragePolicyName, nodeName) if err != nil { glog.Errorf("Error attaching volume %q to node %q: %+v", volumeSource.VolumePath, nodeName, err) return "", err diff --git a/test/e2e/storage/persistent_volumes-vsphere.go b/test/e2e/storage/persistent_volumes-vsphere.go index a3be3de784..1d49361f06 100644 --- a/test/e2e/storage/persistent_volumes-vsphere.go +++ b/test/e2e/storage/persistent_volumes-vsphere.go @@ -70,7 +70,7 @@ var _ = SIGDescribe("PersistentVolumes:vsphere", func() { selector = metav1.SetAsLabelSelector(volLabel) if vsp == nil { - vsp, err = vsphere.GetVSphere() + vsp, err = getVSphere(c) Expect(err).NotTo(HaveOccurred()) } if volumePath == "" { @@ -105,7 +105,7 @@ var _ = SIGDescribe("PersistentVolumes:vsphere", func() { node = types.NodeName(clientPod.Spec.NodeName) By("Verify disk should be attached to the node") - isAttached, err := verifyVSphereDiskAttached(vsp, volumePath, node) + isAttached, err := verifyVSphereDiskAttached(c, vsp, volumePath, node) Expect(err).NotTo(HaveOccurred()) Expect(isAttached).To(BeTrue(), "disk is not attached with the node") }) @@ -133,7 +133,11 @@ var _ = SIGDescribe("PersistentVolumes:vsphere", func() { framework.AddCleanupAction(func() { // Cleanup actions will be called even when the tests are skipped and leaves namespace unset. 
if len(ns) > 0 && len(volumePath) > 0 { - framework.ExpectNoError(waitForVSphereDiskToDetach(vsp, volumePath, node)) + client, err := framework.LoadClientset() + if err != nil { + return + } + framework.ExpectNoError(waitForVSphereDiskToDetach(client, vsp, volumePath, node)) vsp.DeleteVolume(volumePath) } }) @@ -213,6 +217,6 @@ var _ = SIGDescribe("PersistentVolumes:vsphere", func() { Expect(err).NotTo(HaveOccurred()) By("Verifying Persistent Disk detaches") - waitForVSphereDiskToDetach(vsp, volumePath, node) + waitForVSphereDiskToDetach(c, vsp, volumePath, node) }) }) diff --git a/test/e2e/storage/pv_reclaimpolicy.go b/test/e2e/storage/pv_reclaimpolicy.go index b7415ec91a..8713ce7d7e 100644 --- a/test/e2e/storage/pv_reclaimpolicy.go +++ b/test/e2e/storage/pv_reclaimpolicy.go @@ -56,7 +56,7 @@ var _ = SIGDescribe("PersistentVolumes [Feature:ReclaimPolicy]", func() { }) AfterEach(func() { - vsp, err := vsphere.GetVSphere() + vsp, err := getVSphere(c) Expect(err).NotTo(HaveOccurred()) testCleanupVSpherePersistentVolumeReclaim(vsp, c, ns, volumePath, pv, pvc) }) @@ -74,7 +74,7 @@ var _ = SIGDescribe("PersistentVolumes [Feature:ReclaimPolicy]", func() { 6. Verify PV is deleted automatically. */ It("should delete persistent volume when reclaimPolicy set to delete and associated claim is deleted", func() { - vsp, err := vsphere.GetVSphere() + vsp, err := getVSphere(c) Expect(err).NotTo(HaveOccurred()) volumePath, pv, pvc, err = testSetupVSpherePersistentVolumeReclaim(vsp, c, ns, v1.PersistentVolumeReclaimDelete) @@ -104,7 +104,7 @@ var _ = SIGDescribe("PersistentVolumes [Feature:ReclaimPolicy]", func() { 9. Verify PV should be detached from the node and automatically deleted. */ It("should not detach and unmount PV when associated pvc with delete as reclaimPolicy is deleted when it is in use by the pod", func() { - vsp, err := vsphere.GetVSphere() + vsp, err := getVSphere(c) Expect(err).NotTo(HaveOccurred()) volumePath, pv, pvc, err = testSetupVSpherePersistentVolumeReclaim(vsp, c, ns, v1.PersistentVolumeReclaimDelete) @@ -127,19 +127,19 @@ var _ = SIGDescribe("PersistentVolumes [Feature:ReclaimPolicy]", func() { Expect(framework.WaitForPersistentVolumePhase(v1.VolumeFailed, c, pv.Name, 1*time.Second, 60*time.Second)).NotTo(HaveOccurred()) By("Verify the volume is attached to the node") - isVolumeAttached, verifyDiskAttachedError := verifyVSphereDiskAttached(vsp, pv.Spec.VsphereVolume.VolumePath, node) + isVolumeAttached, verifyDiskAttachedError := verifyVSphereDiskAttached(c, vsp, pv.Spec.VsphereVolume.VolumePath, node) Expect(verifyDiskAttachedError).NotTo(HaveOccurred()) Expect(isVolumeAttached).To(BeTrue()) By("Verify the volume is accessible and available in the pod") - verifyVSphereVolumesAccessible(pod, []*v1.PersistentVolume{pv}, vsp) + verifyVSphereVolumesAccessible(c, pod, []*v1.PersistentVolume{pv}, vsp) framework.Logf("Verified that Volume is accessible in the POD after deleting PV claim") By("Deleting the Pod") framework.ExpectNoError(framework.DeletePodWithWait(f, c, pod), "Failed to delete pod ", pod.Name) By("Verify PV is detached from the node after Pod is deleted") - Expect(waitForVSphereDiskToDetach(vsp, pv.Spec.VsphereVolume.VolumePath, types.NodeName(pod.Spec.NodeName))).NotTo(HaveOccurred()) + Expect(waitForVSphereDiskToDetach(c, vsp, pv.Spec.VsphereVolume.VolumePath, types.NodeName(pod.Spec.NodeName))).NotTo(HaveOccurred()) By("Verify PV should be deleted automatically") framework.ExpectNoError(framework.WaitForPersistentVolumeDeleted(c, pv.Name, 1*time.Second, 
30*time.Second)) @@ -167,7 +167,7 @@ var _ = SIGDescribe("PersistentVolumes [Feature:ReclaimPolicy]", func() { It("should retain persistent volume when reclaimPolicy set to retain when associated claim is deleted", func() { var volumeFileContent = "hello from vsphere cloud provider, Random Content is :" + strconv.FormatInt(time.Now().UnixNano(), 10) - vsp, err := vsphere.GetVSphere() + vsp, err := getVSphere(c) Expect(err).NotTo(HaveOccurred()) volumePath, pv, pvc, err = testSetupVSpherePersistentVolumeReclaim(vsp, c, ns, v1.PersistentVolumeReclaimRetain) diff --git a/test/e2e/storage/pvc_label_selector.go b/test/e2e/storage/pvc_label_selector.go index 67842c7717..d389386a6c 100644 --- a/test/e2e/storage/pvc_label_selector.go +++ b/test/e2e/storage/pvc_label_selector.go @@ -23,7 +23,6 @@ import ( . "github.com/onsi/gomega" "k8s.io/api/core/v1" clientset "k8s.io/client-go/kubernetes" - vsphere "k8s.io/kubernetes/pkg/cloudprovider/providers/vsphere" "k8s.io/kubernetes/test/e2e/framework" ) @@ -104,7 +103,7 @@ var _ = SIGDescribe("PersistentVolumes [Feature:LabelSelector]", func() { func testSetupVSpherePVClabelselector(c clientset.Interface, ns string, ssdlabels map[string]string, vvollabels map[string]string) (volumePath string, pv_ssd *v1.PersistentVolume, pvc_ssd *v1.PersistentVolumeClaim, pvc_vvol *v1.PersistentVolumeClaim, err error) { volumePath = "" By("creating vmdk") - vsp, err := vsphere.GetVSphere() + vsp, err := getVSphere(c) Expect(err).NotTo(HaveOccurred()) volumePath, err = createVSphereVolume(vsp, nil) if err != nil { @@ -134,7 +133,7 @@ func testSetupVSpherePVClabelselector(c clientset.Interface, ns string, ssdlabel func testCleanupVSpherePVClabelselector(c clientset.Interface, ns string, volumePath string, pv_ssd *v1.PersistentVolume, pvc_ssd *v1.PersistentVolumeClaim, pvc_vvol *v1.PersistentVolumeClaim) { By("running testCleanupVSpherePVClabelselector") if len(volumePath) > 0 { - vsp, err := vsphere.GetVSphere() + vsp, err := getVSphere(c) Expect(err).NotTo(HaveOccurred()) vsp.DeleteVolume(volumePath) } diff --git a/test/e2e/storage/volumes.go b/test/e2e/storage/volumes.go index 61c42297ff..da4ad0bc67 100644 --- a/test/e2e/storage/volumes.go +++ b/test/e2e/storage/volumes.go @@ -53,7 +53,6 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" clientset "k8s.io/client-go/kubernetes" - "k8s.io/kubernetes/pkg/cloudprovider/providers/vsphere" kubeletapis "k8s.io/kubernetes/pkg/kubelet/apis" "k8s.io/kubernetes/test/e2e/framework" ) @@ -507,7 +506,11 @@ var _ = SIGDescribe("Volumes", func() { Prefix: "vsphere", } By("creating a test vsphere volume") - vsp, err := vsphere.GetVSphere() + c, err := framework.LoadClientset() + if err != nil { + return + } + vsp, err := getVSphere(c) Expect(err).NotTo(HaveOccurred()) volumePath, err = createVSphereVolume(vsp, nil) diff --git a/test/e2e/storage/vsphere_scale.go b/test/e2e/storage/vsphere_scale.go index 810b70d6e3..ded690e204 100644 --- a/test/e2e/storage/vsphere_scale.go +++ b/test/e2e/storage/vsphere_scale.go @@ -150,7 +150,7 @@ var _ = SIGDescribe("vcp at scale [Feature:vsphere] ", func() { scArrays[index] = sc } - vsp, err := vsphere.GetVSphere() + vsp, err := getVSphere(client) Expect(err).NotTo(HaveOccurred()) volumeCountPerInstance := volumeCount / numberOfInstances @@ -176,7 +176,7 @@ var _ = SIGDescribe("vcp at scale [Feature:vsphere] ", func() { Expect(err).NotTo(HaveOccurred()) } By("Waiting for volumes to be detached from the node") - err = waitForVSphereDisksToDetach(vsp, nodeVolumeMap) 
+ err = waitForVSphereDisksToDetach(client, vsp, nodeVolumeMap) Expect(err).NotTo(HaveOccurred()) for _, pvcClaim := range pvcClaimList { @@ -228,7 +228,7 @@ func VolumeCreateAndAttach(client clientset.Interface, namespace string, sc []*s nodeVolumeMap[pod.Spec.NodeName] = append(nodeVolumeMap[pod.Spec.NodeName], pv.Spec.VsphereVolume.VolumePath) } By("Verify the volume is accessible and available in the pod") - verifyVSphereVolumesAccessible(pod, persistentvolumes, vsp) + verifyVSphereVolumesAccessible(client, pod, persistentvolumes, vsp) nodeSelectorIndex++ } nodeVolumeMapChan <- nodeVolumeMap diff --git a/test/e2e/storage/vsphere_statefulsets.go b/test/e2e/storage/vsphere_statefulsets.go index c2823c72d8..b0a633ad39 100644 --- a/test/e2e/storage/vsphere_statefulsets.go +++ b/test/e2e/storage/vsphere_statefulsets.go @@ -24,7 +24,6 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" clientset "k8s.io/client-go/kubernetes" - "k8s.io/kubernetes/pkg/cloudprovider/providers/vsphere" "k8s.io/kubernetes/test/e2e/framework" ) @@ -104,7 +103,7 @@ var _ = SIGDescribe("vsphere statefulset", func() { Expect(scaledownErr).NotTo(HaveOccurred()) statefulsetTester.WaitForStatusReadyReplicas(statefulset, replicas-1) - vsp, err := vsphere.GetVSphere() + vsp, err := getVSphere(client) Expect(err).NotTo(HaveOccurred()) // After scale down, verify vsphere volumes are detached from deleted pods @@ -117,7 +116,7 @@ var _ = SIGDescribe("vsphere statefulset", func() { if volumespec.PersistentVolumeClaim != nil { vSpherediskPath := getvSphereVolumePathFromClaim(client, statefulset.Namespace, volumespec.PersistentVolumeClaim.ClaimName) framework.Logf("Waiting for Volume: %q to detach from Node: %q", vSpherediskPath, sspod.Spec.NodeName) - Expect(waitForVSphereDiskToDetach(vsp, vSpherediskPath, types.NodeName(sspod.Spec.NodeName))).NotTo(HaveOccurred()) + Expect(waitForVSphereDiskToDetach(client, vsp, vSpherediskPath, types.NodeName(sspod.Spec.NodeName))).NotTo(HaveOccurred()) } } } @@ -146,7 +145,7 @@ var _ = SIGDescribe("vsphere statefulset", func() { framework.Logf("Verify Volume: %q is attached to the Node: %q", vSpherediskPath, sspod.Spec.NodeName) // Verify scale up has re-attached the same volumes and not introduced new volume Expect(volumesBeforeScaleDown[vSpherediskPath] == "").To(BeFalse()) - isVolumeAttached, verifyDiskAttachedError := verifyVSphereDiskAttached(vsp, vSpherediskPath, types.NodeName(sspod.Spec.NodeName)) + isVolumeAttached, verifyDiskAttachedError := verifyVSphereDiskAttached(client, vsp, vSpherediskPath, types.NodeName(sspod.Spec.NodeName)) Expect(isVolumeAttached).To(BeTrue()) Expect(verifyDiskAttachedError).NotTo(HaveOccurred()) } diff --git a/test/e2e/storage/vsphere_stress.go b/test/e2e/storage/vsphere_stress.go index 4dcea07605..4be0205e05 100644 --- a/test/e2e/storage/vsphere_stress.go +++ b/test/e2e/storage/vsphere_stress.go @@ -30,7 +30,6 @@ import ( "k8s.io/apimachinery/pkg/types" k8stype "k8s.io/apimachinery/pkg/types" clientset "k8s.io/client-go/kubernetes" - "k8s.io/kubernetes/pkg/cloudprovider/providers/vsphere" "k8s.io/kubernetes/test/e2e/framework" ) @@ -135,9 +134,8 @@ var _ = SIGDescribe("vsphere cloud provider stress [Feature:vsphere]", func() { func PerformVolumeLifeCycleInParallel(f *framework.Framework, client clientset.Interface, namespace string, instanceId string, sc *storageV1.StorageClass, iterations int, wg *sync.WaitGroup) { defer wg.Done() defer GinkgoRecover() - vsp, err := vsphere.GetVSphere() + vsp, err := 
getVSphere(f.ClientSet) Expect(err).NotTo(HaveOccurred()) - for iterationCount := 0; iterationCount < iterations; iterationCount++ { logPrefix := fmt.Sprintf("Instance: [%v], Iteration: [%v] :", instanceId, iterationCount+1) By(fmt.Sprintf("%v Creating PVC using the Storage Class: %v", logPrefix, sc.Name)) @@ -164,19 +162,19 @@ func PerformVolumeLifeCycleInParallel(f *framework.Framework, client clientset.I Expect(err).NotTo(HaveOccurred()) By(fmt.Sprintf("%v Verifing the volume: %v is attached to the node VM: %v", logPrefix, persistentvolumes[0].Spec.VsphereVolume.VolumePath, pod.Spec.NodeName)) - isVolumeAttached, verifyDiskAttachedError := verifyVSphereDiskAttached(vsp, persistentvolumes[0].Spec.VsphereVolume.VolumePath, types.NodeName(pod.Spec.NodeName)) + isVolumeAttached, verifyDiskAttachedError := verifyVSphereDiskAttached(client, vsp, persistentvolumes[0].Spec.VsphereVolume.VolumePath, types.NodeName(pod.Spec.NodeName)) Expect(isVolumeAttached).To(BeTrue()) Expect(verifyDiskAttachedError).NotTo(HaveOccurred()) By(fmt.Sprintf("%v Verifing the volume: %v is accessible in the pod: %v", logPrefix, persistentvolumes[0].Spec.VsphereVolume.VolumePath, pod.Name)) - verifyVSphereVolumesAccessible(pod, persistentvolumes, vsp) + verifyVSphereVolumesAccessible(client, pod, persistentvolumes, vsp) By(fmt.Sprintf("%v Deleting pod: %v", logPrefix, pod.Name)) err = framework.DeletePodWithWait(f, client, pod) Expect(err).NotTo(HaveOccurred()) By(fmt.Sprintf("%v Waiting for volume: %v to be detached from the node: %v", logPrefix, persistentvolumes[0].Spec.VsphereVolume.VolumePath, pod.Spec.NodeName)) - err = waitForVSphereDiskToDetach(vsp, persistentvolumes[0].Spec.VsphereVolume.VolumePath, k8stype.NodeName(pod.Spec.NodeName)) + err = waitForVSphereDiskToDetach(client, vsp, persistentvolumes[0].Spec.VsphereVolume.VolumePath, k8stype.NodeName(pod.Spec.NodeName)) Expect(err).NotTo(HaveOccurred()) By(fmt.Sprintf("%v Deleting the Claim: %v", logPrefix, pvclaim.Name)) diff --git a/test/e2e/storage/vsphere_utils.go b/test/e2e/storage/vsphere_utils.go index 2215b6fa77..b0c87c0cd7 100644 --- a/test/e2e/storage/vsphere_utils.go +++ b/test/e2e/storage/vsphere_utils.go @@ -55,13 +55,13 @@ const ( ) // Sanity check for vSphere testing. Verify the persistent disk attached to the node. 
-func verifyVSphereDiskAttached(vsp *vsphere.VSphere, volumePath string, nodeName types.NodeName) (bool, error) { +func verifyVSphereDiskAttached(c clientset.Interface, vsp *vsphere.VSphere, volumePath string, nodeName types.NodeName) (bool, error) { var ( isAttached bool err error ) if vsp == nil { - vsp, err = vsphere.GetVSphere() + vsp, err = getVSphere(c) Expect(err).NotTo(HaveOccurred()) } isAttached, err = vsp.DiskIsAttached(volumePath, nodeName) @@ -70,7 +70,7 @@ func verifyVSphereDiskAttached(vsp *vsphere.VSphere, volumePath string, nodeName } // Wait until vsphere volumes are detached from the list of nodes or time out after 5 minutes -func waitForVSphereDisksToDetach(vsp *vsphere.VSphere, nodeVolumes map[k8stype.NodeName][]string) error { +func waitForVSphereDisksToDetach(c clientset.Interface, vsp *vsphere.VSphere, nodeVolumes map[k8stype.NodeName][]string) error { var ( err error disksAttached = true @@ -78,7 +78,7 @@ func waitForVSphereDisksToDetach(vsp *vsphere.VSphere, nodeVolumes map[k8stype.N detachPollTime = 10 * time.Second ) if vsp == nil { - vsp, err = vsphere.GetVSphere() + vsp, err = getVSphere(c) if err != nil { return err } @@ -110,7 +110,7 @@ func waitForVSphereDisksToDetach(vsp *vsphere.VSphere, nodeVolumes map[k8stype.N } // Wait until vsphere vmdk moves to expected state on the given node, or time out after 6 minutes -func waitForVSphereDiskStatus(vsp *vsphere.VSphere, volumePath string, nodeName types.NodeName, expectedState volumeState) error { +func waitForVSphereDiskStatus(c clientset.Interface, vsp *vsphere.VSphere, volumePath string, nodeName types.NodeName, expectedState volumeState) error { var ( err error diskAttached bool @@ -130,7 +130,7 @@ func waitForVSphereDiskStatus(vsp *vsphere.VSphere, volumePath string, nodeName } err = wait.Poll(pollTime, timeout, func() (bool, error) { - diskAttached, err = verifyVSphereDiskAttached(vsp, volumePath, nodeName) + diskAttached, err = verifyVSphereDiskAttached(c, vsp, volumePath, nodeName) if err != nil { return true, err } @@ -154,13 +154,13 @@ func waitForVSphereDiskStatus(vsp *vsphere.VSphere, volumePath string, nodeName } // Wait until vsphere vmdk is attached from the given node or time out after 6 minutes -func waitForVSphereDiskToAttach(vsp *vsphere.VSphere, volumePath string, nodeName types.NodeName) error { - return waitForVSphereDiskStatus(vsp, volumePath, nodeName, volumeStateAttached) +func waitForVSphereDiskToAttach(c clientset.Interface, vsp *vsphere.VSphere, volumePath string, nodeName types.NodeName) error { + return waitForVSphereDiskStatus(c, vsp, volumePath, nodeName, volumeStateAttached) } // Wait until vsphere vmdk is detached from the given node or time out after 6 minutes -func waitForVSphereDiskToDetach(vsp *vsphere.VSphere, volumePath string, nodeName types.NodeName) error { - return waitForVSphereDiskStatus(vsp, volumePath, nodeName, volumeStateDetached) +func waitForVSphereDiskToDetach(c clientset.Interface, vsp *vsphere.VSphere, volumePath string, nodeName types.NodeName) error { + return waitForVSphereDiskStatus(c, vsp, volumePath, nodeName, volumeStateDetached) } // function to create vsphere volume spec with given VMDK volume path, Reclaim Policy and labels @@ -414,12 +414,12 @@ func createEmptyFilesOnVSphereVolume(namespace string, podName string, filePaths } // verify volumes are attached to the node and are accessible in pod -func verifyVSphereVolumesAccessible(pod *v1.Pod, persistentvolumes []*v1.PersistentVolume, vsp *vsphere.VSphere) { +func verifyVSphereVolumesAccessible(c 
clientset.Interface, pod *v1.Pod, persistentvolumes []*v1.PersistentVolume, vsp *vsphere.VSphere) { nodeName := pod.Spec.NodeName namespace := pod.Namespace for index, pv := range persistentvolumes { // Verify disks are attached to the node - isAttached, err := verifyVSphereDiskAttached(vsp, pv.Spec.VsphereVolume.VolumePath, k8stype.NodeName(nodeName)) + isAttached, err := verifyVSphereDiskAttached(c, vsp, pv.Spec.VsphereVolume.VolumePath, k8stype.NodeName(nodeName)) Expect(err).NotTo(HaveOccurred()) Expect(isAttached).To(BeTrue(), fmt.Sprintf("disk %v is not attached with the node", pv.Spec.VsphereVolume.VolumePath)) // Verify Volumes are accessible @@ -437,3 +437,23 @@ func getvSphereVolumePathFromClaim(client clientset.Interface, namespace string, Expect(err).NotTo(HaveOccurred()) return pv.Spec.VsphereVolume.VolumePath } + +func addNodesToVCP(vsp *vsphere.VSphere, c clientset.Interface) error { + nodes, err := c.CoreV1().Nodes().List(metav1.ListOptions{}) + if err != nil { + return err + } + for _, node := range nodes.Items { + vsp.NodeAdded(&node) + } + return nil +} + +func getVSphere(c clientset.Interface) (*vsphere.VSphere, error) { + vsp, err := vsphere.GetVSphere() + if err != nil { + return nil, err + } + addNodesToVCP(vsp, c) + return vsp, nil +} diff --git a/test/e2e/storage/vsphere_volume_cluster_ds.go b/test/e2e/storage/vsphere_volume_cluster_ds.go index 75ef426f65..b2ebd29fb1 100644 --- a/test/e2e/storage/vsphere_volume_cluster_ds.go +++ b/test/e2e/storage/vsphere_volume_cluster_ds.go @@ -25,7 +25,6 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" clientset "k8s.io/client-go/kubernetes" - "k8s.io/kubernetes/pkg/cloudprovider/providers/vsphere" "k8s.io/kubernetes/pkg/cloudprovider/providers/vsphere/vclib" "k8s.io/kubernetes/test/e2e/framework" ) @@ -69,7 +68,7 @@ var _ = SIGDescribe("Volume Provisioning On Clustered Datastore [Feature:vsphere It("verify static provisioning on clustered datastore", func() { var volumePath string - vsp, err := vsphere.GetVSphere() + vsp, err := getVSphere(client) Expect(err).NotTo(HaveOccurred()) By("creating a test vsphere volume") @@ -100,7 +99,7 @@ var _ = SIGDescribe("Volume Provisioning On Clustered Datastore [Feature:vsphere nodeName := types.NodeName(pod.Spec.NodeName) By("Verifying volume is attached") - isAttached, err := verifyVSphereDiskAttached(vsp, volumePath, nodeName) + isAttached, err := verifyVSphereDiskAttached(client, vsp, volumePath, nodeName) Expect(err).NotTo(HaveOccurred()) Expect(isAttached).To(BeTrue(), fmt.Sprintf("disk: %s is not attached with the node: %v", volumePath, nodeName)) @@ -109,7 +108,7 @@ var _ = SIGDescribe("Volume Provisioning On Clustered Datastore [Feature:vsphere Expect(err).NotTo(HaveOccurred()) By("Waiting for volumes to be detached from the node") - err = waitForVSphereDiskToDetach(vsp, volumePath, nodeName) + err = waitForVSphereDiskToDetach(client, vsp, volumePath, nodeName) Expect(err).NotTo(HaveOccurred()) }) diff --git a/test/e2e/storage/vsphere_volume_datastore.go b/test/e2e/storage/vsphere_volume_datastore.go index a80f22e911..df86a96e5e 100644 --- a/test/e2e/storage/vsphere_volume_datastore.go +++ b/test/e2e/storage/vsphere_volume_datastore.go @@ -68,7 +68,7 @@ var _ = SIGDescribe("Volume Provisioning on Datastore [Feature:vsphere]", func() scParameters[DiskFormat] = ThinDisk err := invokeInvalidDatastoreTestNeg(client, namespace, scParameters) Expect(err).To(HaveOccurred()) - errorMsg := `Failed to provision volume with StorageClass \"` + 
DatastoreSCName + `\": datastore '` + InvalidDatastore + `' not found` + errorMsg := `Failed to provision volume with StorageClass \"` + DatastoreSCName + `\": The specified datastore ` + InvalidDatastore + ` is not a shared datastore across node VMs` if !strings.Contains(err.Error(), errorMsg) { Expect(err).NotTo(HaveOccurred(), errorMsg) } diff --git a/test/e2e/storage/vsphere_volume_diskformat.go b/test/e2e/storage/vsphere_volume_diskformat.go index de1915b990..b805eb4d5a 100644 --- a/test/e2e/storage/vsphere_volume_diskformat.go +++ b/test/e2e/storage/vsphere_volume_diskformat.go @@ -145,9 +145,9 @@ func invokeTest(f *framework.Framework, client clientset.Interface, namespace st pod, err := client.CoreV1().Pods(namespace).Create(podSpec) Expect(err).NotTo(HaveOccurred()) - vsp, err := vsphere.GetVSphere() + vsp, err := getVSphere(client) Expect(err).NotTo(HaveOccurred()) - verifyVSphereDiskAttached(vsp, pv.Spec.VsphereVolume.VolumePath, k8stype.NodeName(nodeName)) + verifyVSphereDiskAttached(client, vsp, pv.Spec.VsphereVolume.VolumePath, k8stype.NodeName(nodeName)) By("Waiting for pod to be running") Expect(framework.WaitForPodNameRunningInNamespace(client, pod.Name, namespace)).To(Succeed()) diff --git a/test/e2e/storage/vsphere_volume_fstype.go b/test/e2e/storage/vsphere_volume_fstype.go index 7ace7eed6e..352b6dd393 100644 --- a/test/e2e/storage/vsphere_volume_fstype.go +++ b/test/e2e/storage/vsphere_volume_fstype.go @@ -97,7 +97,7 @@ func invokeTestForFstype(f *framework.Framework, client clientset.Interface, nam framework.Logf("Invoking Test for fstype: %s", fstype) scParameters := make(map[string]string) scParameters["fstype"] = fstype - vsp, err := vsphere.GetVSphere() + vsp, err := getVSphere(client) Expect(err).NotTo(HaveOccurred()) // Create Persistent Volume @@ -117,7 +117,7 @@ func invokeTestForFstype(f *framework.Framework, client clientset.Interface, nam func invokeTestForInvalidFstype(f *framework.Framework, client clientset.Interface, namespace string, fstype string) { scParameters := make(map[string]string) scParameters["fstype"] = fstype - vsp, err := vsphere.GetVSphere() + vsp, err := getVSphere(client) Expect(err).NotTo(HaveOccurred()) // Create Persistent Volume @@ -170,12 +170,12 @@ func createPodAndVerifyVolumeAccessible(client clientset.Interface, namespace st pvclaims = append(pvclaims, pvclaim) By("Creating pod to attach PV to the node") // Create pod to attach Volume to Node - pod, err := framework.CreatePod(client, namespace, nil, pvclaims, false, "") + pod, err := framework.CreatePod(client, namespace, nil, pvclaims, false, ExecCommand) Expect(err).NotTo(HaveOccurred()) // Asserts: Right disk is attached to the pod By("Verify the volume is accessible and available in the pod") - verifyVSphereVolumesAccessible(pod, persistentvolumes, vsp) + verifyVSphereVolumesAccessible(client, pod, persistentvolumes, vsp) return pod } @@ -184,7 +184,7 @@ func detachVolume(f *framework.Framework, client clientset.Interface, vsp *vsphe framework.DeletePodWithWait(f, client, pod) By("Waiting for volumes to be detached from the node") - waitForVSphereDiskToDetach(vsp, volPath, k8stype.NodeName(pod.Spec.NodeName)) + waitForVSphereDiskToDetach(client, vsp, volPath, k8stype.NodeName(pod.Spec.NodeName)) } func deleteVolume(client clientset.Interface, pvclaimName string, namespace string) { diff --git a/test/e2e/storage/vsphere_volume_master_restart.go b/test/e2e/storage/vsphere_volume_master_restart.go index 36a0164aab..5ba8616251 100644 --- 
a/test/e2e/storage/vsphere_volume_master_restart.go +++ b/test/e2e/storage/vsphere_volume_master_restart.go @@ -27,7 +27,6 @@ import ( "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/uuid" clientset "k8s.io/client-go/kubernetes" - "k8s.io/kubernetes/pkg/cloudprovider/providers/vsphere" "k8s.io/kubernetes/test/e2e/framework" ) @@ -79,7 +78,7 @@ var _ = SIGDescribe("Volume Attach Verify [Feature:vsphere][Serial][Disruptive]" }) It("verify volume remains attached after master kubelet restart", func() { - vsp, err := vsphere.GetVSphere() + vsp, err := getVSphere(client) Expect(err).NotTo(HaveOccurred()) // Create pod on each node @@ -106,7 +105,7 @@ var _ = SIGDescribe("Volume Attach Verify [Feature:vsphere][Serial][Disruptive]" nodeName := types.NodeName(pod.Spec.NodeName) By(fmt.Sprintf("Verify volume %s is attached to the pod %v", volumePath, nodeName)) - isAttached, err := verifyVSphereDiskAttached(vsp, volumePath, types.NodeName(nodeName)) + isAttached, err := verifyVSphereDiskAttached(client, vsp, volumePath, types.NodeName(nodeName)) Expect(err).NotTo(HaveOccurred()) Expect(isAttached).To(BeTrue(), fmt.Sprintf("disk: %s is not attached with the node", volumePath)) @@ -126,7 +125,7 @@ var _ = SIGDescribe("Volume Attach Verify [Feature:vsphere][Serial][Disruptive]" nodeName := types.NodeName(pod.Spec.NodeName) By(fmt.Sprintf("After master restart, verify volume %v is attached to the pod %v", volumePath, nodeName)) - isAttached, err := verifyVSphereDiskAttached(vsp, volumePaths[i], types.NodeName(nodeName)) + isAttached, err := verifyVSphereDiskAttached(client, vsp, volumePaths[i], types.NodeName(nodeName)) Expect(err).NotTo(HaveOccurred()) Expect(isAttached).To(BeTrue(), fmt.Sprintf("disk: %s is not attached with the node", volumePath)) @@ -135,7 +134,7 @@ var _ = SIGDescribe("Volume Attach Verify [Feature:vsphere][Serial][Disruptive]" Expect(err).NotTo(HaveOccurred()) By(fmt.Sprintf("Waiting for volume %s to be detached from the node %v", volumePath, nodeName)) - err = waitForVSphereDiskToDetach(vsp, volumePath, types.NodeName(nodeName)) + err = waitForVSphereDiskToDetach(client, vsp, volumePath, types.NodeName(nodeName)) Expect(err).NotTo(HaveOccurred()) By(fmt.Sprintf("Deleting volume %s", volumePath)) diff --git a/test/e2e/storage/vsphere_volume_node_poweroff.go b/test/e2e/storage/vsphere_volume_node_poweroff.go index a28a06305f..b902872edc 100644 --- a/test/e2e/storage/vsphere_volume_node_poweroff.go +++ b/test/e2e/storage/vsphere_volume_node_poweroff.go @@ -61,7 +61,7 @@ var _ = SIGDescribe("Node Poweroff [Feature:vsphere] [Slow] [Disruptive]", func( nodeList := framework.GetReadySchedulableNodesOrDie(f.ClientSet) Expect(nodeList.Items).NotTo(BeEmpty(), "Unable to find ready and schedulable Node") Expect(len(nodeList.Items) > 1).To(BeTrue(), "At least 2 nodes are required for this test") - vsp, err = vsphere.GetVSphere() + vsp, err = getVSphere(client) Expect(err).NotTo(HaveOccurred()) workingDir = os.Getenv("VSPHERE_WORKING_DIR") Expect(workingDir).NotTo(BeEmpty()) @@ -112,7 +112,7 @@ var _ = SIGDescribe("Node Poweroff [Feature:vsphere] [Slow] [Disruptive]", func( node1 := types.NodeName(pod.Spec.NodeName) By(fmt.Sprintf("Verify disk is attached to the node: %v", node1)) - isAttached, err := verifyVSphereDiskAttached(vsp, volumePath, node1) + isAttached, err := verifyVSphereDiskAttached(client, vsp, volumePath, node1) Expect(err).NotTo(HaveOccurred()) Expect(isAttached).To(BeTrue(), "Disk is not attached to the node") @@ -139,11 +139,11 @@ var _ = SIGDescribe("Node 
Poweroff [Feature:vsphere] [Slow] [Disruptive]", func( Expect(err).NotTo(HaveOccurred(), "Pod did not fail over to a different node") By(fmt.Sprintf("Waiting for disk to be attached to the new node: %v", node2)) - err = waitForVSphereDiskToAttach(vsp, volumePath, node2) + err = waitForVSphereDiskToAttach(client, vsp, volumePath, node2) Expect(err).NotTo(HaveOccurred(), "Disk is not attached to the node") By(fmt.Sprintf("Waiting for disk to be detached from the previous node: %v", node1)) - err = waitForVSphereDiskToDetach(vsp, volumePath, node1) + err = waitForVSphereDiskToDetach(client, vsp, volumePath, node1) Expect(err).NotTo(HaveOccurred(), "Disk is not detached from the node") By(fmt.Sprintf("Power on the previous node: %v", node1)) diff --git a/test/e2e/storage/vsphere_volume_ops_storm.go b/test/e2e/storage/vsphere_volume_ops_storm.go index cfaca95fd8..b1b6516d0d 100644 --- a/test/e2e/storage/vsphere_volume_ops_storm.go +++ b/test/e2e/storage/vsphere_volume_ops_storm.go @@ -75,7 +75,7 @@ var _ = SIGDescribe("Volume Operations Storm [Feature:vsphere]", func() { volume_ops_scale = DEFAULT_VOLUME_OPS_SCALE } pvclaims = make([]*v1.PersistentVolumeClaim, volume_ops_scale) - vsp, err = vsphere.GetVSphere() + vsp, err = getVSphere(client) Expect(err).NotTo(HaveOccurred()) }) AfterEach(func() { @@ -113,14 +113,14 @@ var _ = SIGDescribe("Volume Operations Storm [Feature:vsphere]", func() { Expect(err).NotTo(HaveOccurred()) By("Verify all volumes are accessible and available in the pod") - verifyVSphereVolumesAccessible(pod, persistentvolumes, vsp) + verifyVSphereVolumesAccessible(client, pod, persistentvolumes, vsp) By("Deleting pod") framework.ExpectNoError(framework.DeletePodWithWait(f, client, pod)) By("Waiting for volumes to be detached from the node") for _, pv := range persistentvolumes { - waitForVSphereDiskToDetach(vsp, pv.Spec.VsphereVolume.VolumePath, k8stype.NodeName(pod.Spec.NodeName)) + waitForVSphereDiskToDetach(client, vsp, pv.Spec.VsphereVolume.VolumePath, k8stype.NodeName(pod.Spec.NodeName)) } }) }) diff --git a/test/e2e/storage/vsphere_volume_perf.go b/test/e2e/storage/vsphere_volume_perf.go index 06bcaa83c6..59ca395178 100644 --- a/test/e2e/storage/vsphere_volume_perf.go +++ b/test/e2e/storage/vsphere_volume_perf.go @@ -28,7 +28,6 @@ import ( storageV1 "k8s.io/api/storage/v1" "k8s.io/apimachinery/pkg/types" clientset "k8s.io/client-go/kubernetes" - "k8s.io/kubernetes/pkg/cloudprovider/providers/vsphere" "k8s.io/kubernetes/test/e2e/framework" ) @@ -214,11 +213,11 @@ func invokeVolumeLifeCyclePerformance(f *framework.Framework, client clientset.I latency[AttachOp] = elapsed.Seconds() // Verify access to the volumes - vsp, err := vsphere.GetVSphere() + vsp, err := getVSphere(client) Expect(err).NotTo(HaveOccurred()) for i, pod := range totalpods { - verifyVSphereVolumesAccessible(pod, totalpvs[i], vsp) + verifyVSphereVolumesAccessible(client, pod, totalpvs[i], vsp) } By("Deleting pods") @@ -237,7 +236,7 @@ func invokeVolumeLifeCyclePerformance(f *framework.Framework, client clientset.I } } - err = waitForVSphereDisksToDetach(vsp, nodeVolumeMap) + err = waitForVSphereDisksToDetach(client, vsp, nodeVolumeMap) Expect(err).NotTo(HaveOccurred()) By("Deleting the PVCs") diff --git a/test/e2e/storage/vsphere_volume_placement.go b/test/e2e/storage/vsphere_volume_placement.go index 6b056a994b..417d0723b1 100644 --- a/test/e2e/storage/vsphere_volume_placement.go +++ b/test/e2e/storage/vsphere_volume_placement.go @@ -57,7 +57,7 @@ var _ = SIGDescribe("Volume Placement", func() { 
isNodeLabeled = true } By("creating vmdk") - vsp, err = vsphere.GetVSphere() + vsp, err = getVSphere(c) Expect(err).NotTo(HaveOccurred()) volumePath, err := createVSphereVolume(vsp, nil) Expect(err).NotTo(HaveOccurred()) @@ -285,7 +285,7 @@ var _ = SIGDescribe("Volume Placement", func() { framework.ExpectNoError(framework.DeletePodWithWait(f, c, podB), "defer: Failed to delete pod ", podB.Name) By(fmt.Sprintf("wait for volumes to be detached from the node: %v", node1Name)) for _, volumePath := range volumePaths { - framework.ExpectNoError(waitForVSphereDiskToDetach(vsp, volumePath, types.NodeName(node1Name))) + framework.ExpectNoError(waitForVSphereDiskToDetach(c, vsp, volumePath, types.NodeName(node1Name))) } }() @@ -362,7 +362,7 @@ func createPodWithVolumeAndNodeSelector(client clientset.Interface, namespace st By(fmt.Sprintf("Verify volume is attached to the node:%v", nodeName)) for _, volumePath := range volumePaths { - isAttached, err := verifyVSphereDiskAttached(vsp, volumePath, types.NodeName(nodeName)) + isAttached, err := verifyVSphereDiskAttached(client, vsp, volumePath, types.NodeName(nodeName)) Expect(err).NotTo(HaveOccurred()) Expect(isAttached).To(BeTrue(), "disk:"+volumePath+" is not attached with the node") } @@ -385,6 +385,6 @@ func deletePodAndWaitForVolumeToDetach(f *framework.Framework, c clientset.Inter By("Waiting for volume to be detached from the node") for _, volumePath := range volumePaths { - framework.ExpectNoError(waitForVSphereDiskToDetach(vsp, volumePath, types.NodeName(nodeName))) + framework.ExpectNoError(waitForVSphereDiskToDetach(c, vsp, volumePath, types.NodeName(nodeName))) } } diff --git a/test/e2e/storage/vsphere_volume_vsan_policy.go b/test/e2e/storage/vsphere_volume_vsan_policy.go index f558e49c10..f3bef3f1c9 100644 --- a/test/e2e/storage/vsphere_volume_vsan_policy.go +++ b/test/e2e/storage/vsphere_volume_vsan_policy.go @@ -295,16 +295,16 @@ func invokeValidPolicyTest(f *framework.Framework, client clientset.Interface, n pod, err := framework.CreatePod(client, namespace, nil, pvclaims, false, "") Expect(err).NotTo(HaveOccurred()) - vsp, err := vsphere.GetVSphere() + vsp, err := getVSphere(client) Expect(err).NotTo(HaveOccurred()) By("Verify the volume is accessible and available in the pod") - verifyVSphereVolumesAccessible(pod, persistentvolumes, vsp) + verifyVSphereVolumesAccessible(client, pod, persistentvolumes, vsp) By("Deleting pod") framework.DeletePodWithWait(f, client, pod) By("Waiting for volumes to be detached from the node") - waitForVSphereDiskToDetach(vsp, persistentvolumes[0].Spec.VsphereVolume.VolumePath, k8stype.NodeName(pod.Spec.NodeName)) + waitForVSphereDiskToDetach(client, vsp, persistentvolumes[0].Spec.VsphereVolume.VolumePath, k8stype.NodeName(pod.Spec.NodeName)) } func invokeInvalidPolicyTestNeg(client clientset.Interface, namespace string, scParameters map[string]string) error { From 2956314cde74f0481be1da6107cc266f56127173 Mon Sep 17 00:00:00 2001 From: Daniel Smith Date: Fri, 17 Nov 2017 15:22:53 -0800 Subject: [PATCH 05/33] add detail to flag help --- .../src/k8s.io/apiserver/pkg/server/options/admission.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/staging/src/k8s.io/apiserver/pkg/server/options/admission.go b/staging/src/k8s.io/apiserver/pkg/server/options/admission.go index c5ed7f9b8a..525e6db75e 100644 --- a/staging/src/k8s.io/apiserver/pkg/server/options/admission.go +++ b/staging/src/k8s.io/apiserver/pkg/server/options/admission.go @@ -66,7 +66,11 @@ func NewAdmissionOptions() 
*AdmissionOptions { // AddFlags adds flags related to admission for a specific APIServer to the specified FlagSet func (a *AdmissionOptions) AddFlags(fs *pflag.FlagSet) { fs.StringSliceVar(&a.PluginNames, "admission-control", a.PluginNames, ""+ - "Ordered list of plug-ins to do admission control of resources into cluster. "+ + "Admission is divided into two phases. "+ + "In the first phase, only mutating admission plugins run. "+ + "In the second phase, only validating admission plugins run. "+ + "The names in the below list may represent a validating plugin, a mutating plugin, or both. "+ + "Within each phase, the plugins will run in the order in which they are passed to this flag. "+ "Comma-delimited list of: "+strings.Join(a.Plugins.Registered(), ", ")+".") fs.StringVar(&a.ConfigFile, "admission-control-config-file", a.ConfigFile, From ca8cffef242d4a57ead2dfb35958ea8ae7d01901 Mon Sep 17 00:00:00 2001 From: Michael Taufen Date: Fri, 17 Nov 2017 17:43:00 -0800 Subject: [PATCH 06/33] seccomp is an alpha feature and not feature gated Move SeccompProfileRoot to KubeletFlags and document flag as alpha --- cmd/kubelet/app/options/options.go | 6 +++++- cmd/kubelet/app/server.go | 9 ++++++--- pkg/kubelet/apis/kubeletconfig/helpers.go | 1 - pkg/kubelet/apis/kubeletconfig/helpers_test.go | 1 - pkg/kubelet/apis/kubeletconfig/types.go | 2 -- pkg/kubelet/apis/kubeletconfig/v1alpha1/defaults.go | 4 ---- pkg/kubelet/apis/kubeletconfig/v1alpha1/types.go | 2 -- .../kubeletconfig/v1alpha1/zz_generated.conversion.go | 2 -- pkg/kubelet/kubelet.go | 8 +++++--- 9 files changed, 16 insertions(+), 19 deletions(-) diff --git a/cmd/kubelet/app/options/options.go b/cmd/kubelet/app/options/options.go index e7d0e8ab7e..9b17a09872 100644 --- a/cmd/kubelet/app/options/options.go +++ b/cmd/kubelet/app/options/options.go @@ -20,6 +20,7 @@ package options import ( "fmt" _ "net/http/pprof" + "path/filepath" "runtime" "strings" @@ -154,6 +155,8 @@ type KubeletFlags struct { // This will cause the kubelet to listen to inotify events on the lock file, // releasing it and exiting when another process tries to open that file. ExitOnLockContention bool + // seccompProfileRoot is the directory path for seccomp profiles. + SeccompProfileRoot string // DEPRECATED FLAGS // minimumGCAge is the minimum age for a finished container before it is @@ -214,6 +217,7 @@ func NewKubeletFlags() *KubeletFlags { NodeLabels: make(map[string]string), VolumePluginDir: "/usr/libexec/kubernetes/kubelet-plugins/volume/exec/", RegisterNode: true, + SeccompProfileRoot: filepath.Join(v1alpha1.DefaultRootDir, "seccomp"), } } @@ -338,6 +342,7 @@ func (f *KubeletFlags) AddFlags(fs *pflag.FlagSet) { fs.StringVar(&f.VolumePluginDir, "volume-plugin-dir", f.VolumePluginDir, " The full path of the directory in which to search for additional third party volume plugins") fs.StringVar(&f.LockFilePath, "lock-file", f.LockFilePath, " The path to file for kubelet to use as a lock file.") fs.BoolVar(&f.ExitOnLockContention, "exit-on-lock-contention", f.ExitOnLockContention, "Whether kubelet should exit upon lock-file contention.") + fs.StringVar(&f.SeccompProfileRoot, "seccomp-profile-root", f.SeccompProfileRoot, " Directory path for seccomp profiles.") // DEPRECATED FLAGS fs.DurationVar(&f.MinimumGCAge.Duration, "minimum-container-ttl-duration", f.MinimumGCAge.Duration, "Minimum age for a finished container before it is garbage collected. 
Examples: '300ms', '10s' or '2h45m'") @@ -405,7 +410,6 @@ func AddKubeletConfigFlags(fs *pflag.FlagSet, c *kubeletconfig.KubeletConfigurat "are generated for the public address and saved to the directory passed to --cert-dir.") fs.StringVar(&c.TLSPrivateKeyFile, "tls-private-key-file", c.TLSPrivateKeyFile, "File containing x509 private key matching --tls-cert-file.") - fs.StringVar(&c.SeccompProfileRoot, "seccomp-profile-root", c.SeccompProfileRoot, "Directory path for seccomp profiles.") fs.BoolVar(&c.AllowPrivileged, "allow-privileged", c.AllowPrivileged, "If true, allow containers to request privileged mode.") fs.StringSliceVar(&c.HostNetworkSources, "host-network-sources", c.HostNetworkSources, "Comma-separated list of sources from which the Kubelet allows pods to use of host network.") fs.StringSliceVar(&c.HostPIDSources, "host-pid-sources", c.HostPIDSources, "Comma-separated list of sources from which the Kubelet allows pods to use the host pid namespace.") diff --git a/cmd/kubelet/app/server.go b/cmd/kubelet/app/server.go index c9534d3506..d9f4685367 100644 --- a/cmd/kubelet/app/server.go +++ b/cmd/kubelet/app/server.go @@ -728,7 +728,8 @@ func RunKubelet(kubeFlags *options.KubeletFlags, kubeCfg *kubeletconfiginternal. kubeFlags.RegisterSchedulable, kubeFlags.NonMasqueradeCIDR, kubeFlags.KeepTerminatedPodVolumes, - kubeFlags.NodeLabels) + kubeFlags.NodeLabels, + kubeFlags.SeccompProfileRoot) if err != nil { return fmt.Errorf("failed to create kubelet: %v", err) } @@ -800,7 +801,8 @@ func CreateAndInitKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration, registerSchedulable bool, nonMasqueradeCIDR string, keepTerminatedPodVolumes bool, - nodeLabels map[string]string) (k kubelet.Bootstrap, err error) { + nodeLabels map[string]string, + seccompProfileRoot string) (k kubelet.Bootstrap, err error) { // TODO: block until all sources have delivered at least one update to the channel, or break the sync loop // up into "per source" synchronizations @@ -832,7 +834,8 @@ func CreateAndInitKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration, registerSchedulable, nonMasqueradeCIDR, keepTerminatedPodVolumes, - nodeLabels) + nodeLabels, + seccompProfileRoot) if err != nil { return nil, err } diff --git a/pkg/kubelet/apis/kubeletconfig/helpers.go b/pkg/kubelet/apis/kubeletconfig/helpers.go index e8b5c43985..392dd4ea2c 100644 --- a/pkg/kubelet/apis/kubeletconfig/helpers.go +++ b/pkg/kubelet/apis/kubeletconfig/helpers.go @@ -25,7 +25,6 @@ func KubeletConfigurationPathRefs(kc *KubeletConfiguration) []*string { paths = append(paths, &kc.Authentication.X509.ClientCAFile) paths = append(paths, &kc.TLSCertFile) paths = append(paths, &kc.TLSPrivateKeyFile) - paths = append(paths, &kc.SeccompProfileRoot) paths = append(paths, &kc.ResolverConfig) return paths } diff --git a/pkg/kubelet/apis/kubeletconfig/helpers_test.go b/pkg/kubelet/apis/kubeletconfig/helpers_test.go index 3cde31ddee..5bf10e67a3 100644 --- a/pkg/kubelet/apis/kubeletconfig/helpers_test.go +++ b/pkg/kubelet/apis/kubeletconfig/helpers_test.go @@ -132,7 +132,6 @@ var ( "Authentication.X509.ClientCAFile", "TLSCertFile", "TLSPrivateKeyFile", - "SeccompProfileRoot", "ResolverConfig", ) diff --git a/pkg/kubelet/apis/kubeletconfig/types.go b/pkg/kubelet/apis/kubeletconfig/types.go index ef44ccde12..3b7f42509a 100644 --- a/pkg/kubelet/apis/kubeletconfig/types.go +++ b/pkg/kubelet/apis/kubeletconfig/types.go @@ -89,8 +89,6 @@ type KubeletConfiguration struct { Authentication KubeletAuthentication // authorization specifies how requests 
to the Kubelet's server are authorized Authorization KubeletAuthorization - // seccompProfileRoot is the directory path for seccomp profiles. - SeccompProfileRoot string // allowPrivileged enables containers to request privileged mode. // Defaults to false. AllowPrivileged bool diff --git a/pkg/kubelet/apis/kubeletconfig/v1alpha1/defaults.go b/pkg/kubelet/apis/kubeletconfig/v1alpha1/defaults.go index e53665a90c..e14cdf3fff 100644 --- a/pkg/kubelet/apis/kubeletconfig/v1alpha1/defaults.go +++ b/pkg/kubelet/apis/kubeletconfig/v1alpha1/defaults.go @@ -17,7 +17,6 @@ limitations under the License. package v1alpha1 import ( - "path/filepath" "time" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -177,9 +176,6 @@ func SetDefaults_KubeletConfiguration(obj *KubeletConfiguration) { if obj.SerializeImagePulls == nil { obj.SerializeImagePulls = boolVar(true) } - if obj.SeccompProfileRoot == "" { - obj.SeccompProfileRoot = filepath.Join(DefaultRootDir, "seccomp") - } if obj.StreamingConnectionIdleTimeout == zeroDuration { obj.StreamingConnectionIdleTimeout = metav1.Duration{Duration: 4 * time.Hour} } diff --git a/pkg/kubelet/apis/kubeletconfig/v1alpha1/types.go b/pkg/kubelet/apis/kubeletconfig/v1alpha1/types.go index 4880b83794..0c661f8a0e 100644 --- a/pkg/kubelet/apis/kubeletconfig/v1alpha1/types.go +++ b/pkg/kubelet/apis/kubeletconfig/v1alpha1/types.go @@ -89,8 +89,6 @@ type KubeletConfiguration struct { Authentication KubeletAuthentication `json:"authentication"` // authorization specifies how requests to the Kubelet's server are authorized Authorization KubeletAuthorization `json:"authorization"` - // seccompProfileRoot is the directory path for seccomp profiles. - SeccompProfileRoot string `json:"seccompProfileRoot"` // allowPrivileged enables containers to request privileged mode. // Defaults to false. 
AllowPrivileged *bool `json:"allowPrivileged"` diff --git a/pkg/kubelet/apis/kubeletconfig/v1alpha1/zz_generated.conversion.go b/pkg/kubelet/apis/kubeletconfig/v1alpha1/zz_generated.conversion.go index d4a086bcf3..b4701bb564 100644 --- a/pkg/kubelet/apis/kubeletconfig/v1alpha1/zz_generated.conversion.go +++ b/pkg/kubelet/apis/kubeletconfig/v1alpha1/zz_generated.conversion.go @@ -163,7 +163,6 @@ func autoConvert_v1alpha1_KubeletConfiguration_To_kubeletconfig_KubeletConfigura if err := Convert_v1alpha1_KubeletAuthorization_To_kubeletconfig_KubeletAuthorization(&in.Authorization, &out.Authorization, s); err != nil { return err } - out.SeccompProfileRoot = in.SeccompProfileRoot if err := v1.Convert_Pointer_bool_To_bool(&in.AllowPrivileged, &out.AllowPrivileged, s); err != nil { return err } @@ -289,7 +288,6 @@ func autoConvert_kubeletconfig_KubeletConfiguration_To_v1alpha1_KubeletConfigura if err := Convert_kubeletconfig_KubeletAuthorization_To_v1alpha1_KubeletAuthorization(&in.Authorization, &out.Authorization, s); err != nil { return err } - out.SeccompProfileRoot = in.SeccompProfileRoot if err := v1.Convert_bool_To_Pointer_bool(&in.AllowPrivileged, &out.AllowPrivileged, s); err != nil { return err } diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index 9b6ed93ee9..68aef5b207 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -218,7 +218,8 @@ type Builder func(kubeCfg *kubeletconfiginternal.KubeletConfiguration, registerSchedulable bool, nonMasqueradeCIDR string, keepTerminatedPodVolumes bool, - nodeLabels map[string]string) (Bootstrap, error) + nodeLabels map[string]string, + seccompProfileRoot string) (Bootstrap, error) // Dependencies is a bin for things we might consider "injected dependencies" -- objects constructed // at runtime that are necessary for running the Kubelet. 
This is a temporary solution for grouping @@ -344,7 +345,8 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration, registerSchedulable bool, nonMasqueradeCIDR string, keepTerminatedPodVolumes bool, - nodeLabels map[string]string) (*Kubelet, error) { + nodeLabels map[string]string, + seccompProfileRoot string) (*Kubelet, error) { if rootDirectory == "" { return nil, fmt.Errorf("invalid root directory %q", rootDirectory) } @@ -658,7 +660,7 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration, runtime, err := kuberuntime.NewKubeGenericRuntimeManager( kubecontainer.FilterEventRecorder(kubeDeps.Recorder), klet.livenessManager, - kubeCfg.SeccompProfileRoot, + seccompProfileRoot, containerRefManager, machineInfo, klet, From 1f34863b9e2422638e5e55faca6c916d41c96f9b Mon Sep 17 00:00:00 2001 From: Benjamin Elder Date: Sat, 18 Nov 2017 15:23:17 -0800 Subject: [PATCH 07/33] fix cadvisor.New signature for cross build --- pkg/kubelet/cadvisor/cadvisor_unsupported.go | 2 +- pkg/kubelet/cadvisor/cadvisor_windows.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/kubelet/cadvisor/cadvisor_unsupported.go b/pkg/kubelet/cadvisor/cadvisor_unsupported.go index 0b653e69e6..f1ae9486d8 100644 --- a/pkg/kubelet/cadvisor/cadvisor_unsupported.go +++ b/pkg/kubelet/cadvisor/cadvisor_unsupported.go @@ -31,7 +31,7 @@ type cadvisorUnsupported struct { var _ Interface = new(cadvisorUnsupported) -func New(address string, port uint, imageFsInfoProvider ImageFsInfoProvider, rootPath string) (Interface, error) { +func New(address string, port uint, imageFsInfoProvider ImageFsInfoProvider, rootPath string, usingLegacyStats bool) (Interface, error) { return &cadvisorUnsupported{}, nil } diff --git a/pkg/kubelet/cadvisor/cadvisor_windows.go b/pkg/kubelet/cadvisor/cadvisor_windows.go index 52d3cd5605..6ce1f8d1b3 100644 --- a/pkg/kubelet/cadvisor/cadvisor_windows.go +++ b/pkg/kubelet/cadvisor/cadvisor_windows.go @@ -32,7 +32,7 @@ type cadvisorClient struct { var _ Interface = new(cadvisorClient) // New creates a cAdvisor and exports its API on the specified port if port > 0. -func New(address string, port uint, imageFsInfoProvider ImageFsInfoProvider, rootPath string) (Interface, error) { +func New(address string, port uint, imageFsInfoProvider ImageFsInfoProvider, rootPath string, usingLegacyStats bool) (Interface, error) { client, err := winstats.NewPerfCounterClient() return &cadvisorClient{winStatsClient: client}, err } From 3c4c85f2125144bf8a20677afee1e14ed528b26c Mon Sep 17 00:00:00 2001 From: Rohit Agarwal Date: Wed, 15 Nov 2017 19:02:48 -0800 Subject: [PATCH 08/33] Add ExtendedResourceToleration admission controller. 
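As a rough illustration of the behavior (a stand-alone sketch written against the public
k8s.io/api/core/v1 types, not the plugin source: the helper name tolerateExtendedResources
and the explicit resourceNames argument are assumptions for this example, the real plugin
derives the extended resource names from the pod's container requests and limits, and this
sketch appends unconditionally instead of checking for an existing equivalent toleration):

    package main

    import (
        "fmt"

        "k8s.io/api/core/v1"
        "k8s.io/apimachinery/pkg/api/resource"
    )

    // tolerateExtendedResources sketches the controller's effect: for each extended
    // resource name requested by the pod, add a toleration matching a NoSchedule
    // taint with that key.
    func tolerateExtendedResources(pod *v1.Pod, resourceNames []string) {
        for _, name := range resourceNames {
            pod.Spec.Tolerations = append(pod.Spec.Tolerations, v1.Toleration{
                Key:      name,
                Operator: v1.TolerationOpExists,
                Effect:   v1.TaintEffectNoSchedule,
            })
        }
    }

    func main() {
        pod := &v1.Pod{Spec: v1.PodSpec{Containers: []v1.Container{{
            Name: "device-user",
            Resources: v1.ResourceRequirements{Limits: v1.ResourceList{
                "example.com/device": resource.MustParse("1"),
            }},
        }}}}
        tolerateExtendedResources(pod, []string{"example.com/device"})
        // Prints one toleration: key "example.com/device", operator Exists, effect NoSchedule.
        fmt.Printf("%+v\n", pod.Spec.Tolerations)
    }

With such a toleration in place, a pod requesting "example.com/device" can land on nodes
that an operator has tainted with that key and the NoSchedule effect, without every pod
author having to add the toleration by hand.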
--- cmd/kube-apiserver/app/options/BUILD | 1 + cmd/kube-apiserver/app/options/plugins.go | 2 + plugin/BUILD | 1 + .../extendedresourcetoleration/BUILD | 42 ++ .../extendedresourcetoleration/admission.go | 94 +++++ .../admission_test.go | 382 ++++++++++++++++++ 6 files changed, 522 insertions(+) create mode 100644 plugin/pkg/admission/extendedresourcetoleration/BUILD create mode 100644 plugin/pkg/admission/extendedresourcetoleration/admission.go create mode 100644 plugin/pkg/admission/extendedresourcetoleration/admission_test.go diff --git a/cmd/kube-apiserver/app/options/BUILD b/cmd/kube-apiserver/app/options/BUILD index 6d7c32772c..2ad00991da 100644 --- a/cmd/kube-apiserver/app/options/BUILD +++ b/cmd/kube-apiserver/app/options/BUILD @@ -30,6 +30,7 @@ go_library( "//plugin/pkg/admission/deny:go_default_library", "//plugin/pkg/admission/eventratelimit:go_default_library", "//plugin/pkg/admission/exec:go_default_library", + "//plugin/pkg/admission/extendedresourcetoleration:go_default_library", "//plugin/pkg/admission/gc:go_default_library", "//plugin/pkg/admission/imagepolicy:go_default_library", "//plugin/pkg/admission/initialresources:go_default_library", diff --git a/cmd/kube-apiserver/app/options/plugins.go b/cmd/kube-apiserver/app/options/plugins.go index 6d899d3b07..30ed306485 100644 --- a/cmd/kube-apiserver/app/options/plugins.go +++ b/cmd/kube-apiserver/app/options/plugins.go @@ -32,6 +32,7 @@ import ( "k8s.io/kubernetes/plugin/pkg/admission/deny" "k8s.io/kubernetes/plugin/pkg/admission/eventratelimit" "k8s.io/kubernetes/plugin/pkg/admission/exec" + "k8s.io/kubernetes/plugin/pkg/admission/extendedresourcetoleration" "k8s.io/kubernetes/plugin/pkg/admission/gc" "k8s.io/kubernetes/plugin/pkg/admission/imagepolicy" "k8s.io/kubernetes/plugin/pkg/admission/initialresources" @@ -61,6 +62,7 @@ func RegisterAllAdmissionPlugins(plugins *admission.Plugins) { deny.Register(plugins) eventratelimit.Register(plugins) exec.Register(plugins) + extendedresourcetoleration.Register(plugins) gc.Register(plugins) imagepolicy.Register(plugins) initialresources.Register(plugins) diff --git a/plugin/BUILD b/plugin/BUILD index 843efb9c3e..764a8e3283 100644 --- a/plugin/BUILD +++ b/plugin/BUILD @@ -19,6 +19,7 @@ filegroup( "//plugin/pkg/admission/deny:all-srcs", "//plugin/pkg/admission/eventratelimit:all-srcs", "//plugin/pkg/admission/exec:all-srcs", + "//plugin/pkg/admission/extendedresourcetoleration:all-srcs", "//plugin/pkg/admission/gc:all-srcs", "//plugin/pkg/admission/imagepolicy:all-srcs", "//plugin/pkg/admission/initialresources:all-srcs", diff --git a/plugin/pkg/admission/extendedresourcetoleration/BUILD b/plugin/pkg/admission/extendedresourcetoleration/BUILD new file mode 100644 index 0000000000..882b966cd7 --- /dev/null +++ b/plugin/pkg/admission/extendedresourcetoleration/BUILD @@ -0,0 +1,42 @@ +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") + +go_library( + name = "go_default_library", + srcs = ["admission.go"], + importpath = "k8s.io/kubernetes/plugin/pkg/admission/extendedresourcetoleration", + visibility = ["//visibility:public"], + deps = [ + "//pkg/apis/core:go_default_library", + "//pkg/apis/core/helper:go_default_library", + "//vendor/k8s.io/apimachinery/pkg/api/errors:go_default_library", + "//vendor/k8s.io/apimachinery/pkg/util/sets:go_default_library", + "//vendor/k8s.io/apiserver/pkg/admission:go_default_library", + ], +) + +go_test( + name = "go_default_test", + srcs = ["admission_test.go"], + importpath = 
"k8s.io/kubernetes/plugin/pkg/admission/extendedresourcetoleration", + library = ":go_default_library", + deps = [ + "//pkg/apis/core:go_default_library", + "//pkg/apis/core/helper:go_default_library", + "//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library", + "//vendor/k8s.io/apiserver/pkg/admission:go_default_library", + ], +) + +filegroup( + name = "package-srcs", + srcs = glob(["**"]), + tags = ["automanaged"], + visibility = ["//visibility:private"], +) + +filegroup( + name = "all-srcs", + srcs = [":package-srcs"], + tags = ["automanaged"], + visibility = ["//visibility:public"], +) diff --git a/plugin/pkg/admission/extendedresourcetoleration/admission.go b/plugin/pkg/admission/extendedresourcetoleration/admission.go new file mode 100644 index 0000000000..410c18160b --- /dev/null +++ b/plugin/pkg/admission/extendedresourcetoleration/admission.go @@ -0,0 +1,94 @@ +/* +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package extendedresourcetoleration + +import ( + "fmt" + "io" + + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/apiserver/pkg/admission" + "k8s.io/kubernetes/pkg/apis/core" + "k8s.io/kubernetes/pkg/apis/core/helper" +) + +// Register is called by the apiserver to register the plugin factory. +func Register(plugins *admission.Plugins) { + plugins.Register("ExtendedResourceToleration", func(config io.Reader) (admission.Interface, error) { + return newExtendedResourceToleration(), nil + }) +} + +// newExtendedResourceToleration creates a new instance of the ExtendedResourceToleration admission controller. +func newExtendedResourceToleration() *plugin { + return &plugin{ + Handler: admission.NewHandler(admission.Create, admission.Update), + } +} + +// Make sure we are implementing the interface. +var _ admission.MutationInterface = &plugin{} + +type plugin struct { + *admission.Handler +} + +// Admit updates the toleration of a pod based on the resources requested by it. +// If an extended resource of name "example.com/device" is requested, it adds +// a toleration with key "example.com/device", operator "Exists" and effect "NoSchedule". +// The rationale for this is described in: +// https://github.com/kubernetes/kubernetes/issues/55080 +func (p *plugin) Admit(attributes admission.Attributes) error { + // Ignore all calls to subresources or resources other than pods. 
+ if len(attributes.GetSubresource()) != 0 || attributes.GetResource().GroupResource() != core.Resource("pods") { + return nil + } + + pod, ok := attributes.GetObject().(*core.Pod) + if !ok { + return errors.NewBadRequest(fmt.Sprintf("expected *core.Pod but got %T", attributes.GetObject())) + } + + resources := sets.String{} + for _, container := range pod.Spec.Containers { + for resourceName := range container.Resources.Requests { + if helper.IsExtendedResourceName(resourceName) { + resources.Insert(string(resourceName)) + } + } + } + for _, container := range pod.Spec.InitContainers { + for resourceName := range container.Resources.Requests { + if helper.IsExtendedResourceName(resourceName) { + resources.Insert(string(resourceName)) + } + } + } + + // Doing .List() so that we get a stable sorted list. + // This allows us to test adding tolerations for multiple extended resources. + for _, resource := range resources.List() { + helper.AddOrUpdateTolerationInPod(pod, &core.Toleration{ + Key: resource, + Operator: core.TolerationOpExists, + Effect: core.TaintEffectNoSchedule, + }) + } + + return nil +} diff --git a/plugin/pkg/admission/extendedresourcetoleration/admission_test.go b/plugin/pkg/admission/extendedresourcetoleration/admission_test.go new file mode 100644 index 0000000000..646ae007e1 --- /dev/null +++ b/plugin/pkg/admission/extendedresourcetoleration/admission_test.go @@ -0,0 +1,382 @@ +/* +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package extendedresourcetoleration + +import ( + "testing" + + "k8s.io/apimachinery/pkg/api/resource" + "k8s.io/apiserver/pkg/admission" + "k8s.io/kubernetes/pkg/apis/core" + "k8s.io/kubernetes/pkg/apis/core/helper" +) + +func TestAdmit(t *testing.T) { + + plugin := newExtendedResourceToleration() + + containerRequestingCPU := core.Container{ + Resources: core.ResourceRequirements{ + Requests: core.ResourceList{ + core.ResourceCPU: *resource.NewQuantity(2, resource.DecimalSI), + }, + }, + } + + containerRequestingMemory := core.Container{ + Resources: core.ResourceRequirements{ + Requests: core.ResourceList{ + core.ResourceMemory: *resource.NewQuantity(2048, resource.DecimalSI), + }, + }, + } + + extendedResource1 := "example.com/device-ek" + extendedResource2 := "example.com/device-do" + + containerRequestingExtendedResource1 := core.Container{ + Resources: core.ResourceRequirements{ + Requests: core.ResourceList{ + core.ResourceName(extendedResource1): *resource.NewQuantity(1, resource.DecimalSI), + }, + }, + } + containerRequestingExtendedResource2 := core.Container{ + Resources: core.ResourceRequirements{ + Requests: core.ResourceList{ + core.ResourceName(extendedResource2): *resource.NewQuantity(2, resource.DecimalSI), + }, + }, + } + + tests := []struct { + description string + requestedPod core.Pod + expectedPod core.Pod + }{ + { + description: "empty pod without any extended resources, expect no change in tolerations", + requestedPod: core.Pod{ + Spec: core.PodSpec{}, + }, + expectedPod: core.Pod{ + Spec: core.PodSpec{}, + }, + }, + { + description: "pod with container without any extended resources, expect no change in tolerations", + requestedPod: core.Pod{ + Spec: core.PodSpec{ + Containers: []core.Container{ + containerRequestingCPU, + }, + }, + }, + expectedPod: core.Pod{ + Spec: core.PodSpec{ + Containers: []core.Container{ + containerRequestingCPU, + }, + }, + }, + }, + { + description: "pod with init container without any extended resources, expect no change in tolerations", + requestedPod: core.Pod{ + Spec: core.PodSpec{ + InitContainers: []core.Container{ + containerRequestingMemory, + }, + }, + }, + expectedPod: core.Pod{ + Spec: core.PodSpec{ + InitContainers: []core.Container{ + containerRequestingMemory, + }, + }, + }, + }, + { + description: "pod with container with extended resource, expect toleration to be added", + requestedPod: core.Pod{ + Spec: core.PodSpec{ + Containers: []core.Container{ + containerRequestingExtendedResource1, + }, + }, + }, + expectedPod: core.Pod{ + Spec: core.PodSpec{ + Containers: []core.Container{ + containerRequestingExtendedResource1, + }, + Tolerations: []core.Toleration{ + { + Key: extendedResource1, + Operator: core.TolerationOpExists, + Effect: core.TaintEffectNoSchedule, + }, + }, + }, + }, + }, + { + description: "pod with init container with extended resource, expect toleration to be added", + requestedPod: core.Pod{ + Spec: core.PodSpec{ + InitContainers: []core.Container{ + containerRequestingExtendedResource2, + }, + }, + }, + expectedPod: core.Pod{ + Spec: core.PodSpec{ + InitContainers: []core.Container{ + containerRequestingExtendedResource2, + }, + Tolerations: []core.Toleration{ + { + Key: extendedResource2, + Operator: core.TolerationOpExists, + Effect: core.TaintEffectNoSchedule, + }, + }, + }, + }, + }, + { + description: "pod with existing tolerations and container with extended resource, expect existing tolerations to be preserved and new toleration to be added", + requestedPod: core.Pod{ + Spec: core.PodSpec{ 
+ Containers: []core.Container{ + containerRequestingCPU, + containerRequestingExtendedResource1, + }, + Tolerations: []core.Toleration{ + { + Key: "foo", + Operator: core.TolerationOpEqual, + Value: "bar", + Effect: core.TaintEffectNoSchedule, + }, + }, + }, + }, + expectedPod: core.Pod{ + Spec: core.PodSpec{ + Containers: []core.Container{ + containerRequestingCPU, + containerRequestingExtendedResource1, + }, + Tolerations: []core.Toleration{ + { + Key: "foo", + Operator: core.TolerationOpEqual, + Value: "bar", + Effect: core.TaintEffectNoSchedule, + }, + { + Key: extendedResource1, + Operator: core.TolerationOpExists, + Effect: core.TaintEffectNoSchedule, + }, + }, + }, + }, + }, + { + description: "pod with multiple extended resources, expect multiple tolerations to be added", + requestedPod: core.Pod{ + Spec: core.PodSpec{ + Containers: []core.Container{ + containerRequestingMemory, + containerRequestingExtendedResource1, + }, + InitContainers: []core.Container{ + containerRequestingCPU, + containerRequestingExtendedResource2, + }, + }, + }, + expectedPod: core.Pod{ + Spec: core.PodSpec{ + Containers: []core.Container{ + containerRequestingMemory, + containerRequestingExtendedResource1, + }, + InitContainers: []core.Container{ + containerRequestingCPU, + containerRequestingExtendedResource2, + }, + Tolerations: []core.Toleration{ + // Note the order, it's sorted by the Key + { + Key: extendedResource2, + Operator: core.TolerationOpExists, + Effect: core.TaintEffectNoSchedule, + }, + { + Key: extendedResource1, + Operator: core.TolerationOpExists, + Effect: core.TaintEffectNoSchedule, + }, + }, + }, + }, + }, + { + description: "pod with container requesting extended resource and existing correct toleration, expect no change in tolerations", + requestedPod: core.Pod{ + Spec: core.PodSpec{ + Containers: []core.Container{ + containerRequestingCPU, + containerRequestingMemory, + containerRequestingExtendedResource1, + }, + Tolerations: []core.Toleration{ + { + Key: extendedResource1, + Operator: core.TolerationOpExists, + Effect: core.TaintEffectNoSchedule, + }, + }, + }, + }, + expectedPod: core.Pod{ + Spec: core.PodSpec{ + Containers: []core.Container{ + containerRequestingCPU, + containerRequestingMemory, + containerRequestingExtendedResource1, + }, + Tolerations: []core.Toleration{ + { + Key: extendedResource1, + Operator: core.TolerationOpExists, + Effect: core.TaintEffectNoSchedule, + }, + }, + }, + }, + }, + { + description: "pod with container requesting extended resource and existing toleration with the same key but different effect and value, expect existing tolerations to be preserved and new toleration to be added", + requestedPod: core.Pod{ + Spec: core.PodSpec{ + Containers: []core.Container{ + containerRequestingCPU, + containerRequestingMemory, + containerRequestingExtendedResource1, + }, + Tolerations: []core.Toleration{ + { + Key: extendedResource1, + Operator: core.TolerationOpEqual, + Value: "foo", + Effect: core.TaintEffectNoExecute, + }, + }, + }, + }, + expectedPod: core.Pod{ + Spec: core.PodSpec{ + Containers: []core.Container{ + containerRequestingCPU, + containerRequestingMemory, + containerRequestingExtendedResource1, + }, + Tolerations: []core.Toleration{ + { + Key: extendedResource1, + Operator: core.TolerationOpEqual, + Value: "foo", + Effect: core.TaintEffectNoExecute, + }, + { + Key: extendedResource1, + Operator: core.TolerationOpExists, + Effect: core.TaintEffectNoSchedule, + }, + }, + }, + }, + }, + { + description: "pod with wildcard toleration and 
container requesting extended resource, expect existing tolerations to be preserved and new toleration to be added", + requestedPod: core.Pod{ + Spec: core.PodSpec{ + Containers: []core.Container{ + containerRequestingCPU, + containerRequestingMemory, + containerRequestingExtendedResource1, + }, + Tolerations: []core.Toleration{ + { + Operator: core.TolerationOpExists, + }, + }, + }, + }, + expectedPod: core.Pod{ + Spec: core.PodSpec{ + Containers: []core.Container{ + containerRequestingCPU, + containerRequestingMemory, + containerRequestingExtendedResource1, + }, + Tolerations: []core.Toleration{ + { + Operator: core.TolerationOpExists, + }, + { + Key: extendedResource1, + Operator: core.TolerationOpExists, + Effect: core.TaintEffectNoSchedule, + }, + }, + }, + }, + }, + } + for i, test := range tests { + err := plugin.Admit(admission.NewAttributesRecord(&test.requestedPod, nil, core.Kind("Pod").WithVersion("version"), "foo", "name", core.Resource("pods").WithVersion("version"), "", "ignored", nil)) + if err != nil { + t.Errorf("[%d: %s] unexpected error %v for pod %+v", i, test.description, err, test.requestedPod) + } + + if !helper.Semantic.DeepEqual(test.expectedPod.Spec.Tolerations, test.requestedPod.Spec.Tolerations) { + t.Errorf("[%d: %s] expected %#v got %#v", i, test.description, test.expectedPod.Spec.Tolerations, test.requestedPod.Spec.Tolerations) + } + } +} + +func TestHandles(t *testing.T) { + plugin := newExtendedResourceToleration() + tests := map[admission.Operation]bool{ + admission.Create: true, + admission.Update: true, + admission.Delete: false, + admission.Connect: false, + } + for op, expected := range tests { + result := plugin.Handles(op) + if result != expected { + t.Errorf("Unexpected result for operation %s: %v\n", op, result) + } + } +} From 3592c1be18e56c960cbc384778046d28f539f64c Mon Sep 17 00:00:00 2001 From: wackxu Date: Mon, 20 Nov 2017 10:50:09 +0800 Subject: [PATCH 09/33] Improve kubeadm apply error logging style --- cmd/kubeadm/app/cmd/upgrade/apply.go | 6 +++--- cmd/kubeadm/app/util/error.go | 9 ++++++++ cmd/kubeadm/app/util/error_test.go | 31 ++++++++++++++++++++++++++++ 3 files changed, 43 insertions(+), 3 deletions(-) diff --git a/cmd/kubeadm/app/cmd/upgrade/apply.go b/cmd/kubeadm/app/cmd/upgrade/apply.go index b36ddf23c1..75668e36ad 100644 --- a/cmd/kubeadm/app/cmd/upgrade/apply.go +++ b/cmd/kubeadm/app/cmd/upgrade/apply.go @@ -202,16 +202,16 @@ func EnforceVersionPolicies(flags *applyFlags, versionGetter upgrade.VersionGett if versionSkewErrs != nil { if len(versionSkewErrs.Mandatory) > 0 { - return fmt.Errorf("The --version argument is invalid due to these fatal errors: %v", versionSkewErrs.Mandatory) + return fmt.Errorf("The --version argument is invalid due to these fatal errors:\n\n%v\nPlease fix the misalignments highlighted above and try upgrading again", kubeadmutil.FormatErrMsg(versionSkewErrs.Mandatory)) } if len(versionSkewErrs.Skippable) > 0 { // Return the error if the user hasn't specified the --force flag if !flags.force { - return fmt.Errorf("The --version argument is invalid due to these errors: %v. 
Can be bypassed if you pass the --force flag", versionSkewErrs.Skippable) + return fmt.Errorf("The --version argument is invalid due to these errors:\n\n%v\nCan be bypassed if you pass the --force flag", kubeadmutil.FormatErrMsg(versionSkewErrs.Skippable)) } // Soft errors found, but --force was specified - fmt.Printf("[upgrade/version] Found %d potential version compatibility errors but skipping since the --force flag is set: %v\n", len(versionSkewErrs.Skippable), versionSkewErrs.Skippable) + fmt.Printf("[upgrade/version] Found %d potential version compatibility errors but skipping since the --force flag is set: \n\n%v", len(versionSkewErrs.Skippable), kubeadmutil.FormatErrMsg(versionSkewErrs.Skippable)) } } return nil diff --git a/cmd/kubeadm/app/util/error.go b/cmd/kubeadm/app/util/error.go index 32cb615019..c27ab860d7 100644 --- a/cmd/kubeadm/app/util/error.go +++ b/cmd/kubeadm/app/util/error.go @@ -75,3 +75,12 @@ func checkErr(prefix string, err error, handleErr func(string, int)) { handleErr(err.Error(), DefaultErrorExitCode) } } + +// FormatErrMsg returns a human-readable string describing the slice of errors passed to the function +func FormatErrMsg(errs []error) string { + var errMsg string + for _, err := range errs { + errMsg = fmt.Sprintf("%s\t-%s\n", errMsg, err.Error()) + } + return errMsg +} diff --git a/cmd/kubeadm/app/util/error_test.go b/cmd/kubeadm/app/util/error_test.go index 07aae6c335..831007550a 100644 --- a/cmd/kubeadm/app/util/error_test.go +++ b/cmd/kubeadm/app/util/error_test.go @@ -50,3 +50,34 @@ func TestCheckErr(t *testing.T) { } } } + +func TestFormatErrMsg(t *testing.T) { + errMsg1 := "specified version to upgrade to v1.9.0-alpha.3 is equal to or lower than the cluster version v1.10.0-alpha.0.69+638add6ddfb6d2. Downgrades are not supported yet" + errMsg2 := "specified version to upgrade to v1.9.0-alpha.3 is higher than the kubeadm version v1.9.0-alpha.1.3121+84178212527295-dirty. Upgrade kubeadm first using the tool you used to install kubeadm" + + testCases := []struct { + errs []error + expect string + }{ + { + errs: []error{ + fmt.Errorf(errMsg1), + fmt.Errorf(errMsg2), + }, + expect: "\t-" + errMsg1 + "\n" + "\t-" + errMsg2 + "\n", + }, + { + errs: []error{ + fmt.Errorf(errMsg1), + }, + expect: "\t-" + errMsg1 + "\n", + }, + } + + for _, testCase := range testCases { + got := FormatErrMsg(testCase.errs) + if got != testCase.expect { + t.Errorf("FormatErrMsg error, expect: %v, got: %v", testCase.expect, got) + } + } +} From 35138acfdf6b133894826c5237c0ebbb53b369f3 Mon Sep 17 00:00:00 2001 From: xiangpengzhao Date: Mon, 20 Nov 2017 23:54:03 +0800 Subject: [PATCH 10/33] Add condition "len(cfg.DiscoveryToken) != 0" to ValidateArgSelection. 
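
With this change the "unsafe token discovery" validation only fires when token-based discovery is actually in use: a NodeConfiguration that provides neither a DiscoveryFile nor a DiscoveryToken no longer trips this particular error (a config with no discovery source at all is presumably rejected by a separate check). The sketch below is for illustration only; nodeConfiguration and usesUnverifiedTokenDiscovery are stand-in names invented here, not the real kubeadm.NodeConfiguration or validation function.

// Illustration only: the shape of the guard after this patch.
package main

import "fmt"

type nodeConfiguration struct {
	DiscoveryFile                          string
	DiscoveryToken                         string
	DiscoveryTokenCACertHashes             []string
	DiscoveryTokenUnsafeSkipCAVerification bool
}

// usesUnverifiedTokenDiscovery reports whether the config relies on
// token-based discovery without pinning the CA, which the validation
// rejects unless --discovery-token-unsafe-skip-ca-verification is set.
func usesUnverifiedTokenDiscovery(cfg nodeConfiguration) bool {
	return len(cfg.DiscoveryFile) == 0 &&
		len(cfg.DiscoveryToken) != 0 && // the condition added by this patch
		len(cfg.DiscoveryTokenCACertHashes) == 0 &&
		!cfg.DiscoveryTokenUnsafeSkipCAVerification
}

func main() {
	// No token in use: the check no longer fires.
	fmt.Println(usesUnverifiedTokenDiscovery(nodeConfiguration{})) // false
	// Token discovery without CA pinning: still flagged.
	fmt.Println(usesUnverifiedTokenDiscovery(nodeConfiguration{DiscoveryToken: "abcdef.0123456789abcdef"})) // true
}
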
--- cmd/kubeadm/app/apis/kubeadm/validation/validation.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cmd/kubeadm/app/apis/kubeadm/validation/validation.go b/cmd/kubeadm/app/apis/kubeadm/validation/validation.go index e54416e8b0..5ef679c502 100644 --- a/cmd/kubeadm/app/apis/kubeadm/validation/validation.go +++ b/cmd/kubeadm/app/apis/kubeadm/validation/validation.go @@ -140,7 +140,8 @@ func ValidateArgSelection(cfg *kubeadm.NodeConfiguration, fldPath *field.Path) f allErrs = append(allErrs, field.Invalid(fldPath, "", "DiscoveryTokenCACertHashes cannot be used with DiscoveryFile")) } - if len(cfg.DiscoveryFile) == 0 && len(cfg.DiscoveryTokenCACertHashes) == 0 && !cfg.DiscoveryTokenUnsafeSkipCAVerification { + if len(cfg.DiscoveryFile) == 0 && len(cfg.DiscoveryToken) != 0 && + len(cfg.DiscoveryTokenCACertHashes) == 0 && !cfg.DiscoveryTokenUnsafeSkipCAVerification { allErrs = append(allErrs, field.Invalid(fldPath, "", "using token-based discovery without DiscoveryTokenCACertHashes can be unsafe. set --discovery-token-unsafe-skip-ca-verification to continue")) } From edfb2ad55293841abd3cbabe45fa162499831a7a Mon Sep 17 00:00:00 2001 From: Jingtao Ren Date: Tue, 14 Nov 2017 17:39:55 -0800 Subject: [PATCH 11/33] Azure load balancer general improvement --- pkg/cloudprovider/providers/azure/BUILD | 1 + pkg/cloudprovider/providers/azure/azure.go | 142 +++- .../providers/azure/azure_backoff.go | 205 ++++- .../providers/azure/azure_fakes.go | 584 +++++++++++++ .../providers/azure/azure_instances.go | 33 - .../providers/azure/azure_loadbalancer.go | 793 +++++++++--------- .../providers/azure/azure_loadbalancer.md | 68 ++ .../providers/azure/azure_test.go | 785 ++++++++++++++--- .../providers/azure/azure_util.go | 212 ++++- .../providers/azure/azure_wrap.go | 33 +- 10 files changed, 2240 insertions(+), 616 deletions(-) create mode 100644 pkg/cloudprovider/providers/azure/azure_fakes.go create mode 100644 pkg/cloudprovider/providers/azure/azure_loadbalancer.md diff --git a/pkg/cloudprovider/providers/azure/BUILD b/pkg/cloudprovider/providers/azure/BUILD index 8aa8da13c1..c1d7bb6d73 100644 --- a/pkg/cloudprovider/providers/azure/BUILD +++ b/pkg/cloudprovider/providers/azure/BUILD @@ -13,6 +13,7 @@ go_library( "azure_backoff.go", "azure_blobDiskController.go", "azure_controllerCommon.go", + "azure_fakes.go", "azure_file.go", "azure_instance_metadata.go", "azure_instances.go", diff --git a/pkg/cloudprovider/providers/azure/azure.go b/pkg/cloudprovider/providers/azure/azure.go index ee9ebf352f..a7cff34e77 100644 --- a/pkg/cloudprovider/providers/azure/azure.go +++ b/pkg/cloudprovider/providers/azure/azure.go @@ -44,13 +44,14 @@ import ( const ( // CloudProviderName is the value used for the --cloud-provider flag - CloudProviderName = "azure" - rateLimitQPSDefault = 1.0 - rateLimitBucketDefault = 5 - backoffRetriesDefault = 6 - backoffExponentDefault = 1.5 - backoffDurationDefault = 5 // in seconds - backoffJitterDefault = 1.0 + CloudProviderName = "azure" + rateLimitQPSDefault = 1.0 + rateLimitBucketDefault = 5 + backoffRetriesDefault = 6 + backoffExponentDefault = 1.5 + backoffDurationDefault = 5 // in seconds + backoffJitterDefault = 1.0 + maximumLoadBalancerRuleCount = 148 // According to Azure LB rule default limit ) // Config holds the configuration parsed from the --cloud-config flag @@ -113,6 +114,51 @@ type Config struct { // Use managed service identity for the virtual machine to access Azure ARM APIs UseManagedIdentityExtension bool `json:"useManagedIdentityExtension"` + + // 
Maximum allowed LoadBalancer Rule Count is the limit enforced by Azure Load balancer + MaximumLoadBalancerRuleCount int `json:"maximumLoadBalancerRuleCount"` +} + +type iVirtualMachinesClient interface { + CreateOrUpdate(resourceGroupName string, VMName string, parameters compute.VirtualMachine, cancel <-chan struct{}) (<-chan compute.VirtualMachine, <-chan error) + Get(resourceGroupName string, VMName string, expand compute.InstanceViewTypes) (result compute.VirtualMachine, err error) + List(resourceGroupName string) (result compute.VirtualMachineListResult, err error) + ListNextResults(lastResults compute.VirtualMachineListResult) (result compute.VirtualMachineListResult, err error) +} + +type iInterfacesClient interface { + CreateOrUpdate(resourceGroupName string, networkInterfaceName string, parameters network.Interface, cancel <-chan struct{}) (<-chan network.Interface, <-chan error) + Get(resourceGroupName string, networkInterfaceName string, expand string) (result network.Interface, err error) +} + +type iLoadBalancersClient interface { + CreateOrUpdate(resourceGroupName string, loadBalancerName string, parameters network.LoadBalancer, cancel <-chan struct{}) (<-chan network.LoadBalancer, <-chan error) + Delete(resourceGroupName string, loadBalancerName string, cancel <-chan struct{}) (<-chan autorest.Response, <-chan error) + Get(resourceGroupName string, loadBalancerName string, expand string) (result network.LoadBalancer, err error) + List(resourceGroupName string) (result network.LoadBalancerListResult, err error) + ListNextResults(lastResult network.LoadBalancerListResult) (result network.LoadBalancerListResult, err error) +} + +type iPublicIPAddressesClient interface { + CreateOrUpdate(resourceGroupName string, publicIPAddressName string, parameters network.PublicIPAddress, cancel <-chan struct{}) (<-chan network.PublicIPAddress, <-chan error) + Delete(resourceGroupName string, publicIPAddressName string, cancel <-chan struct{}) (<-chan autorest.Response, <-chan error) + Get(resourceGroupName string, publicIPAddressName string, expand string) (result network.PublicIPAddress, err error) + List(resourceGroupName string) (result network.PublicIPAddressListResult, err error) + ListNextResults(lastResults network.PublicIPAddressListResult) (result network.PublicIPAddressListResult, err error) +} + +type iSubnetsClient interface { + CreateOrUpdate(resourceGroupName string, virtualNetworkName string, subnetName string, subnetParameters network.Subnet, cancel <-chan struct{}) (<-chan network.Subnet, <-chan error) + Delete(resourceGroupName string, virtualNetworkName string, subnetName string, cancel <-chan struct{}) (<-chan autorest.Response, <-chan error) + Get(resourceGroupName string, virtualNetworkName string, subnetName string, expand string) (result network.Subnet, err error) + List(resourceGroupName string, virtualNetworkName string) (result network.SubnetListResult, err error) +} + +type iSecurityGroupsClient interface { + CreateOrUpdate(resourceGroupName string, networkSecurityGroupName string, parameters network.SecurityGroup, cancel <-chan struct{}) (<-chan network.SecurityGroup, <-chan error) + Delete(resourceGroupName string, networkSecurityGroupName string, cancel <-chan struct{}) (<-chan autorest.Response, <-chan error) + Get(resourceGroupName string, networkSecurityGroupName string, expand string) (result network.SecurityGroup, err error) + List(resourceGroupName string) (result network.SecurityGroupListResult, err error) } // Cloud holds the config and clients @@ 
-120,13 +166,13 @@ type Cloud struct { Config Environment azure.Environment RoutesClient network.RoutesClient - SubnetsClient network.SubnetsClient - InterfacesClient network.InterfacesClient + SubnetsClient iSubnetsClient + InterfacesClient iInterfacesClient RouteTablesClient network.RouteTablesClient - LoadBalancerClient network.LoadBalancersClient - PublicIPAddressesClient network.PublicIPAddressesClient - SecurityGroupsClient network.SecurityGroupsClient - VirtualMachinesClient compute.VirtualMachinesClient + LoadBalancerClient iLoadBalancersClient + PublicIPAddressesClient iPublicIPAddressesClient + SecurityGroupsClient iSecurityGroupsClient + VirtualMachinesClient iVirtualMachinesClient StorageAccountClient storage.AccountsClient DisksClient disk.DisksClient operationPollRateLimiter flowcontrol.RateLimiter @@ -221,11 +267,12 @@ func NewCloud(configReader io.Reader) (cloudprovider.Interface, error) { return nil, err } - az.SubnetsClient = network.NewSubnetsClient(az.SubscriptionID) - az.SubnetsClient.BaseURI = az.Environment.ResourceManagerEndpoint - az.SubnetsClient.Authorizer = autorest.NewBearerAuthorizer(servicePrincipalToken) - az.SubnetsClient.PollingDelay = 5 * time.Second - configureUserAgent(&az.SubnetsClient.Client) + subnetsClient := network.NewSubnetsClient(az.SubscriptionID) + subnetsClient.BaseURI = az.Environment.ResourceManagerEndpoint + subnetsClient.Authorizer = autorest.NewBearerAuthorizer(servicePrincipalToken) + subnetsClient.PollingDelay = 5 * time.Second + configureUserAgent(&subnetsClient.Client) + az.SubnetsClient = subnetsClient az.RouteTablesClient = network.NewRouteTablesClient(az.SubscriptionID) az.RouteTablesClient.BaseURI = az.Environment.ResourceManagerEndpoint @@ -239,35 +286,40 @@ func NewCloud(configReader io.Reader) (cloudprovider.Interface, error) { az.RoutesClient.PollingDelay = 5 * time.Second configureUserAgent(&az.RoutesClient.Client) - az.InterfacesClient = network.NewInterfacesClient(az.SubscriptionID) - az.InterfacesClient.BaseURI = az.Environment.ResourceManagerEndpoint - az.InterfacesClient.Authorizer = autorest.NewBearerAuthorizer(servicePrincipalToken) - az.InterfacesClient.PollingDelay = 5 * time.Second - configureUserAgent(&az.InterfacesClient.Client) + interfacesClient := network.NewInterfacesClient(az.SubscriptionID) + interfacesClient.BaseURI = az.Environment.ResourceManagerEndpoint + interfacesClient.Authorizer = autorest.NewBearerAuthorizer(servicePrincipalToken) + interfacesClient.PollingDelay = 5 * time.Second + configureUserAgent(&interfacesClient.Client) + az.InterfacesClient = interfacesClient - az.LoadBalancerClient = network.NewLoadBalancersClient(az.SubscriptionID) - az.LoadBalancerClient.BaseURI = az.Environment.ResourceManagerEndpoint - az.LoadBalancerClient.Authorizer = autorest.NewBearerAuthorizer(servicePrincipalToken) - az.LoadBalancerClient.PollingDelay = 5 * time.Second - configureUserAgent(&az.LoadBalancerClient.Client) + loadBalancerClient := network.NewLoadBalancersClient(az.SubscriptionID) + loadBalancerClient.BaseURI = az.Environment.ResourceManagerEndpoint + loadBalancerClient.Authorizer = autorest.NewBearerAuthorizer(servicePrincipalToken) + loadBalancerClient.PollingDelay = 5 * time.Second + configureUserAgent(&loadBalancerClient.Client) + az.LoadBalancerClient = loadBalancerClient - az.VirtualMachinesClient = compute.NewVirtualMachinesClient(az.SubscriptionID) - az.VirtualMachinesClient.BaseURI = az.Environment.ResourceManagerEndpoint - az.VirtualMachinesClient.Authorizer = 
autorest.NewBearerAuthorizer(servicePrincipalToken) - az.VirtualMachinesClient.PollingDelay = 5 * time.Second - configureUserAgent(&az.VirtualMachinesClient.Client) + virtualMachinesClient := compute.NewVirtualMachinesClient(az.SubscriptionID) + virtualMachinesClient.BaseURI = az.Environment.ResourceManagerEndpoint + virtualMachinesClient.Authorizer = autorest.NewBearerAuthorizer(servicePrincipalToken) + virtualMachinesClient.PollingDelay = 5 * time.Second + configureUserAgent(&virtualMachinesClient.Client) + az.VirtualMachinesClient = virtualMachinesClient - az.PublicIPAddressesClient = network.NewPublicIPAddressesClient(az.SubscriptionID) - az.PublicIPAddressesClient.BaseURI = az.Environment.ResourceManagerEndpoint - az.PublicIPAddressesClient.Authorizer = autorest.NewBearerAuthorizer(servicePrincipalToken) - az.PublicIPAddressesClient.PollingDelay = 5 * time.Second - configureUserAgent(&az.PublicIPAddressesClient.Client) + publicIPAddressClient := network.NewPublicIPAddressesClient(az.SubscriptionID) + publicIPAddressClient.BaseURI = az.Environment.ResourceManagerEndpoint + publicIPAddressClient.Authorizer = autorest.NewBearerAuthorizer(servicePrincipalToken) + publicIPAddressClient.PollingDelay = 5 * time.Second + configureUserAgent(&publicIPAddressClient.Client) + az.PublicIPAddressesClient = publicIPAddressClient - az.SecurityGroupsClient = network.NewSecurityGroupsClient(az.SubscriptionID) - az.SecurityGroupsClient.BaseURI = az.Environment.ResourceManagerEndpoint - az.SecurityGroupsClient.Authorizer = autorest.NewBearerAuthorizer(servicePrincipalToken) - az.SecurityGroupsClient.PollingDelay = 5 * time.Second - configureUserAgent(&az.SecurityGroupsClient.Client) + securityGroupsClient := network.NewSecurityGroupsClient(az.SubscriptionID) + securityGroupsClient.BaseURI = az.Environment.ResourceManagerEndpoint + securityGroupsClient.Authorizer = autorest.NewBearerAuthorizer(servicePrincipalToken) + securityGroupsClient.PollingDelay = 5 * time.Second + configureUserAgent(&securityGroupsClient.Client) + az.SecurityGroupsClient = securityGroupsClient az.StorageAccountClient = storage.NewAccountsClientWithBaseURI(az.Environment.ResourceManagerEndpoint, az.SubscriptionID) az.StorageAccountClient.Authorizer = autorest.NewBearerAuthorizer(servicePrincipalToken) @@ -327,6 +379,10 @@ func NewCloud(configReader io.Reader) (cloudprovider.Interface, error) { az.metadata = NewInstanceMetadata() + if az.MaximumLoadBalancerRuleCount == 0 { + az.MaximumLoadBalancerRuleCount = maximumLoadBalancerRuleCount + } + if err := initDiskControllers(&az); err != nil { return nil, err } diff --git a/pkg/cloudprovider/providers/azure/azure_backoff.go b/pkg/cloudprovider/providers/azure/azure_backoff.go index b30b1da38b..32f3a5c051 100644 --- a/pkg/cloudprovider/providers/azure/azure_backoff.go +++ b/pkg/cloudprovider/providers/azure/azure_backoff.go @@ -26,11 +26,25 @@ import ( "k8s.io/apimachinery/pkg/types" ) +// getorCreateRequestBackoff returns a new Backoff object steps = 1 +// This is to make sure that the requested command executes +// at least once +func (az *Cloud) getorCreateRequestBackoff() (resourceRequestBackoff wait.Backoff) { + if az.CloudProviderBackoff { + return az.resourceRequestBackoff + } + resourceRequestBackoff = wait.Backoff{ + Steps: 1, + } + + return resourceRequestBackoff +} + // GetVirtualMachineWithRetry invokes az.getVirtualMachine with exponential backoff retry func (az *Cloud) GetVirtualMachineWithRetry(name types.NodeName) (compute.VirtualMachine, bool, error) { var machine 
compute.VirtualMachine var exists bool - err := wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) { + err := wait.ExponentialBackoff(az.getorCreateRequestBackoff(), func() (bool, error) { var retryErr error machine, exists, retryErr = az.getVirtualMachine(name) if retryErr != nil { @@ -46,8 +60,9 @@ func (az *Cloud) GetVirtualMachineWithRetry(name types.NodeName) (compute.Virtua // VirtualMachineClientGetWithRetry invokes az.VirtualMachinesClient.Get with exponential backoff retry func (az *Cloud) VirtualMachineClientGetWithRetry(resourceGroup, vmName string, types compute.InstanceViewTypes) (compute.VirtualMachine, error) { var machine compute.VirtualMachine - err := wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) { + err := wait.ExponentialBackoff(az.getorCreateRequestBackoff(), func() (bool, error) { var retryErr error + az.operationPollRateLimiter.Accept() machine, retryErr = az.VirtualMachinesClient.Get(resourceGroup, vmName, types) if retryErr != nil { glog.Errorf("backoff: failure, will retry,err=%v", retryErr) @@ -59,10 +74,63 @@ func (az *Cloud) VirtualMachineClientGetWithRetry(resourceGroup, vmName string, return machine, err } +// VirtualMachineClientListWithRetry invokes az.VirtualMachinesClient.List with exponential backoff retry +func (az *Cloud) VirtualMachineClientListWithRetry() ([]compute.VirtualMachine, error) { + allNodes := []compute.VirtualMachine{} + var result compute.VirtualMachineListResult + err := wait.ExponentialBackoff(az.getorCreateRequestBackoff(), func() (bool, error) { + var retryErr error + az.operationPollRateLimiter.Accept() + glog.V(10).Infof("VirtualMachinesClient.List(%v): start", az.ResourceGroup) + result, retryErr = az.VirtualMachinesClient.List(az.ResourceGroup) + glog.V(10).Infof("VirtualMachinesClient.List(%v): end", az.ResourceGroup) + if retryErr != nil { + glog.Errorf("VirtualMachinesClient.List(%v) - backoff: failure, will retry,err=%v", + az.ResourceGroup, + retryErr) + return false, retryErr + } + glog.V(2).Infof("VirtualMachinesClient.List(%v) - backoff: success", az.ResourceGroup) + return true, nil + }) + if err != nil { + return nil, err + } + + appendResults := (result.Value != nil && len(*result.Value) > 0) + for appendResults { + allNodes = append(allNodes, *result.Value...) 
+ appendResults = false + // follow the next link to get all the vms for resource group + if result.NextLink != nil { + err := wait.ExponentialBackoff(az.getorCreateRequestBackoff(), func() (bool, error) { + var retryErr error + az.operationPollRateLimiter.Accept() + glog.V(10).Infof("VirtualMachinesClient.ListNextResults(%v): start", az.ResourceGroup) + result, retryErr = az.VirtualMachinesClient.ListNextResults(result) + glog.V(10).Infof("VirtualMachinesClient.ListNextResults(%v): end", az.ResourceGroup) + if retryErr != nil { + glog.Errorf("VirtualMachinesClient.ListNextResults(%v) - backoff: failure, will retry,err=%v", + az.ResourceGroup, retryErr) + return false, retryErr + } + glog.V(2).Infof("VirtualMachinesClient.ListNextResults(%v): success", az.ResourceGroup) + return true, nil + }) + if err != nil { + return allNodes, err + } + appendResults = (result.Value != nil && len(*result.Value) > 0) + } + } + + return allNodes, err +} + // GetIPForMachineWithRetry invokes az.getIPForMachine with exponential backoff retry func (az *Cloud) GetIPForMachineWithRetry(name types.NodeName) (string, error) { var ip string - err := wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) { + err := wait.ExponentialBackoff(az.getorCreateRequestBackoff(), func() (bool, error) { var retryErr error ip, retryErr = az.getIPForMachine(name) if retryErr != nil { @@ -77,7 +145,7 @@ func (az *Cloud) GetIPForMachineWithRetry(name types.NodeName) (string, error) { // CreateOrUpdateSGWithRetry invokes az.SecurityGroupsClient.CreateOrUpdate with exponential backoff retry func (az *Cloud) CreateOrUpdateSGWithRetry(sg network.SecurityGroup) error { - return wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) { + return wait.ExponentialBackoff(az.getorCreateRequestBackoff(), func() (bool, error) { az.operationPollRateLimiter.Accept() glog.V(10).Infof("SecurityGroupsClient.CreateOrUpdate(%s): start", *sg.Name) respChan, errChan := az.SecurityGroupsClient.CreateOrUpdate(az.ResourceGroup, *sg.Name, sg, nil) @@ -90,7 +158,7 @@ func (az *Cloud) CreateOrUpdateSGWithRetry(sg network.SecurityGroup) error { // CreateOrUpdateLBWithRetry invokes az.LoadBalancerClient.CreateOrUpdate with exponential backoff retry func (az *Cloud) CreateOrUpdateLBWithRetry(lb network.LoadBalancer) error { - return wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) { + return wait.ExponentialBackoff(az.getorCreateRequestBackoff(), func() (bool, error) { az.operationPollRateLimiter.Accept() glog.V(10).Infof("LoadBalancerClient.CreateOrUpdate(%s): start", *lb.Name) respChan, errChan := az.LoadBalancerClient.CreateOrUpdate(az.ResourceGroup, *lb.Name, lb, nil) @@ -101,9 +169,120 @@ func (az *Cloud) CreateOrUpdateLBWithRetry(lb network.LoadBalancer) error { }) } +// ListLBWithRetry invokes az.VirtualMachinesClient.List with exponential backoff retry +func (az *Cloud) ListLBWithRetry() ([]network.LoadBalancer, error) { + allLBs := []network.LoadBalancer{} + var result network.LoadBalancerListResult + + err := wait.ExponentialBackoff(az.getorCreateRequestBackoff(), func() (bool, error) { + var retryErr error + az.operationPollRateLimiter.Accept() + glog.V(10).Infof("LoadBalancerClient.List(%v): start", az.ResourceGroup) + result, retryErr = az.LoadBalancerClient.List(az.ResourceGroup) + glog.V(10).Infof("LoadBalancerClient.List(%v): end", az.ResourceGroup) + if retryErr != nil { + glog.Errorf("LoadBalancerClient.List(%v) - backoff: failure, will retry,err=%v", + az.ResourceGroup, + retryErr) + 
return false, retryErr + } + glog.V(2).Infof("LoadBalancerClient.List(%v) - backoff: success", az.ResourceGroup) + return true, nil + }) + if err != nil { + return nil, err + } + + appendResults := (result.Value != nil && len(*result.Value) > 0) + for appendResults { + allLBs = append(allLBs, *result.Value...) + appendResults = false + + // follow the next link to get all the vms for resource group + if result.NextLink != nil { + err := wait.ExponentialBackoff(az.getorCreateRequestBackoff(), func() (bool, error) { + var retryErr error + az.operationPollRateLimiter.Accept() + glog.V(10).Infof("LoadBalancerClient.ListNextResults(%v): start", az.ResourceGroup) + result, retryErr = az.LoadBalancerClient.ListNextResults(result) + glog.V(10).Infof("LoadBalancerClient.ListNextResults(%v): end", az.ResourceGroup) + if retryErr != nil { + glog.Errorf("LoadBalancerClient.ListNextResults(%v) - backoff: failure, will retry,err=%v", + az.ResourceGroup, + retryErr) + return false, retryErr + } + glog.V(2).Infof("LoadBalancerClient.ListNextResults(%v) - backoff: success", az.ResourceGroup) + return true, nil + }) + if err != nil { + return allLBs, err + } + appendResults = (result.Value != nil && len(*result.Value) > 0) + } + } + + return allLBs, nil +} + +// ListPIPWithRetry list the PIP resources in az.ResourceGroup +func (az *Cloud) ListPIPWithRetry() ([]network.PublicIPAddress, error) { + allPIPs := []network.PublicIPAddress{} + var result network.PublicIPAddressListResult + err := wait.ExponentialBackoff(az.getorCreateRequestBackoff(), func() (bool, error) { + var retryErr error + az.operationPollRateLimiter.Accept() + glog.V(10).Infof("PublicIPAddressesClient.List(%v): start", az.ResourceGroup) + result, retryErr = az.PublicIPAddressesClient.List(az.ResourceGroup) + glog.V(10).Infof("PublicIPAddressesClient.List(%v): end", az.ResourceGroup) + if retryErr != nil { + glog.Errorf("PublicIPAddressesClient.List(%v) - backoff: failure, will retry,err=%v", + az.ResourceGroup, + retryErr) + return false, retryErr + } + glog.V(2).Infof("PublicIPAddressesClient.List(%v) - backoff: success", az.ResourceGroup) + return true, nil + }) + if err != nil { + return nil, err + } + + appendResults := (result.Value != nil && len(*result.Value) > 0) + for appendResults { + allPIPs = append(allPIPs, *result.Value...) 
+ appendResults = false + + // follow the next link to get all the vms for resource group + if result.NextLink != nil { + err := wait.ExponentialBackoff(az.getorCreateRequestBackoff(), func() (bool, error) { + var retryErr error + az.operationPollRateLimiter.Accept() + glog.V(10).Infof("PublicIPAddressesClient.ListNextResults(%v): start", az.ResourceGroup) + result, retryErr = az.PublicIPAddressesClient.ListNextResults(result) + glog.V(10).Infof("PublicIPAddressesClient.ListNextResults(%v): end", az.ResourceGroup) + if retryErr != nil { + glog.Errorf("PublicIPAddressesClient.ListNextResults(%v) - backoff: failure, will retry,err=%v", + az.ResourceGroup, + retryErr) + return false, retryErr + } + glog.V(2).Infof("PublicIPAddressesClient.ListNextResults(%v) - backoff: success", az.ResourceGroup) + return true, nil + }) + if err != nil { + return allPIPs, err + } + appendResults = (result.Value != nil && len(*result.Value) > 0) + } + } + + return allPIPs, nil +} + // CreateOrUpdatePIPWithRetry invokes az.PublicIPAddressesClient.CreateOrUpdate with exponential backoff retry func (az *Cloud) CreateOrUpdatePIPWithRetry(pip network.PublicIPAddress) error { - return wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) { + return wait.ExponentialBackoff(az.getorCreateRequestBackoff(), func() (bool, error) { az.operationPollRateLimiter.Accept() glog.V(10).Infof("PublicIPAddressesClient.CreateOrUpdate(%s): start", *pip.Name) respChan, errChan := az.PublicIPAddressesClient.CreateOrUpdate(az.ResourceGroup, *pip.Name, pip, nil) @@ -116,7 +295,7 @@ func (az *Cloud) CreateOrUpdatePIPWithRetry(pip network.PublicIPAddress) error { // CreateOrUpdateInterfaceWithRetry invokes az.PublicIPAddressesClient.CreateOrUpdate with exponential backoff retry func (az *Cloud) CreateOrUpdateInterfaceWithRetry(nic network.Interface) error { - return wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) { + return wait.ExponentialBackoff(az.getorCreateRequestBackoff(), func() (bool, error) { az.operationPollRateLimiter.Accept() glog.V(10).Infof("InterfacesClient.CreateOrUpdate(%s): start", *nic.Name) respChan, errChan := az.InterfacesClient.CreateOrUpdate(az.ResourceGroup, *nic.Name, nic, nil) @@ -129,7 +308,7 @@ func (az *Cloud) CreateOrUpdateInterfaceWithRetry(nic network.Interface) error { // DeletePublicIPWithRetry invokes az.PublicIPAddressesClient.Delete with exponential backoff retry func (az *Cloud) DeletePublicIPWithRetry(pipName string) error { - return wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) { + return wait.ExponentialBackoff(az.getorCreateRequestBackoff(), func() (bool, error) { az.operationPollRateLimiter.Accept() glog.V(10).Infof("PublicIPAddressesClient.Delete(%s): start", pipName) respChan, errChan := az.PublicIPAddressesClient.Delete(az.ResourceGroup, pipName, nil) @@ -142,7 +321,7 @@ func (az *Cloud) DeletePublicIPWithRetry(pipName string) error { // DeleteLBWithRetry invokes az.LoadBalancerClient.Delete with exponential backoff retry func (az *Cloud) DeleteLBWithRetry(lbName string) error { - return wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) { + return wait.ExponentialBackoff(az.getorCreateRequestBackoff(), func() (bool, error) { az.operationPollRateLimiter.Accept() glog.V(10).Infof("LoadBalancerClient.Delete(%s): start", lbName) respChan, errChan := az.LoadBalancerClient.Delete(az.ResourceGroup, lbName, nil) @@ -155,7 +334,7 @@ func (az *Cloud) DeleteLBWithRetry(lbName string) error { // 
CreateOrUpdateRouteTableWithRetry invokes az.RouteTablesClient.CreateOrUpdate with exponential backoff retry func (az *Cloud) CreateOrUpdateRouteTableWithRetry(routeTable network.RouteTable) error { - return wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) { + return wait.ExponentialBackoff(az.getorCreateRequestBackoff(), func() (bool, error) { az.operationPollRateLimiter.Accept() glog.V(10).Infof("RouteTablesClient.CreateOrUpdate(%s): start", *routeTable.Name) respChan, errChan := az.RouteTablesClient.CreateOrUpdate(az.ResourceGroup, az.RouteTableName, routeTable, nil) @@ -168,7 +347,7 @@ func (az *Cloud) CreateOrUpdateRouteTableWithRetry(routeTable network.RouteTable // CreateOrUpdateRouteWithRetry invokes az.RoutesClient.CreateOrUpdate with exponential backoff retry func (az *Cloud) CreateOrUpdateRouteWithRetry(route network.Route) error { - return wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) { + return wait.ExponentialBackoff(az.getorCreateRequestBackoff(), func() (bool, error) { az.operationPollRateLimiter.Accept() glog.V(10).Infof("RoutesClient.CreateOrUpdate(%s): start", *route.Name) respChan, errChan := az.RoutesClient.CreateOrUpdate(az.ResourceGroup, az.RouteTableName, *route.Name, route, nil) @@ -181,7 +360,7 @@ func (az *Cloud) CreateOrUpdateRouteWithRetry(route network.Route) error { // DeleteRouteWithRetry invokes az.RoutesClient.Delete with exponential backoff retry func (az *Cloud) DeleteRouteWithRetry(routeName string) error { - return wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) { + return wait.ExponentialBackoff(az.getorCreateRequestBackoff(), func() (bool, error) { az.operationPollRateLimiter.Accept() glog.V(10).Infof("RoutesClient.Delete(%s): start", az.RouteTableName) respChan, errChan := az.RoutesClient.Delete(az.ResourceGroup, az.RouteTableName, routeName, nil) @@ -194,7 +373,7 @@ func (az *Cloud) DeleteRouteWithRetry(routeName string) error { // CreateOrUpdateVMWithRetry invokes az.VirtualMachinesClient.CreateOrUpdate with exponential backoff retry func (az *Cloud) CreateOrUpdateVMWithRetry(vmName string, newVM compute.VirtualMachine) error { - return wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) { + return wait.ExponentialBackoff(az.getorCreateRequestBackoff(), func() (bool, error) { az.operationPollRateLimiter.Accept() glog.V(10).Infof("VirtualMachinesClient.CreateOrUpdate(%s): start", vmName) respChan, errChan := az.VirtualMachinesClient.CreateOrUpdate(az.ResourceGroup, vmName, newVM, nil) diff --git a/pkg/cloudprovider/providers/azure/azure_fakes.go b/pkg/cloudprovider/providers/azure/azure_fakes.go new file mode 100644 index 0000000000..0351f4efaa --- /dev/null +++ b/pkg/cloudprovider/providers/azure/azure_fakes.go @@ -0,0 +1,584 @@ +package azure + +import ( + "fmt" + "math/rand" + "net/http" + "strings" + "sync" + "time" + + "github.com/Azure/go-autorest/autorest/to" + + "github.com/Azure/azure-sdk-for-go/arm/compute" + "github.com/Azure/azure-sdk-for-go/arm/network" + "github.com/Azure/go-autorest/autorest" +) + +type fakeAzureLBClient struct { + mutex *sync.Mutex + FakeStore map[string]map[string]network.LoadBalancer +} + +func NewFakeAzureLBClient() fakeAzureLBClient { + fLBC := fakeAzureLBClient{} + fLBC.FakeStore = make(map[string]map[string]network.LoadBalancer) + fLBC.mutex = &sync.Mutex{} + return fLBC +} + +func (fLBC fakeAzureLBClient) CreateOrUpdate(resourceGroupName string, loadBalancerName string, parameters network.LoadBalancer, cancel <-chan 
struct{}) (<-chan network.LoadBalancer, <-chan error) { + fLBC.mutex.Lock() + defer fLBC.mutex.Unlock() + resultChan := make(chan network.LoadBalancer, 1) + errChan := make(chan error, 1) + var result network.LoadBalancer + var err error + defer func() { + resultChan <- result + errChan <- err + close(resultChan) + close(errChan) + }() + if _, ok := fLBC.FakeStore[resourceGroupName]; !ok { + fLBC.FakeStore[resourceGroupName] = make(map[string]network.LoadBalancer) + } + + // For dynamic ip allocation, just fill in the PrivateIPAddress + if parameters.FrontendIPConfigurations != nil { + for idx, config := range *parameters.FrontendIPConfigurations { + if config.PrivateIPAllocationMethod == network.Dynamic { + (*parameters.FrontendIPConfigurations)[idx].PrivateIPAddress = to.StringPtr("10.0.0.19") + } + } + } + fLBC.FakeStore[resourceGroupName][loadBalancerName] = parameters + result = fLBC.FakeStore[resourceGroupName][loadBalancerName] + err = nil + return resultChan, errChan +} + +func (fLBC fakeAzureLBClient) Delete(resourceGroupName string, loadBalancerName string, cancel <-chan struct{}) (<-chan autorest.Response, <-chan error) { + fLBC.mutex.Lock() + defer fLBC.mutex.Unlock() + respChan := make(chan autorest.Response, 1) + errChan := make(chan error, 1) + var resp autorest.Response + var err error + defer func() { + respChan <- resp + errChan <- err + close(respChan) + close(errChan) + }() + if _, ok := fLBC.FakeStore[resourceGroupName]; ok { + if _, ok := fLBC.FakeStore[resourceGroupName][loadBalancerName]; ok { + delete(fLBC.FakeStore[resourceGroupName], loadBalancerName) + resp.Response = &http.Response{ + StatusCode: http.StatusAccepted, + } + err = nil + return respChan, errChan + } + } + resp.Response = &http.Response{ + StatusCode: http.StatusNotFound, + } + err = autorest.DetailedError{ + StatusCode: http.StatusNotFound, + Message: "Not such LB", + } + return respChan, errChan +} + +func (fLBC fakeAzureLBClient) Get(resourceGroupName string, loadBalancerName string, expand string) (result network.LoadBalancer, err error) { + fLBC.mutex.Lock() + defer fLBC.mutex.Unlock() + if _, ok := fLBC.FakeStore[resourceGroupName]; ok { + if entity, ok := fLBC.FakeStore[resourceGroupName][loadBalancerName]; ok { + return entity, nil + } + } + return result, autorest.DetailedError{ + StatusCode: http.StatusNotFound, + Message: "Not such LB", + } +} + +func (fLBC fakeAzureLBClient) List(resourceGroupName string) (result network.LoadBalancerListResult, err error) { + fLBC.mutex.Lock() + defer fLBC.mutex.Unlock() + var value []network.LoadBalancer + if _, ok := fLBC.FakeStore[resourceGroupName]; ok { + for _, v := range fLBC.FakeStore[resourceGroupName] { + value = append(value, v) + } + } + result.Response.Response = &http.Response{ + StatusCode: http.StatusOK, + } + result.NextLink = nil + result.Value = &value + return result, nil +} + +func (fLBC fakeAzureLBClient) ListNextResults(lastResult network.LoadBalancerListResult) (result network.LoadBalancerListResult, err error) { + fLBC.mutex.Lock() + defer fLBC.mutex.Unlock() + result.Response.Response = &http.Response{ + StatusCode: http.StatusOK, + } + result.NextLink = nil + result.Value = nil + return result, nil +} + +type fakeAzurePIPClient struct { + mutex *sync.Mutex + FakeStore map[string]map[string]network.PublicIPAddress + SubscriptionID string +} + +const publicIPAddressIDTemplate = "/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Network/publicIPAddresses/%s" + +// returns the full identifier of a publicIPAddress. 
+func getpublicIPAddressID(subscriptionID string, resourceGroupName, pipName string) string { + return fmt.Sprintf( + publicIPAddressIDTemplate, + subscriptionID, + resourceGroupName, + pipName) +} + +func NewFakeAzurePIPClient(subscriptionID string) fakeAzurePIPClient { + fAPC := fakeAzurePIPClient{} + fAPC.FakeStore = make(map[string]map[string]network.PublicIPAddress) + fAPC.SubscriptionID = subscriptionID + fAPC.mutex = &sync.Mutex{} + return fAPC +} + +func (fAPC fakeAzurePIPClient) CreateOrUpdate(resourceGroupName string, publicIPAddressName string, parameters network.PublicIPAddress, cancel <-chan struct{}) (<-chan network.PublicIPAddress, <-chan error) { + fAPC.mutex.Lock() + defer fAPC.mutex.Unlock() + resultChan := make(chan network.PublicIPAddress, 1) + errChan := make(chan error, 1) + var result network.PublicIPAddress + var err error + defer func() { + resultChan <- result + errChan <- err + close(resultChan) + close(errChan) + }() + if _, ok := fAPC.FakeStore[resourceGroupName]; !ok { + fAPC.FakeStore[resourceGroupName] = make(map[string]network.PublicIPAddress) + } + + // assign id + pipID := getpublicIPAddressID(fAPC.SubscriptionID, resourceGroupName, publicIPAddressName) + parameters.ID = &pipID + + // only create in the case user has not provided + if parameters.PublicIPAddressPropertiesFormat != nil && + parameters.PublicIPAddressPropertiesFormat.PublicIPAllocationMethod == network.Static { + // assign ip + rand.Seed(time.Now().UnixNano()) + randomIP := fmt.Sprintf("%d.%d.%d.%d", rand.Intn(256), rand.Intn(256), rand.Intn(256), rand.Intn(256)) + parameters.IPAddress = &randomIP + } + + fAPC.FakeStore[resourceGroupName][publicIPAddressName] = parameters + result = fAPC.FakeStore[resourceGroupName][publicIPAddressName] + err = nil + return resultChan, errChan +} + +func (fAPC fakeAzurePIPClient) Delete(resourceGroupName string, publicIPAddressName string, cancel <-chan struct{}) (<-chan autorest.Response, <-chan error) { + fAPC.mutex.Lock() + defer fAPC.mutex.Unlock() + respChan := make(chan autorest.Response, 1) + errChan := make(chan error, 1) + var resp autorest.Response + var err error + defer func() { + respChan <- resp + errChan <- err + close(respChan) + close(errChan) + }() + if _, ok := fAPC.FakeStore[resourceGroupName]; ok { + if _, ok := fAPC.FakeStore[resourceGroupName][publicIPAddressName]; ok { + delete(fAPC.FakeStore[resourceGroupName], publicIPAddressName) + resp.Response = &http.Response{ + StatusCode: http.StatusAccepted, + } + err = nil + return respChan, errChan + } + } + resp.Response = &http.Response{ + StatusCode: http.StatusNotFound, + } + err = autorest.DetailedError{ + StatusCode: http.StatusNotFound, + Message: "Not such PIP", + } + return respChan, errChan +} + +func (fAPC fakeAzurePIPClient) Get(resourceGroupName string, publicIPAddressName string, expand string) (result network.PublicIPAddress, err error) { + fAPC.mutex.Lock() + defer fAPC.mutex.Unlock() + if _, ok := fAPC.FakeStore[resourceGroupName]; ok { + if entity, ok := fAPC.FakeStore[resourceGroupName][publicIPAddressName]; ok { + return entity, nil + } + } + return result, autorest.DetailedError{ + StatusCode: http.StatusNotFound, + Message: "Not such PIP", + } +} + +func (fAPC fakeAzurePIPClient) ListNextResults(lastResults network.PublicIPAddressListResult) (result network.PublicIPAddressListResult, err error) { + fAPC.mutex.Lock() + defer fAPC.mutex.Unlock() + return network.PublicIPAddressListResult{}, nil +} + +func (fAPC fakeAzurePIPClient) List(resourceGroupName string) (result 
network.PublicIPAddressListResult, err error) { + fAPC.mutex.Lock() + defer fAPC.mutex.Unlock() + var value []network.PublicIPAddress + if _, ok := fAPC.FakeStore[resourceGroupName]; ok { + for _, v := range fAPC.FakeStore[resourceGroupName] { + value = append(value, v) + } + } + result.Response.Response = &http.Response{ + StatusCode: http.StatusOK, + } + result.NextLink = nil + result.Value = &value + return result, nil +} + +type fakeInterfacesClient struct { + mutex *sync.Mutex + FakeStore map[string]map[string]network.Interface +} + +func NewFakeInterfacesClient() fakeInterfacesClient { + fIC := fakeInterfacesClient{} + fIC.FakeStore = make(map[string]map[string]network.Interface) + fIC.mutex = &sync.Mutex{} + + return fIC +} + +func (fIC fakeInterfacesClient) CreateOrUpdate(resourceGroupName string, networkInterfaceName string, parameters network.Interface, cancel <-chan struct{}) (<-chan network.Interface, <-chan error) { + fIC.mutex.Lock() + defer fIC.mutex.Unlock() + resultChan := make(chan network.Interface, 1) + errChan := make(chan error, 1) + var result network.Interface + var err error + defer func() { + resultChan <- result + errChan <- err + close(resultChan) + close(errChan) + }() + if _, ok := fIC.FakeStore[resourceGroupName]; !ok { + fIC.FakeStore[resourceGroupName] = make(map[string]network.Interface) + } + fIC.FakeStore[resourceGroupName][networkInterfaceName] = parameters + result = fIC.FakeStore[resourceGroupName][networkInterfaceName] + err = nil + + return resultChan, errChan +} + +func (fIC fakeInterfacesClient) Get(resourceGroupName string, networkInterfaceName string, expand string) (result network.Interface, err error) { + fIC.mutex.Lock() + defer fIC.mutex.Unlock() + if _, ok := fIC.FakeStore[resourceGroupName]; ok { + if entity, ok := fIC.FakeStore[resourceGroupName][networkInterfaceName]; ok { + return entity, nil + } + } + return result, autorest.DetailedError{ + StatusCode: http.StatusNotFound, + Message: "Not such Interface", + } +} + +type fakeVirtualMachinesClient struct { + mutex *sync.Mutex + FakeStore map[string]map[string]compute.VirtualMachine +} + +func NewFakeVirtualMachinesClient() fakeVirtualMachinesClient { + fVMC := fakeVirtualMachinesClient{} + fVMC.FakeStore = make(map[string]map[string]compute.VirtualMachine) + fVMC.mutex = &sync.Mutex{} + return fVMC +} + +func (fVMC fakeVirtualMachinesClient) CreateOrUpdate(resourceGroupName string, VMName string, parameters compute.VirtualMachine, cancel <-chan struct{}) (<-chan compute.VirtualMachine, <-chan error) { + fVMC.mutex.Lock() + defer fVMC.mutex.Unlock() + resultChan := make(chan compute.VirtualMachine, 1) + errChan := make(chan error, 1) + var result compute.VirtualMachine + var err error + defer func() { + resultChan <- result + errChan <- err + close(resultChan) + close(errChan) + }() + if _, ok := fVMC.FakeStore[resourceGroupName]; !ok { + fVMC.FakeStore[resourceGroupName] = make(map[string]compute.VirtualMachine) + } + fVMC.FakeStore[resourceGroupName][VMName] = parameters + result = fVMC.FakeStore[resourceGroupName][VMName] + err = nil + return resultChan, errChan +} + +func (fVMC fakeVirtualMachinesClient) Get(resourceGroupName string, VMName string, expand compute.InstanceViewTypes) (result compute.VirtualMachine, err error) { + fVMC.mutex.Lock() + defer fVMC.mutex.Unlock() + if _, ok := fVMC.FakeStore[resourceGroupName]; ok { + if entity, ok := fVMC.FakeStore[resourceGroupName][VMName]; ok { + return entity, nil + } + } + return result, autorest.DetailedError{ + StatusCode: 
http.StatusNotFound, + Message: "Not such VM", + } +} + +func (fVMC fakeVirtualMachinesClient) List(resourceGroupName string) (result compute.VirtualMachineListResult, err error) { + fVMC.mutex.Lock() + defer fVMC.mutex.Unlock() + var value []compute.VirtualMachine + if _, ok := fVMC.FakeStore[resourceGroupName]; ok { + for _, v := range fVMC.FakeStore[resourceGroupName] { + value = append(value, v) + } + } + result.Response.Response = &http.Response{ + StatusCode: http.StatusOK, + } + result.NextLink = nil + result.Value = &value + return result, nil +} +func (fVMC fakeVirtualMachinesClient) ListNextResults(lastResults compute.VirtualMachineListResult) (result compute.VirtualMachineListResult, err error) { + fVMC.mutex.Lock() + defer fVMC.mutex.Unlock() + return compute.VirtualMachineListResult{}, nil +} + +type fakeAzureSubnetsClient struct { + mutex *sync.Mutex + FakeStore map[string]map[string]network.Subnet +} + +func NewFakeAzureSubnetsClient() fakeAzureSubnetsClient { + fASC := fakeAzureSubnetsClient{} + fASC.FakeStore = make(map[string]map[string]network.Subnet) + fASC.mutex = &sync.Mutex{} + return fASC +} + +func (fASC fakeAzureSubnetsClient) CreateOrUpdate(resourceGroupName string, virtualNetworkName string, subnetName string, subnetParameters network.Subnet, cancel <-chan struct{}) (<-chan network.Subnet, <-chan error) { + fASC.mutex.Lock() + defer fASC.mutex.Unlock() + resultChan := make(chan network.Subnet, 1) + errChan := make(chan error, 1) + var result network.Subnet + var err error + defer func() { + resultChan <- result + errChan <- err + close(resultChan) + close(errChan) + }() + rgVnet := strings.Join([]string{resourceGroupName, virtualNetworkName}, "AND") + if _, ok := fASC.FakeStore[rgVnet]; !ok { + fASC.FakeStore[rgVnet] = make(map[string]network.Subnet) + } + fASC.FakeStore[rgVnet][subnetName] = subnetParameters + result = fASC.FakeStore[rgVnet][subnetName] + err = nil + return resultChan, errChan +} + +func (fASC fakeAzureSubnetsClient) Delete(resourceGroupName string, virtualNetworkName string, subnetName string, cancel <-chan struct{}) (<-chan autorest.Response, <-chan error) { + fASC.mutex.Lock() + defer fASC.mutex.Unlock() + respChan := make(chan autorest.Response, 1) + errChan := make(chan error, 1) + var resp autorest.Response + var err error + defer func() { + respChan <- resp + errChan <- err + close(respChan) + close(errChan) + }() + + rgVnet := strings.Join([]string{resourceGroupName, virtualNetworkName}, "AND") + if _, ok := fASC.FakeStore[rgVnet]; ok { + if _, ok := fASC.FakeStore[rgVnet][subnetName]; ok { + delete(fASC.FakeStore[rgVnet], subnetName) + resp.Response = &http.Response{ + StatusCode: http.StatusAccepted, + } + err = nil + return respChan, errChan + } + } + resp.Response = &http.Response{ + StatusCode: http.StatusNotFound, + } + err = autorest.DetailedError{ + StatusCode: http.StatusNotFound, + Message: "Not such Subnet", + } + return respChan, errChan +} +func (fASC fakeAzureSubnetsClient) Get(resourceGroupName string, virtualNetworkName string, subnetName string, expand string) (result network.Subnet, err error) { + fASC.mutex.Lock() + defer fASC.mutex.Unlock() + rgVnet := strings.Join([]string{resourceGroupName, virtualNetworkName}, "AND") + if _, ok := fASC.FakeStore[rgVnet]; ok { + if entity, ok := fASC.FakeStore[rgVnet][subnetName]; ok { + return entity, nil + } + } + return result, autorest.DetailedError{ + StatusCode: http.StatusNotFound, + Message: "Not such Subnet", + } +} +func (fASC fakeAzureSubnetsClient) 
List(resourceGroupName string, virtualNetworkName string) (result network.SubnetListResult, err error) { + fASC.mutex.Lock() + defer fASC.mutex.Unlock() + rgVnet := strings.Join([]string{resourceGroupName, virtualNetworkName}, "AND") + var value []network.Subnet + if _, ok := fASC.FakeStore[rgVnet]; ok { + for _, v := range fASC.FakeStore[rgVnet] { + value = append(value, v) + } + } + result.Response.Response = &http.Response{ + StatusCode: http.StatusOK, + } + result.NextLink = nil + result.Value = &value + return result, nil +} + +type fakeAzureNSGClient struct { + mutex *sync.Mutex + FakeStore map[string]map[string]network.SecurityGroup +} + +func NewFakeAzureNSGClient() fakeAzureNSGClient { + fNSG := fakeAzureNSGClient{} + fNSG.FakeStore = make(map[string]map[string]network.SecurityGroup) + fNSG.mutex = &sync.Mutex{} + return fNSG +} + +func (fNSG fakeAzureNSGClient) CreateOrUpdate(resourceGroupName string, networkSecurityGroupName string, parameters network.SecurityGroup, cancel <-chan struct{}) (<-chan network.SecurityGroup, <-chan error) { + fNSG.mutex.Lock() + defer fNSG.mutex.Unlock() + resultChan := make(chan network.SecurityGroup, 1) + errChan := make(chan error, 1) + var result network.SecurityGroup + var err error + defer func() { + resultChan <- result + errChan <- err + close(resultChan) + close(errChan) + }() + if _, ok := fNSG.FakeStore[resourceGroupName]; !ok { + fNSG.FakeStore[resourceGroupName] = make(map[string]network.SecurityGroup) + } + fNSG.FakeStore[resourceGroupName][networkSecurityGroupName] = parameters + result = fNSG.FakeStore[resourceGroupName][networkSecurityGroupName] + err = nil + return resultChan, errChan +} + +func (fNSG fakeAzureNSGClient) Delete(resourceGroupName string, networkSecurityGroupName string, cancel <-chan struct{}) (<-chan autorest.Response, <-chan error) { + fNSG.mutex.Lock() + defer fNSG.mutex.Unlock() + respChan := make(chan autorest.Response, 1) + errChan := make(chan error, 1) + var resp autorest.Response + var err error + defer func() { + respChan <- resp + errChan <- err + close(respChan) + close(errChan) + }() + if _, ok := fNSG.FakeStore[resourceGroupName]; ok { + if _, ok := fNSG.FakeStore[resourceGroupName][networkSecurityGroupName]; ok { + delete(fNSG.FakeStore[resourceGroupName], networkSecurityGroupName) + resp.Response = &http.Response{ + StatusCode: http.StatusAccepted, + } + err = nil + return respChan, errChan + } + } + resp.Response = &http.Response{ + StatusCode: http.StatusNotFound, + } + err = autorest.DetailedError{ + StatusCode: http.StatusNotFound, + Message: "Not such NSG", + } + return respChan, errChan +} + +func (fNSG fakeAzureNSGClient) Get(resourceGroupName string, networkSecurityGroupName string, expand string) (result network.SecurityGroup, err error) { + fNSG.mutex.Lock() + defer fNSG.mutex.Unlock() + if _, ok := fNSG.FakeStore[resourceGroupName]; ok { + if entity, ok := fNSG.FakeStore[resourceGroupName][networkSecurityGroupName]; ok { + return entity, nil + } + } + return result, autorest.DetailedError{ + StatusCode: http.StatusNotFound, + Message: "Not such NSG", + } +} + +func (fNSG fakeAzureNSGClient) List(resourceGroupName string) (result network.SecurityGroupListResult, err error) { + fNSG.mutex.Lock() + defer fNSG.mutex.Unlock() + var value []network.SecurityGroup + if _, ok := fNSG.FakeStore[resourceGroupName]; ok { + for _, v := range fNSG.FakeStore[resourceGroupName] { + value = append(value, v) + } + } + result.Response.Response = &http.Response{ + StatusCode: http.StatusOK, + } + 
result.NextLink = nil + result.Value = &value + return result, nil +} diff --git a/pkg/cloudprovider/providers/azure/azure_instances.go b/pkg/cloudprovider/providers/azure/azure_instances.go index 0af7eec292..fe9ed07ae0 100644 --- a/pkg/cloudprovider/providers/azure/azure_instances.go +++ b/pkg/cloudprovider/providers/azure/azure_instances.go @@ -199,39 +199,6 @@ func (az *Cloud) CurrentNodeName(hostname string) (types.NodeName, error) { return types.NodeName(hostname), nil } -func (az *Cloud) listAllNodesInResourceGroup() ([]compute.VirtualMachine, error) { - allNodes := []compute.VirtualMachine{} - - az.operationPollRateLimiter.Accept() - glog.V(10).Infof("VirtualMachinesClient.List(%s): start", az.ResourceGroup) - result, err := az.VirtualMachinesClient.List(az.ResourceGroup) - glog.V(10).Infof("VirtualMachinesClient.List(%s): end", az.ResourceGroup) - if err != nil { - glog.Errorf("error: az.listAllNodesInResourceGroup(), az.VirtualMachinesClient.List(%s), err=%v", az.ResourceGroup, err) - return nil, err - } - - morePages := (result.Value != nil && len(*result.Value) > 1) - - for morePages { - allNodes = append(allNodes, *result.Value...) - - az.operationPollRateLimiter.Accept() - glog.V(10).Infof("VirtualMachinesClient.ListAllNextResults(%v): start", az.ResourceGroup) - result, err = az.VirtualMachinesClient.ListAllNextResults(result) - glog.V(10).Infof("VirtualMachinesClient.ListAllNextResults(%v): end", az.ResourceGroup) - if err != nil { - glog.Errorf("error: az.listAllNodesInResourceGroup(), az.VirtualMachinesClient.ListAllNextResults(%v), err=%v", result, err) - return nil, err - } - - morePages = (result.Value != nil && len(*result.Value) > 1) - } - - return allNodes, nil - -} - // mapNodeNameToVMName maps a k8s NodeName to an Azure VM Name // This is a simple string cast. func mapNodeNameToVMName(nodeName types.NodeName) string { diff --git a/pkg/cloudprovider/providers/azure/azure_loadbalancer.go b/pkg/cloudprovider/providers/azure/azure_loadbalancer.go index 3b970e87ed..2afb656830 100644 --- a/pkg/cloudprovider/providers/azure/azure_loadbalancer.go +++ b/pkg/cloudprovider/providers/azure/azure_loadbalancer.go @@ -39,86 +39,31 @@ const ServiceAnnotationLoadBalancerInternal = "service.beta.kubernetes.io/azure- // to specify what subnet it is exposed on const ServiceAnnotationLoadBalancerInternalSubnet = "service.beta.kubernetes.io/azure-load-balancer-internal-subnet" +// ServiceAnnotationLoadBalancerMode is the annotation used on the service to specify the +// Azure load balancer selection based on availability sets +const ServiceAnnotationLoadBalancerMode = "service.beta.kubernetes.io/azure-load-balancer-mode" + +// ServiceAnnotationLoadBalancerAutoModeValue the annotation used on the service to specify the +// Azure load balancer auto selection from the availability sets +const ServiceAnnotationLoadBalancerAutoModeValue = "__auto__" + // ServiceAnnotationDNSLabelName annotation speficying the DNS label name for the service. const ServiceAnnotationDNSLabelName = "service.beta.kubernetes.io/azure-dns-label-name" // GetLoadBalancer returns whether the specified load balancer exists, and // if so, what its status is. 
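To make the new `service.beta.kubernetes.io/azure-load-balancer-mode` annotation concrete, here is a minimal sketch of how a Service could opt into automatic load balancer selection or pin itself to specific availability sets. The annotation key and the `__auto__` value come from the constants introduced above; the `annotateLoadBalancerMode` helper and the surrounding Service definition are illustrative assumptions, not part of this patch.

```go
package main

import (
	"fmt"

	"k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// annotateLoadBalancerMode is a hypothetical helper (not part of this patch)
// that sets the azure-load-balancer-mode annotation on a Service.
// mode is either "__auto__" or a comma-separated list of availability sets,
// e.g. "as1,as2".
func annotateLoadBalancerMode(svc *v1.Service, mode string) {
	if svc.Annotations == nil {
		svc.Annotations = map[string]string{}
	}
	svc.Annotations["service.beta.kubernetes.io/azure-load-balancer-mode"] = mode
}

func main() {
	svc := v1.Service{
		ObjectMeta: metav1.ObjectMeta{Name: "service-1"},
		Spec: v1.ServiceSpec{
			Type:  v1.ServiceTypeLoadBalancer,
			Ports: []v1.ServicePort{{Name: "http", Port: 80, Protocol: v1.ProtocolTCP}},
		},
	}

	// Let the cloud provider pick an available load balancer automatically.
	annotateLoadBalancerMode(&svc, "__auto__")
	fmt.Println(svc.Annotations)

	// Or restrict the service to load balancers backed by specific availability sets.
	annotateLoadBalancerMode(&svc, "as1,as2")
	fmt.Println(svc.Annotations)
}
```

A comma-separated value such as `as1,as2` restricts the service to load balancers backed by those availability sets, which is the behavior exercised later by `testLoadBalancerServicesSpecifiedSelection` in the test diff.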
func (az *Cloud) GetLoadBalancer(clusterName string, service *v1.Service) (status *v1.LoadBalancerStatus, exists bool, err error) { - isInternal := requiresInternalLoadBalancer(service) - lbName := getLoadBalancerName(clusterName, isInternal) - serviceName := getServiceName(service) - - lb, existsLb, err := az.getAzureLoadBalancer(lbName) + _, status, exists, err = az.getServiceLoadBalancer(service, clusterName, nil, false) if err != nil { return nil, false, err } - if !existsLb { - glog.V(5).Infof("get(%s): lb(%s) - doesn't exist", serviceName, lbName) - return nil, false, nil + if exists == false { + serviceName := getServiceName(service) + glog.V(5).Infof("getloadbalancer (cluster:%s) (service:%s)- IP doesn't exist in any of the lbs", clusterName, serviceName) + return nil, false, fmt.Errorf("Service(%s) - Loadbalancer not found", serviceName) } - var lbIP *string - - if isInternal { - lbFrontendIPConfigName := getFrontendIPConfigName(service, subnet(service)) - for _, ipConfiguration := range *lb.FrontendIPConfigurations { - if lbFrontendIPConfigName == *ipConfiguration.Name { - lbIP = ipConfiguration.PrivateIPAddress - break - } - } - } else { - // TODO: Consider also read address from lb's FrontendIPConfigurations - pipName, err := az.determinePublicIPName(clusterName, service) - if err != nil { - return nil, false, err - } - pip, existsPip, err := az.getPublicIPAddress(pipName) - if err != nil { - return nil, false, err - } - if existsPip { - lbIP = pip.IPAddress - } - } - - if lbIP == nil { - glog.V(5).Infof("get(%s): lb(%s) - IP doesn't exist", serviceName, lbName) - return nil, false, nil - } - - return &v1.LoadBalancerStatus{ - Ingress: []v1.LoadBalancerIngress{{IP: *lbIP}}, - }, true, nil -} - -func (az *Cloud) determinePublicIPName(clusterName string, service *v1.Service) (string, error) { - loadBalancerIP := service.Spec.LoadBalancerIP - if len(loadBalancerIP) == 0 { - return getPublicIPName(clusterName, service), nil - } - - az.operationPollRateLimiter.Accept() - glog.V(10).Infof("PublicIPAddressesClient.List(%v): start", az.ResourceGroup) - list, err := az.PublicIPAddressesClient.List(az.ResourceGroup) - glog.V(10).Infof("PublicIPAddressesClient.List(%v): end", az.ResourceGroup) - if err != nil { - return "", err - } - - if list.Value != nil { - for ix := range *list.Value { - ip := &(*list.Value)[ix] - if ip.PublicIPAddressPropertiesFormat.IPAddress != nil && - *ip.PublicIPAddressPropertiesFormat.IPAddress == loadBalancerIP { - return *ip.Name, nil - } - } - } - // TODO: follow next link here? Will there really ever be that many public IPs? - - return "", fmt.Errorf("user supplied IP Address %s was not found", loadBalancerIP) + return status, true, nil } func getPublicIPLabel(service *v1.Service) string { @@ -130,193 +75,35 @@ func getPublicIPLabel(service *v1.Service) string { // EnsureLoadBalancer creates a new load balancer 'name', or updates the existing one. Returns the status of the balancer func (az *Cloud) EnsureLoadBalancer(clusterName string, service *v1.Service, nodes []*v1.Node) (*v1.LoadBalancerStatus, error) { - isInternal := requiresInternalLoadBalancer(service) - lbName := getLoadBalancerName(clusterName, isInternal) - // When a client updates the internal load balancer annotation, // the service may be switched from an internal LB to a public one, or vise versa. // Here we'll firstly ensure service do not lie in the opposite LB. 
- err := az.cleanupLoadBalancer(clusterName, service, !isInternal) - if err != nil { - return nil, err - } - serviceName := getServiceName(service) - glog.V(5).Infof("ensure(%s): START clusterName=%q lbName=%q", serviceName, clusterName, lbName) + glog.V(5).Infof("ensureloadbalancer(%s): START clusterName=%q", serviceName, clusterName) + flipedService := flipServiceInternalAnnotation(service) + if _, err := az.reconcileLoadBalancer(clusterName, flipedService, nil, false /* wantLb */); err != nil { + return nil, err + } - lb, existsLb, err := az.getAzureLoadBalancer(lbName) + if _, err := az.reconcilePublicIP(clusterName, service, true /* wantLb */); err != nil { + return nil, err + } + + lb, err := az.reconcileLoadBalancer(clusterName, service, nodes, true /* wantLb */) if err != nil { return nil, err } - if !existsLb { - lb = network.LoadBalancer{ - Name: &lbName, - Location: &az.Location, - LoadBalancerPropertiesFormat: &network.LoadBalancerPropertiesFormat{}, - } - } - var lbIP *string - var fipConfigurationProperties *network.FrontendIPConfigurationPropertiesFormat - - if isInternal { - subnetName := subnet(service) - if subnetName == nil { - subnetName = &az.SubnetName - } - subnet, existsSubnet, err := az.getSubnet(az.VnetName, *subnetName) - if err != nil { - return nil, err - } - - if !existsSubnet { - return nil, fmt.Errorf("ensure(%s): lb(%s) - failed to get subnet: %s/%s", serviceName, lbName, az.VnetName, az.SubnetName) - } - - configProperties := network.FrontendIPConfigurationPropertiesFormat{ - Subnet: &network.Subnet{ - ID: subnet.ID, - }, - } - - loadBalancerIP := service.Spec.LoadBalancerIP - if loadBalancerIP != "" { - configProperties.PrivateIPAllocationMethod = network.Static - configProperties.PrivateIPAddress = &loadBalancerIP - lbIP = &loadBalancerIP - } else { - // We'll need to call GetLoadBalancer later to retrieve allocated IP. 
- configProperties.PrivateIPAllocationMethod = network.Dynamic - } - - fipConfigurationProperties = &configProperties - } else { - pipName, err := az.determinePublicIPName(clusterName, service) - if err != nil { - return nil, err - } - domainNameLabel := getPublicIPLabel(service) - pip, err := az.ensurePublicIPExists(serviceName, pipName, domainNameLabel) - if err != nil { - return nil, err - } - - lbIP = pip.IPAddress - fipConfigurationProperties = &network.FrontendIPConfigurationPropertiesFormat{ - PublicIPAddress: &network.PublicIPAddress{ID: pip.ID}, - } - } - - lb, lbNeedsUpdate, err := az.reconcileLoadBalancer(lb, fipConfigurationProperties, clusterName, service, nodes) + lbStatus, err := az.getServiceLoadBalancerStatus(service, lb) if err != nil { return nil, err } - if !existsLb || lbNeedsUpdate { - glog.V(3).Infof("ensure(%s): lb(%s) - updating", serviceName, lbName) - az.operationPollRateLimiter.Accept() - glog.V(10).Infof("LoadBalancerClient.CreateOrUpdate(%q): start", *lb.Name) - respChan, errChan := az.LoadBalancerClient.CreateOrUpdate(az.ResourceGroup, *lb.Name, lb, nil) - resp := <-respChan - err := <-errChan - glog.V(10).Infof("LoadBalancerClient.CreateOrUpdate(%q): end", *lb.Name) - if az.CloudProviderBackoff && shouldRetryAPIRequest(resp.Response, err) { - glog.V(2).Infof("ensure(%s) backing off: lb(%s) - updating", serviceName, lbName) - retryErr := az.CreateOrUpdateLBWithRetry(lb) - if retryErr != nil { - glog.V(2).Infof("ensure(%s) abort backoff: lb(%s) - updating", serviceName, lbName) - return nil, retryErr - } - } - if err != nil { - return nil, err - } - } - var lbStatus *v1.LoadBalancerStatus - if lbIP == nil { - lbStatus, exists, err := az.GetLoadBalancer(clusterName, service) - if err != nil { - return nil, err - } - if !exists { - return nil, fmt.Errorf("ensure(%s): lb(%s) - failed to get back load balancer", serviceName, lbName) - } - lbIP = &lbStatus.Ingress[0].IP - } - - az.operationPollRateLimiter.Accept() - glog.V(10).Infof("SecurityGroupsClient.Get(%q): start", az.SecurityGroupName) - sg, err := az.SecurityGroupsClient.Get(az.ResourceGroup, az.SecurityGroupName, "") - glog.V(10).Infof("SecurityGroupsClient.Get(%q): end", az.SecurityGroupName) - if err != nil { + if _, err := az.reconcileSecurityGroup(clusterName, service, lbStatus, true /* wantLb */); err != nil { return nil, err } - sg, sgNeedsUpdate, err := az.reconcileSecurityGroup(sg, clusterName, service, lbIP, true /* wantLb */) - if err != nil { - return nil, err - } - if sgNeedsUpdate { - glog.V(3).Infof("ensure(%s): sg(%s) - updating", serviceName, *sg.Name) - az.operationPollRateLimiter.Accept() - glog.V(10).Infof("SecurityGroupsClient.CreateOrUpdate(%q): start", *sg.Name) - respChan, errChan := az.SecurityGroupsClient.CreateOrUpdate(az.ResourceGroup, *sg.Name, sg, nil) - resp := <-respChan - err := <-errChan - glog.V(10).Infof("SecurityGroupsClient.CreateOrUpdate(%q): end", *sg.Name) - if az.CloudProviderBackoff && shouldRetryAPIRequest(resp.Response, err) { - glog.V(2).Infof("ensure(%s) backing off: sg(%s) - updating", serviceName, *sg.Name) - retryErr := az.CreateOrUpdateSGWithRetry(sg) - if retryErr != nil { - glog.V(2).Infof("ensure(%s) abort backoff: sg(%s) - updating", serviceName, *sg.Name) - return nil, retryErr - } - } - if err != nil { - return nil, err - } - } - // Add the machines to the backend pool if they're not already - lbBackendName := getBackendPoolName(clusterName) - lbBackendPoolID := az.getBackendPoolID(lbName, lbBackendName) - hostUpdates := make([]func() error, 
len(nodes)) - for i, node := range nodes { - localNodeName := node.Name - f := func() error { - err := az.ensureHostInPool(serviceName, types.NodeName(localNodeName), lbBackendPoolID) - if err != nil { - return fmt.Errorf("ensure(%s): lb(%s) - failed to ensure host in pool: %q", serviceName, lbName, err) - } - return nil - } - hostUpdates[i] = f - } - - errs := utilerrors.AggregateGoroutines(hostUpdates...) - if errs != nil { - return nil, utilerrors.Flatten(errs) - } - - glog.V(2).Infof("ensure(%s): lb(%s) finished", serviceName, lbName) - - if lbStatus != nil { - return lbStatus, nil - } - - if lbIP == nil { - lbStatus, exists, err := az.GetLoadBalancer(clusterName, service) - if err != nil { - return nil, err - } - if !exists { - return nil, fmt.Errorf("ensure(%s): lb(%s) - failed to get back load balancer", serviceName, lbName) - } - return lbStatus, nil - } - - return &v1.LoadBalancerStatus{ - Ingress: []v1.LoadBalancerIngress{{IP: *lbIP}}, - }, nil + return lbStatus, nil } // UpdateLoadBalancer updates hosts under the specified load balancer. @@ -332,146 +119,152 @@ func (az *Cloud) UpdateLoadBalancer(clusterName string, service *v1.Service, nod // have multiple underlying components, meaning a Get could say that the LB // doesn't exist even if some part of it is still laying around. func (az *Cloud) EnsureLoadBalancerDeleted(clusterName string, service *v1.Service) error { - isInternal := requiresInternalLoadBalancer(service) - lbName := getLoadBalancerName(clusterName, isInternal) serviceName := getServiceName(service) - - glog.V(5).Infof("delete(%s): START clusterName=%q lbName=%q", serviceName, clusterName, lbName) - - err := az.cleanupLoadBalancer(clusterName, service, isInternal) - if err != nil { + glog.V(5).Infof("delete(%s): START clusterName=%q", serviceName, clusterName) + if _, err := az.reconcileSecurityGroup(clusterName, service, nil, false /* wantLb */); err != nil { return err } - sg, existsSg, err := az.getSecurityGroup() - if err != nil { + if _, err := az.reconcileLoadBalancer(clusterName, service, nil, false /* wantLb */); err != nil { return err } - if existsSg { - reconciledSg, sgNeedsUpdate, reconcileErr := az.reconcileSecurityGroup(sg, clusterName, service, nil, false /* wantLb */) - if reconcileErr != nil { - return reconcileErr - } - if sgNeedsUpdate { - glog.V(3).Infof("delete(%s): sg(%s) - updating", serviceName, az.SecurityGroupName) - az.operationPollRateLimiter.Accept() - glog.V(10).Infof("SecurityGroupsClient.CreateOrUpdate(%q): start", *reconciledSg.Name) - respChan, errChan := az.SecurityGroupsClient.CreateOrUpdate(az.ResourceGroup, *reconciledSg.Name, reconciledSg, nil) - resp := <-respChan - err := <-errChan - glog.V(10).Infof("SecurityGroupsClient.CreateOrUpdate(%q): end", *reconciledSg.Name) - if az.CloudProviderBackoff && shouldRetryAPIRequest(resp.Response, err) { - glog.V(2).Infof("delete(%s) backing off: sg(%s) - updating", serviceName, az.SecurityGroupName) - retryErr := az.CreateOrUpdateSGWithRetry(reconciledSg) - if retryErr != nil { - err = retryErr - glog.V(2).Infof("delete(%s) abort backoff: sg(%s) - updating", serviceName, az.SecurityGroupName) - } - } - if err != nil { - return err - } - } + + if _, err := az.reconcilePublicIP(clusterName, service, false /* wantLb */); err != nil { + return err } glog.V(2).Infof("delete(%s): FINISH", serviceName) return nil } -func (az *Cloud) cleanupLoadBalancer(clusterName string, service *v1.Service, isInternalLb bool) error { - lbName := getLoadBalancerName(clusterName, isInternalLb) - serviceName 
:= getServiceName(service) +// getServiceLoadBalancer gets the loadbalancer for the service if it already exits +// If wantLb is TRUE then -it selects a new load balancer +// In case the selected load balancer does not exists it returns network.LoadBalancer struct +// with added metadata (such as name, location) and existsLB set to FALSE +// By default - cluster default LB is returned +func (az *Cloud) getServiceLoadBalancer(service *v1.Service, clusterName string, nodes []*v1.Node, wantLb bool) (lb *network.LoadBalancer, status *v1.LoadBalancerStatus, exists bool, err error) { + isInternal := requiresInternalLoadBalancer(service) + var defaultLB *network.LoadBalancer + defaultLBName := az.getLoadBalancerName(clusterName, az.Config.PrimaryAvailabilitySetName, isInternal) - glog.V(10).Infof("ensure lb deleted: clusterName=%q, serviceName=%s, lbName=%q", clusterName, serviceName, lbName) - - lb, existsLb, err := az.getAzureLoadBalancer(lbName) + lbs, err := az.ListLBWithRetry() if err != nil { - return err + return nil, nil, false, err } - if existsLb { - var publicIPToCleanup *string - - if !isInternalLb { - // Find public ip resource to clean up from IP configuration - lbFrontendIPConfigName := getFrontendIPConfigName(service, nil) - for _, config := range *lb.FrontendIPConfigurations { - if strings.EqualFold(*config.Name, lbFrontendIPConfigName) { - if config.PublicIPAddress != nil { - // Only ID property is available - publicIPToCleanup = config.PublicIPAddress.ID - } - break - } + if lbs != nil { + for lbx := range lbs { + lb := &(lbs[lbx]) + if strings.EqualFold(*lb.Name, defaultLBName) { + defaultLB = lb } + if isInternalLoadBalancer(lb) != isInternal { + continue + } + status, err = az.getServiceLoadBalancerStatus(service, lb) + if err != nil { + return nil, nil, false, err + } + if status == nil { + // service is not om this load balancer + continue + } + + return lb, status, true, nil + } + } + // service does not have a load balancer, select one + if wantLb { + // select new load balancer for service + lb, exists, err = az.selectLoadBalancer(clusterName, service, &lbs, nodes) + if err != nil { + return nil, nil, false, err } - lb, lbNeedsUpdate, reconcileErr := az.reconcileLoadBalancer(lb, nil, clusterName, service, []*v1.Node{}) - if reconcileErr != nil { - return reconcileErr + return lb, nil, exists, err + } + if defaultLB == nil { + defaultLB = &network.LoadBalancer{ + Name: &defaultLBName, + Location: &az.Location, + LoadBalancerPropertiesFormat: &network.LoadBalancerPropertiesFormat{}, } - if lbNeedsUpdate { - if len(*lb.FrontendIPConfigurations) > 0 { - glog.V(3).Infof("delete(%s): lb(%s) - updating", serviceName, lbName) - az.operationPollRateLimiter.Accept() - glog.V(10).Infof("LoadBalancerClient.CreateOrUpdate(%q): start", *lb.Name) - respChan, errChan := az.LoadBalancerClient.CreateOrUpdate(az.ResourceGroup, *lb.Name, lb, nil) - resp := <-respChan - err := <-errChan - glog.V(10).Infof("LoadBalancerClient.CreateOrUpdate(%q): end", *lb.Name) - if az.CloudProviderBackoff && shouldRetryAPIRequest(resp.Response, err) { - glog.V(2).Infof("delete(%s) backing off: sg(%s) - updating", serviceName, az.SecurityGroupName) - retryErr := az.CreateOrUpdateLBWithRetry(lb) - if retryErr != nil { - err = retryErr - glog.V(2).Infof("delete(%s) abort backoff: sg(%s) - updating", serviceName, az.SecurityGroupName) - } - } - if err != nil { - return err - } + } + + return defaultLB, nil, false, nil +} + +func (az *Cloud) getServiceLoadBalancerStatus(service *v1.Service, lb 
*network.LoadBalancer) (status *v1.LoadBalancerStatus, err error) { + if lb == nil { + glog.V(10).Infof("getServiceLoadBalancerStatus lb is nil") + return nil, nil + } + if lb.FrontendIPConfigurations == nil || *lb.FrontendIPConfigurations == nil { + return nil, nil + } + isInternal := requiresInternalLoadBalancer(service) + lbFrontendIPConfigName := getFrontendIPConfigName(service, subnet(service)) + serviceName := getServiceName(service) + for _, ipConfiguration := range *lb.FrontendIPConfigurations { + if lbFrontendIPConfigName == *ipConfiguration.Name { + var lbIP *string + if isInternal { + lbIP = ipConfiguration.PrivateIPAddress } else { - glog.V(3).Infof("delete(%s): lb(%s) - deleting; no remaining frontendipconfigs", serviceName, lbName) - - az.operationPollRateLimiter.Accept() - glog.V(10).Infof("LoadBalancerClient.Delete(%q): start", lbName) - respChan, errChan := az.LoadBalancerClient.Delete(az.ResourceGroup, lbName, nil) - resp := <-respChan - err := <-errChan - glog.V(10).Infof("LoadBalancerClient.Delete(%q): end", lbName) - if az.CloudProviderBackoff && shouldRetryAPIRequest(resp, err) { - glog.V(2).Infof("delete(%s) backing off: lb(%s) - deleting; no remaining frontendipconfigs", serviceName, lbName) - retryErr := az.DeleteLBWithRetry(lbName) - if retryErr != nil { - err = retryErr - glog.V(2).Infof("delete(%s) abort backoff: lb(%s) - deleting; no remaining frontendipconfigs", serviceName, lbName) - } + if ipConfiguration.PublicIPAddress == nil { + return nil, fmt.Errorf("get(%s): lb(%s) - failed to get LB PublicIPAddress is Nil", serviceName, *lb.Name) } + pipID := ipConfiguration.PublicIPAddress.ID + if pipID == nil { + return nil, fmt.Errorf("get(%s): lb(%s) - failed to get LB PublicIPAddress ID is Nil", serviceName, *lb.Name) + } + pipName, err := getLastSegment(*pipID) if err != nil { - return err + return nil, fmt.Errorf("get(%s): lb(%s) - failed to get LB PublicIPAddress Name from ID(%s)", serviceName, *lb.Name, *pipID) + } + pip, existsPip, err := az.getPublicIPAddress(pipName) + if err != nil { + return nil, err + } + if existsPip { + lbIP = pip.IPAddress } } - } - // Public IP can be deleted after frontend ip configuration rule deleted. - if publicIPToCleanup != nil { - // Only delete an IP address if we created it, deducing by name. 
- if index := strings.LastIndex(*publicIPToCleanup, "/"); index != -1 { - managedPipName := getPublicIPName(clusterName, service) - pipName := (*publicIPToCleanup)[index+1:] - if strings.EqualFold(managedPipName, pipName) { - glog.V(5).Infof("Deleting public IP resource %q.", pipName) - err = az.ensurePublicIPDeleted(serviceName, pipName) - if err != nil { - return err - } - } else { - glog.V(5).Infof("Public IP resource %q found, but it does not match managed name %q, skip deleting.", pipName, managedPipName) - } - } + return &v1.LoadBalancerStatus{Ingress: []v1.LoadBalancerIngress{{IP: *lbIP}}}, nil } } - return nil + return nil, nil +} + +func (az *Cloud) determinePublicIPName(clusterName string, service *v1.Service) (string, error) { + loadBalancerIP := service.Spec.LoadBalancerIP + if len(loadBalancerIP) == 0 { + return getPublicIPName(clusterName, service), nil + } + + pips, err := az.ListPIPWithRetry() + if err != nil { + return "", err + } + + for _, pip := range pips { + if pip.PublicIPAddressPropertiesFormat.IPAddress != nil && + *pip.PublicIPAddressPropertiesFormat.IPAddress == loadBalancerIP { + return *pip.Name, nil + } + } + return "", fmt.Errorf("user supplied IP Address %s was not found", loadBalancerIP) +} + +func flipServiceInternalAnnotation(service *v1.Service) *v1.Service { + copyService := service.DeepCopy() + if _, ok := copyService.Annotations[ServiceAnnotationLoadBalancerInternal]; ok { + delete(copyService.Annotations, ServiceAnnotationLoadBalancerInternal) + } else { + copyService.Annotations[ServiceAnnotationLoadBalancerInternal] = "true" + } + return copyService } func (az *Cloud) ensurePublicIPExists(serviceName, pipName, domainNameLabel string) (*network.PublicIPAddress, error) { @@ -494,7 +287,6 @@ func (az *Cloud) ensurePublicIPExists(serviceName, pipName, domainNameLabel stri } } pip.Tags = &map[string]*string{"service": &serviceName} - glog.V(3).Infof("ensure(%s): pip(%s) - creating", serviceName, *pip.Name) az.operationPollRateLimiter.Accept() glog.V(10).Infof("PublicIPAddressesClient.CreateOrUpdate(%q): start", *pip.Name) @@ -523,44 +315,27 @@ func (az *Cloud) ensurePublicIPExists(serviceName, pipName, domainNameLabel stri } return &pip, nil - -} - -func (az *Cloud) ensurePublicIPDeleted(serviceName, pipName string) error { - glog.V(2).Infof("ensure(%s): pip(%s) - deleting", serviceName, pipName) - az.operationPollRateLimiter.Accept() - glog.V(10).Infof("PublicIPAddressesClient.Delete(%q): start", pipName) - resp, deleteErrChan := az.PublicIPAddressesClient.Delete(az.ResourceGroup, pipName, nil) - deleteErr := <-deleteErrChan - glog.V(10).Infof("PublicIPAddressesClient.Delete(%q): end", pipName) // response not read yet... - if az.CloudProviderBackoff && shouldRetryAPIRequest(<-resp, deleteErr) { - glog.V(2).Infof("ensure(%s) backing off: pip(%s) - deleting", serviceName, pipName) - retryErr := az.DeletePublicIPWithRetry(pipName) - if retryErr != nil { - glog.V(2).Infof("ensure(%s) abort backoff: pip(%s) - deleting", serviceName, pipName) - return retryErr - } - } - _, realErr := checkResourceExistsFromError(deleteErr) - if realErr != nil { - return nil - } - return nil } // This ensures load balancer exists and the frontend ip config is setup. // This also reconciles the Service's Ports with the LoadBalancer config. // This entails adding rules/probes for expected Ports and removing stale rules/ports. 
-func (az *Cloud) reconcileLoadBalancer(lb network.LoadBalancer, fipConfigurationProperties *network.FrontendIPConfigurationPropertiesFormat, clusterName string, service *v1.Service, nodes []*v1.Node) (network.LoadBalancer, bool, error) { +// nodes only used if wantLB is true +func (az *Cloud) reconcileLoadBalancer(clusterName string, service *v1.Service, nodes []*v1.Node, wantLb bool) (*network.LoadBalancer, error) { isInternal := requiresInternalLoadBalancer(service) - lbName := getLoadBalancerName(clusterName, isInternal) serviceName := getServiceName(service) + glog.V(2).Infof("reconcileLoadBalancer(%s) - wantLB(%t): started", serviceName, wantLb) + lb, _, _, err := az.getServiceLoadBalancer(service, clusterName, nodes, wantLb) + if err != nil { + return nil, err + } + lbName := *lb.Name + glog.V(2).Infof("reconcileLoadBalancer(%s): lb(%s) wantLB(%t) resolved load balancer name", serviceName, lbName, wantLb) lbFrontendIPConfigName := getFrontendIPConfigName(service, subnet(service)) lbFrontendIPConfigID := az.getFrontendIPConfigID(lbName, lbFrontendIPConfigName) lbBackendPoolName := getBackendPoolName(clusterName) lbBackendPoolID := az.getBackendPoolID(lbName, lbBackendPoolName) - wantLb := fipConfigurationProperties != nil dirtyLb := false // Ensure LoadBalancer's Backend Pool Configuration @@ -597,6 +372,7 @@ func (az *Cloud) reconcileLoadBalancer(lb network.LoadBalancer, fipConfiguration if lb.FrontendIPConfigurations != nil { newConfigs = *lb.FrontendIPConfigurations } + if !wantLb { for i := len(newConfigs) - 1; i >= 0; i-- { config := newConfigs[i] @@ -625,6 +401,51 @@ func (az *Cloud) reconcileLoadBalancer(lb network.LoadBalancer, fipConfiguration } } if !foundConfig { + // construct FrontendIPConfigurationPropertiesFormat + var fipConfigurationProperties *network.FrontendIPConfigurationPropertiesFormat + if isInternal { + subnetName := subnet(service) + if subnetName == nil { + subnetName = &az.SubnetName + } + subnet, existsSubnet, err := az.getSubnet(az.VnetName, *subnetName) + if err != nil { + return nil, err + } + + if !existsSubnet { + return nil, fmt.Errorf("ensure(%s): lb(%s) - failed to get subnet: %s/%s", serviceName, lbName, az.VnetName, az.SubnetName) + } + + configProperties := network.FrontendIPConfigurationPropertiesFormat{ + Subnet: &subnet, + } + + loadBalancerIP := service.Spec.LoadBalancerIP + if loadBalancerIP != "" { + configProperties.PrivateIPAllocationMethod = network.Static + configProperties.PrivateIPAddress = &loadBalancerIP + } else { + // We'll need to call GetLoadBalancer later to retrieve allocated IP. 
+ configProperties.PrivateIPAllocationMethod = network.Dynamic + } + + fipConfigurationProperties = &configProperties + } else { + pipName, err := az.determinePublicIPName(clusterName, service) + if err != nil { + return nil, err + } + domainNameLabel := getPublicIPLabel(service) + pip, err := az.ensurePublicIPExists(serviceName, pipName, domainNameLabel) + if err != nil { + return nil, err + } + fipConfigurationProperties = &network.FrontendIPConfigurationPropertiesFormat{ + PublicIPAddress: &network.PublicIPAddress{ID: pip.ID}, + } + } + newConfigs = append(newConfigs, network.FrontendIPConfiguration{ Name: to.StringPtr(lbFrontendIPConfigName), @@ -654,7 +475,7 @@ func (az *Cloud) reconcileLoadBalancer(lb network.LoadBalancer, fipConfiguration transportProto, _, probeProto, err := getProtocolsFromKubernetesProtocol(port.Protocol) if err != nil { - return lb, false, err + return nil, err } if serviceapi.NeedsHealthCheck(service) { @@ -662,7 +483,7 @@ func (az *Cloud) reconcileLoadBalancer(lb network.LoadBalancer, fipConfiguration // ERROR: this isn't supported // health check (aka source ip preservation) is not // compatible with UDP (it uses an HTTP check) - return lb, false, fmt.Errorf("services requiring health checks are incompatible with UDP ports") + return nil, fmt.Errorf("services requiring health checks are incompatible with UDP ports") } podPresencePath, podPresencePort := serviceapi.GetServiceHealthCheckPathPort(service) @@ -803,24 +624,115 @@ func (az *Cloud) reconcileLoadBalancer(lb network.LoadBalancer, fipConfiguration lb.LoadBalancingRules = &updatedRules } - return lb, dirtyLb, nil + // We don't care if the LB exists or not + // We only care about if there is any change in the LB, which means dirtyLB + // If it is not exist, and no change to that, we don't CreateOrUpdate LB + if dirtyLb { + if lb.FrontendIPConfigurations == nil || len(*lb.FrontendIPConfigurations) == 0 { + // When FrontendIPConfigurations is empty, we need to delete the Azure LoadBalancer resource itself + // Because delete all FrontendIPConfigurations in LB is not supported, we have to delete the LB itself + glog.V(3).Infof("delete(%s): lb(%s) - deleting; no remaining frontendipconfigs", serviceName, lbName) + + az.operationPollRateLimiter.Accept() + glog.V(10).Infof("LoadBalancerClient.Delete(%q): start", lbName) + respChan, errChan := az.LoadBalancerClient.Delete(az.ResourceGroup, lbName, nil) + resp := <-respChan + err := <-errChan + glog.V(10).Infof("LoadBalancerClient.Delete(%q): end", lbName) + if az.CloudProviderBackoff && shouldRetryAPIRequest(resp, err) { + glog.V(2).Infof("delete(%s) backing off: lb(%s) - deleting; no remaining frontendipconfigs", serviceName, lbName) + retryErr := az.DeleteLBWithRetry(lbName) + if retryErr != nil { + err = retryErr + glog.V(2).Infof("delete(%s) abort backoff: lb(%s) - deleting; no remaining frontendipconfigs", serviceName, lbName) + } + } + if err != nil { + return nil, err + } + + } else { + glog.V(3).Infof("ensure(%s): lb(%s) - updating", serviceName, lbName) + az.operationPollRateLimiter.Accept() + glog.V(10).Infof("LoadBalancerClient.CreateOrUpdate(%q): start", lbName) + respChan, errChan := az.LoadBalancerClient.CreateOrUpdate(az.ResourceGroup, lbName, *lb, nil) + resp := <-respChan + err := <-errChan + glog.V(10).Infof("LoadBalancerClient.CreateOrUpdate(%q): end", lbName) + if az.CloudProviderBackoff && shouldRetryAPIRequest(resp.Response, err) { + glog.V(2).Infof("ensure(%s) backing off: lb(%s) - updating", serviceName, lbName) + retryErr := 
az.CreateOrUpdateLBWithRetry(*lb) + if retryErr != nil { + glog.V(2).Infof("ensure(%s) abort backoff: lb(%s) - updating", serviceName, lbName) + return nil, retryErr + } + } + if err != nil { + return nil, err + } + } + } + + if wantLb && nodes != nil { + // Add the machines to the backend pool if they're not already + availabilitySetName := az.mapLoadBalancerNameToAvailabilitySet(lbName, clusterName) + hostUpdates := make([]func() error, len(nodes)) + for i, node := range nodes { + localNodeName := node.Name + f := func() error { + err := az.ensureHostInPool(serviceName, types.NodeName(localNodeName), lbBackendPoolID, availabilitySetName) + if err != nil { + return fmt.Errorf("ensure(%s): lb(%s) - failed to ensure host in pool: %q", serviceName, lbName, err) + } + return nil + } + hostUpdates[i] = f + } + + errs := utilerrors.AggregateGoroutines(hostUpdates...) + if errs != nil { + return nil, utilerrors.Flatten(errs) + } + } + + glog.V(2).Infof("ensure(%s): lb(%s) finished", serviceName, lbName) + return lb, nil } // This reconciles the Network Security Group similar to how the LB is reconciled. // This entails adding required, missing SecurityRules and removing stale rules. -func (az *Cloud) reconcileSecurityGroup(sg network.SecurityGroup, clusterName string, service *v1.Service, lbIP *string, wantLb bool) (network.SecurityGroup, bool, error) { +func (az *Cloud) reconcileSecurityGroup(clusterName string, service *v1.Service, lbStatus *v1.LoadBalancerStatus, wantLb bool) (*network.SecurityGroup, error) { serviceName := getServiceName(service) + glog.V(5).Infof("ensure(%s): START clusterName=%q lbName=%q", serviceName, clusterName) + var ports []v1.ServicePort if wantLb { ports = service.Spec.Ports } else { ports = []v1.ServicePort{} } + az.operationPollRateLimiter.Accept() + glog.V(10).Infof("SecurityGroupsClient.Get(%q): start", az.SecurityGroupName) + sg, err := az.SecurityGroupsClient.Get(az.ResourceGroup, az.SecurityGroupName, "") + glog.V(10).Infof("SecurityGroupsClient.Get(%q): end", az.SecurityGroupName) + if err != nil { + return nil, err + } + + az.operationPollRateLimiter.Accept() + glog.V(10).Infof("SecurityGroupsClient.Get(%q): start", az.SecurityGroupName) + sg, err = az.SecurityGroupsClient.Get(az.ResourceGroup, az.SecurityGroupName, "") + glog.V(10).Infof("SecurityGroupsClient.Get(%q): end", az.SecurityGroupName) + if err != nil { + return nil, err + } destinationIPAddress := "" if wantLb { + // Get lbIP since we make up NSG rules based on ingress IP + lbIP := &lbStatus.Ingress[0].IP if lbIP == nil { - return sg, false, fmt.Errorf("No load balancer IP for setting up security rules for service %s", service.Name) + return &sg, fmt.Errorf("No load balancer IP for setting up security rules for service %s", service.Name) } destinationIPAddress = *lbIP } @@ -830,7 +742,7 @@ func (az *Cloud) reconcileSecurityGroup(sg network.SecurityGroup, clusterName st sourceRanges, err := serviceapi.GetLoadBalancerSourceRanges(service) if err != nil { - return sg, false, err + return nil, err } var sourceAddressPrefixes []string if sourceRanges == nil || serviceapi.IsAllowAll(sourceRanges) { @@ -847,7 +759,7 @@ func (az *Cloud) reconcileSecurityGroup(sg network.SecurityGroup, clusterName st for i, port := range ports { _, securityProto, _, err := getProtocolsFromKubernetesProtocol(port.Protocol) if err != nil { - return sg, false, err + return nil, err } for j := range sourceAddressPrefixes { ix := i*len(sourceAddressPrefixes) + j @@ -902,7 +814,7 @@ func (az *Cloud) reconcileSecurityGroup(sg 
network.SecurityGroup, clusterName st nextAvailablePriority, err := getNextAvailablePriority(updatedRules) if err != nil { - return sg, false, err + return nil, err } expectedRule.Priority = to.Int32Ptr(nextAvailablePriority) @@ -912,8 +824,90 @@ func (az *Cloud) reconcileSecurityGroup(sg network.SecurityGroup, clusterName st } if dirtySg { sg.SecurityRules = &updatedRules + glog.V(3).Infof("ensure(%s): sg(%s) - updating", serviceName, *sg.Name) + az.operationPollRateLimiter.Accept() + glog.V(10).Infof("SecurityGroupsClient.CreateOrUpdate(%q): start", *sg.Name) + respChan, errChan := az.SecurityGroupsClient.CreateOrUpdate(az.ResourceGroup, *sg.Name, sg, nil) + resp := <-respChan + err := <-errChan + glog.V(10).Infof("SecurityGroupsClient.CreateOrUpdate(%q): end", *sg.Name) + if az.CloudProviderBackoff && shouldRetryAPIRequest(resp.Response, err) { + glog.V(2).Infof("ensure(%s) backing off: sg(%s) - updating", serviceName, *sg.Name) + retryErr := az.CreateOrUpdateSGWithRetry(sg) + if retryErr != nil { + glog.V(2).Infof("ensure(%s) abort backoff: sg(%s) - updating", serviceName, *sg.Name) + return nil, retryErr + } + } + if err != nil { + return nil, err + } } - return sg, dirtySg, nil + return &sg, nil +} + +// This reconciles the PublicIP resources similar to how the LB is reconciled. +// This entails adding required, missing SecurityRules and removing stale rules. +func (az *Cloud) reconcilePublicIP(clusterName string, service *v1.Service, wantLb bool) (*network.PublicIPAddress, error) { + isInternal := requiresInternalLoadBalancer(service) + serviceName := getServiceName(service) + desiredPipName, err := az.determinePublicIPName(clusterName, service) + if err != nil { + return nil, err + } + + pips, err := az.ListPIPWithRetry() + if err != nil { + return nil, err + } + + for _, pip := range pips { + if pip.Tags != nil && + (*pip.Tags)["service"] != nil && + *(*pip.Tags)["service"] == serviceName { + // We need to process for pips belong to this service + pipName := *pip.Name + if wantLb && !isInternal && pipName == desiredPipName { + // This is the only case we should preserve the + // Public ip resource with match service tag + // We could do nothing here, we will ensure that out of the loop + } else { + // We use tag to decide which IP should be removed + glog.V(2).Infof("ensure(%s): pip(%s) - deleting", serviceName, pipName) + az.operationPollRateLimiter.Accept() + glog.V(10).Infof("PublicIPAddressesClient.Delete(%q): start", pipName) + resp, deleteErrChan := az.PublicIPAddressesClient.Delete(az.ResourceGroup, pipName, nil) + deleteErr := <-deleteErrChan + glog.V(10).Infof("PublicIPAddressesClient.Delete(%q): end", pipName) // response not read yet... 
+ if az.CloudProviderBackoff && shouldRetryAPIRequest(<-resp, deleteErr) { + glog.V(2).Infof("ensure(%s) backing off: pip(%s) - deleting", serviceName, pipName) + retryErr := az.DeletePublicIPWithRetry(pipName) + if retryErr != nil { + glog.V(2).Infof("ensure(%s) abort backoff: pip(%s) - deleting", serviceName, pipName) + return nil, retryErr + } + } + + deleteErr = ignoreStatusNotFoundFromError(deleteErr) + if deleteErr != nil { + return nil, deleteErr + } + glog.V(2).Infof("ensure(%s): pip(%s) - finished", serviceName, pipName) + } + } + + } + + if !isInternal && wantLb { + // Confirm desired public ip resource exists + var rpip *network.PublicIPAddress + domainNameLabel := getPublicIPLabel(service) + if rpip, err = az.ensurePublicIPExists(serviceName, desiredPipName, domainNameLabel); err != nil { + return nil, err + } + return rpip, nil + } + return nil, nil } func findProbe(probes []network.Probe, probe network.Probe) bool { @@ -945,7 +939,7 @@ func findSecurityRule(rules []network.SecurityRule, rule network.SecurityRule) b // This ensures the given VM's Primary NIC's Primary IP Configuration is // participating in the specified LoadBalancer Backend Pool. -func (az *Cloud) ensureHostInPool(serviceName string, nodeName types.NodeName, backendPoolID string) error { +func (az *Cloud) ensureHostInPool(serviceName string, nodeName types.NodeName, backendPoolID string, availabilitySetName string) error { var machine compute.VirtualMachine vmName := mapNodeNameToVMName(nodeName) az.operationPollRateLimiter.Accept() @@ -975,12 +969,12 @@ func (az *Cloud) ensureHostInPool(serviceName string, nodeName types.NodeName, b } // Check availability set - if az.PrimaryAvailabilitySetName != "" { - expectedAvailabilitySetName := az.getAvailabilitySetID(az.PrimaryAvailabilitySetName) + if availabilitySetName != "" { + expectedAvailabilitySetName := az.getAvailabilitySetID(availabilitySetName) if machine.AvailabilitySet == nil || !strings.EqualFold(*machine.AvailabilitySet.ID, expectedAvailabilitySetName) { glog.V(3).Infof( - "nicupdate(%s): skipping nic (%s) since it is not in the primaryAvailabilitSet(%s)", - serviceName, nicName, az.PrimaryAvailabilitySetName) + "nicupdate(%s): skipping nic (%s) since it is not in the availabilitSet(%s)", + serviceName, nicName, availabilitySetName) return nil } } @@ -1058,3 +1052,16 @@ func subnet(service *v1.Service) *string { return nil } + +func getServiceLoadBalancerMode(service *v1.Service) (hasMode bool, isAuto bool, asl []string) { + mode, hasMode := service.Annotations[ServiceAnnotationLoadBalancerMode] + isAuto = strings.EqualFold(mode, ServiceAnnotationLoadBalancerAutoModeValue) + if !isAuto { + asTagList := strings.TrimSpace(mode) + + // Break up list of "AS1,AS2" + asl = strings.Split(asTagList, ",") + } + + return hasMode, isAuto, asl +} diff --git a/pkg/cloudprovider/providers/azure/azure_loadbalancer.md b/pkg/cloudprovider/providers/azure/azure_loadbalancer.md new file mode 100644 index 0000000000..84a77a6784 --- /dev/null +++ b/pkg/cloudprovider/providers/azure/azure_loadbalancer.md @@ -0,0 +1,68 @@ +# Azure LoadBalancer + +The way azure define LoadBalancer is different with GCE or AWS. Azure's LB can have multiple frontend IP refs. The GCE and AWS can only allow one, if you want more, you better to have another LB. Because of the fact, Public IP is not part of the LB in Azure. NSG is not part of LB in Azure as well. However, you cannot delete them in parallel, Public IP can only be delete after LB's frontend IP ref is removed. 
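Before the sections below, a simplified, standalone sketch of how the mode annotation value is interpreted. The real logic lives in the `getServiceLoadBalancerMode` helper added in this patch; this version (plain map input, hard-coded annotation key, explicit early return when the annotation is absent) is only an approximation for illustration.

```go
package main

import (
	"fmt"
	"strings"
)

// parseLoadBalancerMode is a simplified approximation of the annotation
// handling in this patch: it reports whether the mode annotation is set,
// whether it requests automatic load balancer selection, and otherwise
// returns the comma-separated availability set names.
func parseLoadBalancerMode(annotations map[string]string) (hasMode bool, isAuto bool, availabilitySets []string) {
	mode, hasMode := annotations["service.beta.kubernetes.io/azure-load-balancer-mode"]
	if !hasMode {
		return false, false, nil
	}
	if strings.EqualFold(mode, "__auto__") {
		return true, true, nil
	}
	// Break up a list such as "as1,as2" into individual availability set names.
	for _, as := range strings.Split(strings.TrimSpace(mode), ",") {
		availabilitySets = append(availabilitySets, strings.TrimSpace(as))
	}
	return true, false, availabilitySets
}

func main() {
	fmt.Println(parseLoadBalancerMode(map[string]string{
		"service.beta.kubernetes.io/azure-load-balancer-mode": "__auto__",
	}))
	fmt.Println(parseLoadBalancerMode(map[string]string{
		"service.beta.kubernetes.io/azure-load-balancer-mode": "as1,as2",
	}))
}
```

Anything other than `__auto__` is treated as a comma-separated list of availability set names, which is what drives the per-availability-set load balancer selection described in the rest of this document.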
+
+LB, Public IP and NSG are peer Azure resources of the same tier. We need to make sure there is no coupling between their ensure loops; in other words, each of them is eventually reconciled regardless of the other resources' state, and depends only on the service state.
+
+Also, for Azure we cannot afford more than one service_controller worker: different services may operate on the same LB, and concurrent execution could conflict or produce unexpected results. AWS and GCE apparently do not have this problem, since they use one LB per service.
+
+There are two load balancers per availability set: internal and external. There is a limit on the number of services that can be associated with a single load balancer.
+By default the primary load balancer is selected. Services can be annotated to allow automatic selection among the available load balancers, or to name the specific availability sets that host the load balancers. Note that with auto selection or a specific availability set selection, if the availability set is lost (for example through downtime or a cluster scale-down), the services are currently not automatically reassigned to an available load balancer.
+Service annotation for auto and specific load balancer mode:
+
+- service.beta.kubernetes.io/azure-load-balancer-mode (__auto__|as1,as2...)
+
+## Introduced Functions
+
+- reconcileLoadBalancer(lb network.LoadBalancer, clusterName string, service *v1.Service, nodes []*v1.Node, wantLB bool) (network.LoadBalancer, error)
+  - Go through the LB's properties and update them based on wantLB
+  - If there is any change to the LB, whether or not the LB already exists
+  - Call the Azure API to CreateOrUpdate this LB, or Delete it if nothing is left
+  - return lb, err
+
+- reconcileSecurityGroup(sg network.SecurityGroup, clusterName string, service *v1.Service, wantLb bool) (network.SecurityGroup, error)
+  - Go through the NSG's properties and update them based on wantLb
+  - If there is any change to the NSG (the NSG should always exist)
+  - Call the Azure API to CreateOrUpdate this NSG
+  - return sg, err
+
+- reconcilePublicIP(pipName string, clusterName string, service *v1.Service, wantLB bool) (error)
+  - if wantLB and the LB is external,
+  - ensure the Azure Public IP resource is there
+  - when we ensure the Public IP, both its Name and Tag need to match the convention
+  - remove dangling Public IPs whose Name or Tag matches the service, but not both
+  - else, ensure the Azure Public IP resource is not there
+
+- getServiceLoadBalancer(service *v1.Service, clusterName string, nodes []*v1.Node, wantLb bool) (lb, status, exists, error)
+  - gets the load balancer for the service if it already exists
+  - If wantLb is TRUE it selects a new load balancer; the selection helps distribute the services across load balancers
+  - In case the selected load balancer does not exist, it returns a network.LoadBalancer struct with added metadata (such as name and location) and existsLB set to FALSE
+  - By default the cluster default LB is returned
+
+## Define interface behaviors
+
+### GetLoadBalancer
+
+- Get the LoadBalancer status; return status, error
+  - If it does not exist, ensure it is there
+
+### EnsureLoadBalancer
+
+- Reconcile LB's related but not owned resources, such as Public IP, NSG rules
+  - Call reconcileSecurityGroup(sg, clusterName, service, true)
+  - Call reconcilePublicIP(pipName, cluster, service, true)
+- Reconcile LB's related and owned resources, such as FrontEndIPConfig, Rules, Probe.
+ - Call reconcileLoadBalancer(lb, clusterName, service, nodes, true) + +### UpdateLoadBalancer + +- Has no difference with EnsureLoadBalancer + +### EnsureLoadBalancerDeleted + +- Reconcile NSG first, before reconcile LB, because SG need LB to be there + - Call reconcileSecurityGroup(sg, clusterName, service, false) +- Reconcile LB's related and owned resources, such as FrontEndIPConfig, Rules, Probe. + - Call reconcileLoadBalancer(lb, clusterName, service, nodes, false) +- Reconcile LB's related but not owned resources, such as Public IP + - Call reconcilePublicIP(pipName, cluster, service, false) \ No newline at end of file diff --git a/pkg/cloudprovider/providers/azure/azure_test.go b/pkg/cloudprovider/providers/azure/azure_test.go index c364b7f4d8..3bbdda0e7b 100644 --- a/pkg/cloudprovider/providers/azure/azure_test.go +++ b/pkg/cloudprovider/providers/azure/azure_test.go @@ -26,9 +26,13 @@ import ( "testing" "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/util/flowcontrol" serviceapi "k8s.io/kubernetes/pkg/api/v1/service" + kubeletapis "k8s.io/kubernetes/pkg/kubelet/apis" + "github.com/Azure/azure-sdk-for-go/arm/compute" "github.com/Azure/azure-sdk-for-go/arm/network" "github.com/Azure/go-autorest/autorest/to" ) @@ -36,12 +40,10 @@ import ( var testClusterName = "testCluster" // Test additional of a new service/port. -func TestReconcileLoadBalancerAddPort(t *testing.T) { +func TestAddPort(t *testing.T) { az := getTestCloud() svc := getTestService("servicea", v1.ProtocolTCP, 80) - configProperties := getTestPublicFipConfigurationProperties() - lb := getTestLoadBalancer() - nodes := []*v1.Node{} + clusterResources := getClusterResources(az, 1, 1) svc.Spec.Ports = append(svc.Spec.Ports, v1.ServicePort{ Name: fmt.Sprintf("port-udp-%d", 1234), @@ -50,15 +52,11 @@ func TestReconcileLoadBalancerAddPort(t *testing.T) { NodePort: getBackendPort(1234), }) - lb, updated, err := az.reconcileLoadBalancer(lb, &configProperties, testClusterName, &svc, nodes) + lb, err := az.reconcileLoadBalancer(testClusterName, &svc, clusterResources.nodes, true /* wantLb */) if err != nil { t.Errorf("Unexpected error: %q", err) } - if !updated { - t.Error("Expected the loadbalancer to need an update") - } - // ensure we got a frontend ip configuration if len(*lb.FrontendIPConfigurations) != 1 { t.Error("Expected the loadbalancer to have a frontend ip configuration") @@ -67,24 +65,302 @@ func TestReconcileLoadBalancerAddPort(t *testing.T) { validateLoadBalancer(t, lb, svc) } +func TestLoadBalancerInternalServiceModeSelection(t *testing.T) { + testLoadBalancerServiceDefaultModeSelection(t, true) + testLoadBalancerServiceAutoModeSelection(t, true) + testLoadBalancerServicesSpecifiedSelection(t, true) + testLoadBalancerMaxRulesServices(t, true) + testLoadBalancerServiceAutoModeDeleteSelection(t, true) +} + +func TestLoadBalancerExternalServiceModeSelection(t *testing.T) { + testLoadBalancerServiceDefaultModeSelection(t, false) + testLoadBalancerServiceAutoModeSelection(t, false) + testLoadBalancerServicesSpecifiedSelection(t, false) + testLoadBalancerMaxRulesServices(t, false) + testLoadBalancerServiceAutoModeDeleteSelection(t, false) +} + +func testLoadBalancerServiceDefaultModeSelection(t *testing.T, isInternal bool) { + az := getTestCloud() + const vmCount = 8 + const availabilitySetCount = 4 + const serviceCount = 9 + + clusterResources := getClusterResources(az, vmCount, availabilitySetCount) + getTestSecurityGroup(az) + + for index := 1; 
index <= serviceCount; index++ { + svcName := fmt.Sprintf("service-%d", index) + var svc v1.Service + if isInternal { + svc = getInternalTestService(svcName, 8081) + addTestSubnet(t, az, &svc) + } else { + svc = getTestService(svcName, v1.ProtocolTCP, 8081) + } + + lbStatus, err := az.EnsureLoadBalancer(testClusterName, &svc, clusterResources.nodes) + if err != nil { + t.Errorf("Unexpected error: %q", err) + } + if lbStatus == nil { + t.Errorf("Unexpected error: %s", svcName) + } + + expectedLBName := testClusterName + if isInternal { + expectedLBName = testClusterName + "-internal" + } + + result, _ := az.LoadBalancerClient.List(az.Config.ResourceGroup) + lb := (*result.Value)[0] + lbCount := len(*result.Value) + expectedNumOfLB := 1 + if lbCount != expectedNumOfLB { + t.Errorf("Unexpected number of LB's: Expected (%d) Found (%d)", expectedNumOfLB, lbCount) + } + + if !strings.EqualFold(*lb.Name, expectedLBName) { + t.Errorf("lb name should be the default LB name Extected (%s) Fouund (%s)", expectedLBName, *lb.Name) + } + + ruleCount := len(*lb.LoadBalancingRules) + if ruleCount != index { + t.Errorf("lb rule could should be equal to nuber of services deployed, expected (%d) Found (%d)", index, ruleCount) + } + } +} + +// Validate even distribution of external services across load balances +// based on number of availability sets +func testLoadBalancerServiceAutoModeSelection(t *testing.T, isInternal bool) { + az := getTestCloud() + const vmCount = 8 + const availabilitySetCount = 4 + const serviceCount = 9 + + clusterResources := getClusterResources(az, vmCount, availabilitySetCount) + getTestSecurityGroup(az) + + for index := 1; index <= serviceCount; index++ { + svcName := fmt.Sprintf("service-%d", index) + var svc v1.Service + if isInternal { + svc = getInternalTestService(svcName, 8081) + addTestSubnet(t, az, &svc) + } else { + svc = getTestService(svcName, v1.ProtocolTCP, 8081) + } + setLoadBalancerAutoModeAnnotation(&svc) + lbStatus, err := az.EnsureLoadBalancer(testClusterName, &svc, clusterResources.nodes) + if err != nil { + t.Errorf("Unexpected error: %q", err) + } + if lbStatus == nil { + t.Errorf("Unexpected error: %s", svcName) + } + + expectedNumOfLB := index % availabilitySetCount + if index >= availabilitySetCount { + expectedNumOfLB = availabilitySetCount + } + result, _ := az.LoadBalancerClient.List(az.Config.ResourceGroup) + lbCount := len(*result.Value) + if lbCount != expectedNumOfLB { + t.Errorf("Unexpected number of LB's: Expected (%d) Found (%d)", expectedNumOfLB, lbCount) + } + + maxRules := 0 + minRules := serviceCount + for x := range *result.Value { + lb := (*result.Value)[x] + ruleCount := len(*lb.LoadBalancingRules) + if ruleCount < minRules { + minRules = ruleCount + } + if ruleCount > maxRules { + maxRules = ruleCount + } + } + + delta := maxRules - minRules + if delta > 1 { + t.Errorf("Unexpected min or max rule in LB's in resource group: Service Index (%d) Min (%d) Max(%d)", index, minRules, maxRules) + } + } +} + +// Validate availability set selection of services across load balancers +// based on provided availability sets through service annotation +func testLoadBalancerServicesSpecifiedSelection(t *testing.T, isInternal bool) { + az := getTestCloud() + const vmCount = 8 + const availabilitySetCount = 4 + const serviceCount = 9 + + clusterResources := getClusterResources(az, vmCount, availabilitySetCount) + getTestSecurityGroup(az) + + selectedAvailabilitySetName1 := getASName(az, 1, availabilitySetCount) + selectedAvailabilitySetName2 := 
getASName(az, 2, availabilitySetCount) + for index := 1; index <= serviceCount; index++ { + svcName := fmt.Sprintf("service-%d", index) + var svc v1.Service + if isInternal { + svc = getInternalTestService(svcName, 8081) + addTestSubnet(t, az, &svc) + } else { + svc = getTestService(svcName, v1.ProtocolTCP, 8081) + } + lbMode := fmt.Sprintf("%s,%s", selectedAvailabilitySetName1, selectedAvailabilitySetName2) + setLoadBalancerModeAnnotation(&svc, lbMode) + + lbStatus, err := az.EnsureLoadBalancer(testClusterName, &svc, clusterResources.nodes) + if err != nil { + t.Errorf("Unexpected error: %q", err) + } + if lbStatus == nil { + t.Errorf("Unexpected error: %s", svcName) + } + + expectedNumOfLB := index % 2 + if index >= 2 { + expectedNumOfLB = 2 + } + result, _ := az.LoadBalancerClient.List(az.Config.ResourceGroup) + lbCount := len(*result.Value) + if lbCount != expectedNumOfLB { + t.Errorf("Unexpected number of LB's: Expected (%d) Found (%d)", expectedNumOfLB, lbCount) + } + } +} + +func testLoadBalancerMaxRulesServices(t *testing.T, isInternal bool) { + az := getTestCloud() + const vmCount = 1 + const availabilitySetCount = 1 + + clusterResources := getClusterResources(az, vmCount, availabilitySetCount) + getTestSecurityGroup(az) + + az.Config.MaximumLoadBalancerRuleCount = 1 + + for index := 1; index <= az.Config.MaximumLoadBalancerRuleCount; index++ { + svcName := fmt.Sprintf("service-%d", index) + var svc v1.Service + if isInternal { + svc = getInternalTestService(svcName, 8081) + addTestSubnet(t, az, &svc) + } else { + svc = getTestService(svcName, v1.ProtocolTCP, 8081) + } + + lbStatus, err := az.EnsureLoadBalancer(testClusterName, &svc, clusterResources.nodes) + if err != nil { + t.Errorf("Unexpected error: %q", err) + } + if lbStatus == nil { + t.Errorf("Unexpected error: %s", svcName) + } + + expectedNumOfLB := index % az.Config.MaximumLoadBalancerRuleCount + if index >= az.Config.MaximumLoadBalancerRuleCount { + expectedNumOfLB = az.Config.MaximumLoadBalancerRuleCount + } + result, _ := az.LoadBalancerClient.List(az.Config.ResourceGroup) + lbCount := len(*result.Value) + if lbCount != expectedNumOfLB { + t.Errorf("Unexpected number of LB's: Expected (%d) Found (%d)", expectedNumOfLB, lbCount) + } + } + + // validate adding a new service fails since it will exceed the max limit on LB + svcName := fmt.Sprintf("service-%d", az.Config.MaximumLoadBalancerRuleCount+1) + var svc v1.Service + if isInternal { + svc = getInternalTestService(svcName, 8081) + addTestSubnet(t, az, &svc) + } else { + svc = getTestService(svcName, v1.ProtocolTCP, 8081) + } + _, err := az.EnsureLoadBalancer(testClusterName, &svc, clusterResources.nodes) + if err == nil { + t.Errorf("Expect any new service to fail as max limit in lb has reached") + } +} + +// Validate even distribution of external services across load balances +// based on number of availability sets +func testLoadBalancerServiceAutoModeDeleteSelection(t *testing.T, isInternal bool) { + az := getTestCloud() + const vmCount = 8 + const availabilitySetCount = 4 + const serviceCount = 9 + + clusterResources := getClusterResources(az, vmCount, availabilitySetCount) + getTestSecurityGroup(az) + + for index := 1; index <= serviceCount; index++ { + svcName := fmt.Sprintf("service-%d", index) + var svc v1.Service + if isInternal { + svc = getInternalTestService(svcName, 8081) + addTestSubnet(t, az, &svc) + } else { + svc = getTestService(svcName, v1.ProtocolTCP, 8081) + } + setLoadBalancerAutoModeAnnotation(&svc) + lbStatus, err := 
az.EnsureLoadBalancer(testClusterName, &svc, clusterResources.nodes) + if err != nil { + t.Errorf("Unexpected error: %q", err) + } + if lbStatus == nil { + t.Errorf("Unexpected error: %s", svcName) + } + } + + for index := serviceCount; index >= 1; index-- { + svcName := fmt.Sprintf("service-%d", index) + var svc v1.Service + if isInternal { + svc = getInternalTestService(svcName, 8081) + addTestSubnet(t, az, &svc) + } else { + svc = getTestService(svcName, v1.ProtocolTCP, 8081) + } + + setLoadBalancerAutoModeAnnotation(&svc) + + expectedNumOfLB := index % availabilitySetCount + if index >= availabilitySetCount { + expectedNumOfLB = availabilitySetCount + } + result, _ := az.LoadBalancerClient.List(az.Config.ResourceGroup) + lbCount := len(*result.Value) + if lbCount != expectedNumOfLB { + t.Errorf("Unexpected number of LB's: Expected (%d) Found (%d)", expectedNumOfLB, lbCount) + } + + err := az.EnsureLoadBalancerDeleted(testClusterName, &svc) + if err != nil { + t.Errorf("Unexpected error: %q", err) + } + } +} + // Test addition of a new service on an internal LB with a subnet. func TestReconcileLoadBalancerAddServiceOnInternalSubnet(t *testing.T) { az := getTestCloud() + clusterResources := getClusterResources(az, 1, 1) svc := getInternalTestService("servicea", 80) - addTestSubnet(t, &svc) - configProperties := getTestInternalFipConfigurationProperties(to.StringPtr("TestSubnet")) - lb := getTestLoadBalancer() - nodes := []*v1.Node{} + addTestSubnet(t, az, &svc) - lb, updated, err := az.reconcileLoadBalancer(lb, &configProperties, testClusterName, &svc, nodes) + lb, err := az.reconcileLoadBalancer(testClusterName, &svc, clusterResources.nodes, true /* wantLb */) if err != nil { t.Errorf("Unexpected error: %q", err) } - if !updated { - t.Error("Expected the loadbalancer to need an update") - } - // ensure we got a frontend ip configuration if len(*lb.FrontendIPConfigurations) != 1 { t.Error("Expected the loadbalancer to have a frontend ip configuration") @@ -96,46 +372,48 @@ func TestReconcileLoadBalancerAddServiceOnInternalSubnet(t *testing.T) { // Test addition of services on an internal LB using both default and explicit subnets. 
func TestReconcileLoadBalancerAddServicesOnMultipleSubnets(t *testing.T) { az := getTestCloud() + clusterResources := getClusterResources(az, 1, 1) svc1 := getTestService("service1", v1.ProtocolTCP, 8081) svc2 := getInternalTestService("service2", 8081) - addTestSubnet(t, &svc2) - configProperties1 := getTestPublicFipConfigurationProperties() - configProperties2 := getTestInternalFipConfigurationProperties(to.StringPtr("TestSubnet")) - lb := getTestLoadBalancer() - nodes := []*v1.Node{} - lb, updated, err := az.reconcileLoadBalancer(lb, &configProperties1, testClusterName, &svc1, nodes) + // Internal and External service cannot reside on the same LB resource + addTestSubnet(t, az, &svc2) + + // svc1 is using LB without "-internal" suffix + lb, err := az.reconcileLoadBalancer(testClusterName, &svc1, clusterResources.nodes, true /* wantLb */) if err != nil { t.Errorf("Unexpected error reconciling svc1: %q", err) } - lb, updated, err = az.reconcileLoadBalancer(lb, &configProperties2, testClusterName, &svc2, nodes) + // ensure we got a frontend ip configuration for each service + if len(*lb.FrontendIPConfigurations) != 1 { + t.Error("Expected the loadbalancer to have 1 frontend ip configurations") + } + + validateLoadBalancer(t, lb, svc1) + + // svc2 is using LB with "-internal" suffix + lb, err = az.reconcileLoadBalancer(testClusterName, &svc2, nil, true /* wantLb */) if err != nil { t.Errorf("Unexpected error reconciling svc2: %q", err) } - if !updated { - t.Error("Expected the loadbalancer to need an update") - } - // ensure we got a frontend ip configuration for each service - if len(*lb.FrontendIPConfigurations) != 2 { - t.Error("Expected the loadbalancer to have 2 frontend ip configurations") + if len(*lb.FrontendIPConfigurations) != 1 { + t.Error("Expected the loadbalancer to have 1 frontend ip configurations") } - validateLoadBalancer(t, lb, svc1, svc2) + validateLoadBalancer(t, lb, svc2) } // Test moving a service exposure from one subnet to another. 
func TestReconcileLoadBalancerEditServiceSubnet(t *testing.T) { az := getTestCloud() + clusterResources := getClusterResources(az, 1, 1) svc := getInternalTestService("service1", 8081) - addTestSubnet(t, &svc) - configProperties := getTestInternalFipConfigurationProperties(to.StringPtr("TestSubnet")) - lb := getTestLoadBalancer() - nodes := []*v1.Node{} + addTestSubnet(t, az, &svc) - lb, updated, err := az.reconcileLoadBalancer(lb, &configProperties, testClusterName, &svc, nodes) + lb, err := az.reconcileLoadBalancer(testClusterName, &svc, clusterResources.nodes, true /* wantLb */) if err != nil { t.Errorf("Unexpected error reconciling initial svc: %q", err) } @@ -143,17 +421,13 @@ func TestReconcileLoadBalancerEditServiceSubnet(t *testing.T) { validateLoadBalancer(t, lb, svc) svc.Annotations[ServiceAnnotationLoadBalancerInternalSubnet] = "NewSubnet" - configProperties = getTestInternalFipConfigurationProperties(to.StringPtr("NewSubnet")) + addTestSubnet(t, az, &svc) - lb, updated, err = az.reconcileLoadBalancer(lb, &configProperties, testClusterName, &svc, nodes) + lb, err = az.reconcileLoadBalancer(testClusterName, &svc, clusterResources.nodes, true /* wantLb */) if err != nil { t.Errorf("Unexpected error reconciling edits to svc: %q", err) } - if !updated { - t.Error("Expected the loadbalancer to need an update") - } - // ensure we got a frontend ip configuration for the service if len(*lb.FrontendIPConfigurations) != 1 { t.Error("Expected the loadbalancer to have 1 frontend ip configuration") @@ -164,23 +438,16 @@ func TestReconcileLoadBalancerEditServiceSubnet(t *testing.T) { func TestReconcileLoadBalancerNodeHealth(t *testing.T) { az := getTestCloud() + clusterResources := getClusterResources(az, 1, 1) svc := getTestService("servicea", v1.ProtocolTCP, 80) svc.Spec.ExternalTrafficPolicy = v1.ServiceExternalTrafficPolicyTypeLocal svc.Spec.HealthCheckNodePort = int32(32456) - configProperties := getTestPublicFipConfigurationProperties() - lb := getTestLoadBalancer() - nodes := []*v1.Node{} - - lb, updated, err := az.reconcileLoadBalancer(lb, &configProperties, testClusterName, &svc, nodes) + lb, err := az.reconcileLoadBalancer(testClusterName, &svc, clusterResources.nodes, true /* wantLb */) if err != nil { t.Errorf("Unexpected error: %q", err) } - if !updated { - t.Error("Expected the loadbalancer to need an update") - } - // ensure we got a frontend ip configuration if len(*lb.FrontendIPConfigurations) != 1 { t.Error("Expected the loadbalancer to have a frontend ip configuration") @@ -192,24 +459,17 @@ func TestReconcileLoadBalancerNodeHealth(t *testing.T) { // Test removing all services results in removing the frontend ip configuration func TestReconcileLoadBalancerRemoveService(t *testing.T) { az := getTestCloud() + clusterResources := getClusterResources(az, 1, 1) svc := getTestService("servicea", v1.ProtocolTCP, 80, 443) - lb := getTestLoadBalancer() - configProperties := getTestPublicFipConfigurationProperties() - nodes := []*v1.Node{} - lb, updated, err := az.reconcileLoadBalancer(lb, &configProperties, testClusterName, &svc, nodes) - if err != nil { - t.Errorf("Unexpected error: %q", err) - } - validateLoadBalancer(t, lb, svc) - - lb, updated, err = az.reconcileLoadBalancer(lb, nil, testClusterName, &svc, nodes) + lb, err := az.reconcileLoadBalancer(testClusterName, &svc, clusterResources.nodes, true /* wantLb */) if err != nil { t.Errorf("Unexpected error: %q", err) } - if !updated { - t.Error("Expected the loadbalancer to need an update") + lb, err = 
az.reconcileLoadBalancer(testClusterName, &svc, clusterResources.nodes, false /* wantLb */) + if err != nil { + t.Errorf("Unexpected error: %q", err) } // ensure we abandoned the frontend ip configuration @@ -223,27 +483,21 @@ func TestReconcileLoadBalancerRemoveService(t *testing.T) { // Test removing all service ports results in removing the frontend ip configuration func TestReconcileLoadBalancerRemoveAllPortsRemovesFrontendConfig(t *testing.T) { az := getTestCloud() + clusterResources := getClusterResources(az, 1, 1) svc := getTestService("servicea", v1.ProtocolTCP, 80) - lb := getTestLoadBalancer() - configProperties := getTestPublicFipConfigurationProperties() - nodes := []*v1.Node{} - lb, updated, err := az.reconcileLoadBalancer(lb, &configProperties, testClusterName, &svc, nodes) + lb, err := az.reconcileLoadBalancer(testClusterName, &svc, clusterResources.nodes, true /* wantLb */) if err != nil { t.Errorf("Unexpected error: %q", err) } validateLoadBalancer(t, lb, svc) svcUpdated := getTestService("servicea", v1.ProtocolTCP) - lb, updated, err = az.reconcileLoadBalancer(lb, nil, testClusterName, &svcUpdated, nodes) + lb, err = az.reconcileLoadBalancer(testClusterName, &svcUpdated, clusterResources.nodes, false /* wantLb*/) if err != nil { t.Errorf("Unexpected error: %q", err) } - if !updated { - t.Error("Expected the loadbalancer to need an update") - } - // ensure we abandoned the frontend ip configuration if len(*lb.FrontendIPConfigurations) != 0 { t.Error("Expected the loadbalancer to have no frontend ip configuration") @@ -255,37 +509,36 @@ func TestReconcileLoadBalancerRemoveAllPortsRemovesFrontendConfig(t *testing.T) // Test removal of a port from an existing service. func TestReconcileLoadBalancerRemovesPort(t *testing.T) { az := getTestCloud() + clusterResources := getClusterResources(az, 1, 1) + svc := getTestService("servicea", v1.ProtocolTCP, 80, 443) - configProperties := getTestPublicFipConfigurationProperties() - nodes := []*v1.Node{} - - existingLoadBalancer := getTestLoadBalancer(svc) - - svcUpdated := getTestService("servicea", v1.ProtocolTCP, 80) - updatedLoadBalancer, _, err := az.reconcileLoadBalancer(existingLoadBalancer, &configProperties, testClusterName, &svcUpdated, nodes) + lb, err := az.reconcileLoadBalancer(testClusterName, &svc, clusterResources.nodes, true /* wantLb */) if err != nil { t.Errorf("Unexpected error: %q", err) } - validateLoadBalancer(t, updatedLoadBalancer, svcUpdated) + svcUpdated := getTestService("servicea", v1.ProtocolTCP, 80) + lb, err = az.reconcileLoadBalancer(testClusterName, &svcUpdated, clusterResources.nodes, true /* wantLb */) + if err != nil { + t.Errorf("Unexpected error: %q", err) + } + + validateLoadBalancer(t, lb, svcUpdated) } // Test reconciliation of multiple services on same port func TestReconcileLoadBalancerMultipleServices(t *testing.T) { az := getTestCloud() + clusterResources := getClusterResources(az, 1, 1) svc1 := getTestService("servicea", v1.ProtocolTCP, 80, 443) svc2 := getTestService("serviceb", v1.ProtocolTCP, 80) - configProperties := getTestPublicFipConfigurationProperties() - nodes := []*v1.Node{} - existingLoadBalancer := getTestLoadBalancer() - - updatedLoadBalancer, _, err := az.reconcileLoadBalancer(existingLoadBalancer, &configProperties, testClusterName, &svc1, nodes) + updatedLoadBalancer, err := az.reconcileLoadBalancer(testClusterName, &svc1, clusterResources.nodes, true /* wantLb */) if err != nil { t.Errorf("Unexpected error: %q", err) } - updatedLoadBalancer, _, err = 
az.reconcileLoadBalancer(updatedLoadBalancer, &configProperties, testClusterName, &svc2, nodes) + updatedLoadBalancer, err = az.reconcileLoadBalancer(testClusterName, &svc2, clusterResources.nodes, true /* wantLb */) if err != nil { t.Errorf("Unexpected error: %q", err) } @@ -378,11 +631,13 @@ func TestServiceRespectsClientIPSessionAffinity(t *testing.T) { func TestReconcileSecurityGroupNewServiceAddsPort(t *testing.T) { az := getTestCloud() - svc1 := getTestService("serviceea", v1.ProtocolTCP, 80) + getTestSecurityGroup(az) + svc1 := getTestService("servicea", v1.ProtocolTCP, 80) + clusterResources := getClusterResources(az, 1, 1) + lb, _ := az.reconcileLoadBalancer(testClusterName, &svc1, clusterResources.nodes, true) + lbStatus, _ := az.getServiceLoadBalancerStatus(&svc1, lb) - sg := getTestSecurityGroup() - - sg, _, err := az.reconcileSecurityGroup(sg, testClusterName, &svc1, to.StringPtr("192.168.0.0"), true) + sg, err := az.reconcileSecurityGroup(testClusterName, &svc1, lbStatus, true /* wantLb */) if err != nil { t.Errorf("Unexpected error: %q", err) } @@ -392,11 +647,14 @@ func TestReconcileSecurityGroupNewServiceAddsPort(t *testing.T) { func TestReconcileSecurityGroupNewInternalServiceAddsPort(t *testing.T) { az := getTestCloud() + getTestSecurityGroup(az) svc1 := getInternalTestService("serviceea", 80) + addTestSubnet(t, az, &svc1) + clusterResources := getClusterResources(az, 1, 1) - sg := getTestSecurityGroup() - - sg, _, err := az.reconcileSecurityGroup(sg, testClusterName, &svc1, to.StringPtr("192.168.0.0"), true) + lb, _ := az.reconcileLoadBalancer(testClusterName, &svc1, clusterResources.nodes, true) + lbStatus, _ := az.getServiceLoadBalancerStatus(&svc1, lb) + sg, err := az.reconcileSecurityGroup(testClusterName, &svc1, lbStatus, true /* wantLb */) if err != nil { t.Errorf("Unexpected error: %q", err) } @@ -405,14 +663,20 @@ func TestReconcileSecurityGroupNewInternalServiceAddsPort(t *testing.T) { } func TestReconcileSecurityGroupRemoveService(t *testing.T) { + az := getTestCloud() service1 := getTestService("servicea", v1.ProtocolTCP, 81) service2 := getTestService("serviceb", v1.ProtocolTCP, 82) + clusterResources := getClusterResources(az, 1, 1) - sg := getTestSecurityGroup(service1, service2) + lb, _ := az.reconcileLoadBalancer(testClusterName, &service1, clusterResources.nodes, true) + az.reconcileLoadBalancer(testClusterName, &service2, clusterResources.nodes, true) + lbStatus, _ := az.getServiceLoadBalancerStatus(&service1, lb) + + sg := getTestSecurityGroup(az, service1, service2) validateSecurityGroup(t, sg, service1, service2) - az := getTestCloud() - sg, _, err := az.reconcileSecurityGroup(sg, testClusterName, &service1, nil, false) + + sg, err := az.reconcileSecurityGroup(testClusterName, &service1, lbStatus, false /* wantLb */) if err != nil { t.Errorf("Unexpected error: %q", err) } @@ -423,11 +687,14 @@ func TestReconcileSecurityGroupRemoveService(t *testing.T) { func TestReconcileSecurityGroupRemoveServiceRemovesPort(t *testing.T) { az := getTestCloud() svc := getTestService("servicea", v1.ProtocolTCP, 80, 443) + clusterResources := getClusterResources(az, 1, 1) - sg := getTestSecurityGroup(svc) - + sg := getTestSecurityGroup(az, svc) svcUpdated := getTestService("servicea", v1.ProtocolTCP, 80) - sg, _, err := az.reconcileSecurityGroup(sg, testClusterName, &svcUpdated, to.StringPtr("192.168.0.0"), true) + lb, _ := az.reconcileLoadBalancer(testClusterName, &svc, clusterResources.nodes, true) + lbStatus, _ := az.getServiceLoadBalancerStatus(&svc, lb) + + sg, 
err := az.reconcileSecurityGroup(testClusterName, &svcUpdated, lbStatus, true /* wantLb */) if err != nil { t.Errorf("Unexpected error: %q", err) } @@ -442,9 +709,13 @@ func TestReconcileSecurityWithSourceRanges(t *testing.T) { "192.168.0.0/24", "10.0.0.0/32", } + clusterResources := getClusterResources(az, 1, 1) - sg := getTestSecurityGroup(svc) - sg, _, err := az.reconcileSecurityGroup(sg, testClusterName, &svc, to.StringPtr("192.168.0.0"), true) + sg := getTestSecurityGroup(az, svc) + lb, _ := az.reconcileLoadBalancer(testClusterName, &svc, clusterResources.nodes, true) + lbStatus, _ := az.getServiceLoadBalancerStatus(&svc, lb) + + sg, err := az.reconcileSecurityGroup(testClusterName, &svc, lbStatus, true /* wantLb */) if err != nil { t.Errorf("Unexpected error: %q", err) } @@ -452,19 +723,230 @@ func TestReconcileSecurityWithSourceRanges(t *testing.T) { validateSecurityGroup(t, sg, svc) } -func getTestCloud() *Cloud { - return &Cloud{ +func TestReconcilePublicIPWithNewService(t *testing.T) { + az := getTestCloud() + svc := getTestService("servicea", v1.ProtocolTCP, 80, 443) + + pip, err := az.reconcilePublicIP(testClusterName, &svc, true /* wantLB*/) + if err != nil { + t.Errorf("Unexpected error: %q", err) + } + validatePublicIP(t, pip, &svc, true) + + pip2, err := az.reconcilePublicIP(testClusterName, &svc, true /* wantLB */) + if err != nil { + t.Errorf("Unexpected error: %q", err) + } + validatePublicIP(t, pip, &svc, true) + if pip.Name != pip2.Name || + pip.PublicIPAddressPropertiesFormat.IPAddress != pip2.PublicIPAddressPropertiesFormat.IPAddress { + t.Errorf("We should get the exact same public ip resource after a second reconcile") + } +} + +func TestReconcilePublicIPRemoveService(t *testing.T) { + az := getTestCloud() + svc := getTestService("servicea", v1.ProtocolTCP, 80, 443) + + pip, err := az.reconcilePublicIP(testClusterName, &svc, true /* wantLB*/) + if err != nil { + t.Errorf("Unexpected error: %q", err) + } + + validatePublicIP(t, pip, &svc, true) + + // Remove the service + pip, err = az.reconcilePublicIP(testClusterName, &svc, false /* wantLB */) + if err != nil { + t.Errorf("Unexpected error: %q", err) + } + validatePublicIP(t, pip, &svc, false) + +} + +func TestReconcilePublicIPWithInternalService(t *testing.T) { + az := getTestCloud() + svc := getInternalTestService("servicea", 80, 443) + + pip, err := az.reconcilePublicIP(testClusterName, &svc, true /* wantLB*/) + if err != nil { + t.Errorf("Unexpected error: %q", err) + } + + validatePublicIP(t, pip, &svc, true) +} + +func TestReconcilePublicIPWithExternalAndInternalSwitch(t *testing.T) { + az := getTestCloud() + svc := getInternalTestService("servicea", 80, 443) + + pip, err := az.reconcilePublicIP(testClusterName, &svc, true /* wantLB*/) + if err != nil { + t.Errorf("Unexpected error: %q", err) + } + validatePublicIP(t, pip, &svc, true) + + // Update to external service + svcUpdated := getTestService("servicea", v1.ProtocolTCP, 80) + pip, err = az.reconcilePublicIP(testClusterName, &svcUpdated, true /* wantLB*/) + if err != nil { + t.Errorf("Unexpected error: %q", err) + } + validatePublicIP(t, pip, &svcUpdated, true) + + // Update to internal service again + pip, err = az.reconcilePublicIP(testClusterName, &svc, true /* wantLB*/) + if err != nil { + t.Errorf("Unexpected error: %q", err) + } + validatePublicIP(t, pip, &svc, true) +} + +func getTestCloud() (az *Cloud) { + az = &Cloud{ Config: Config{ - TenantID: "tenant", - SubscriptionID: "subscription", - ResourceGroup: "rg", - Location: "westus", - VnetName: 
"vnet", - SubnetName: "subnet", - SecurityGroupName: "nsg", - RouteTableName: "rt", + TenantID: "tenant", + SubscriptionID: "subscription", + ResourceGroup: "rg", + VnetResourceGroup: "rg", + Location: "westus", + VnetName: "vnet", + SubnetName: "subnet", + SecurityGroupName: "nsg", + RouteTableName: "rt", + PrimaryAvailabilitySetName: "asName", + MaximumLoadBalancerRuleCount: 250, }, } + az.operationPollRateLimiter = flowcontrol.NewTokenBucketRateLimiter(100, 100) + az.LoadBalancerClient = NewFakeAzureLBClient() + az.PublicIPAddressesClient = NewFakeAzurePIPClient(az.Config.SubscriptionID) + az.SubnetsClient = NewFakeAzureSubnetsClient() + az.SecurityGroupsClient = NewFakeAzureNSGClient() + az.VirtualMachinesClient = NewFakeVirtualMachinesClient() + az.InterfacesClient = NewFakeInterfacesClient() + + return az +} + +const networkInterfacesIDTemplate = "/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Network/networkInterfaces/%s" +const primaryIPConfigIDTemplate = "%s/ipConfigurations/ipconfig" + +// returns the full identifier of a publicIPAddress. +func getNetworkInterfacesID(subscriptionID string, resourceGroupName, nicName string) string { + return fmt.Sprintf( + networkInterfacesIDTemplate, + subscriptionID, + resourceGroupName, + nicName) +} + +// returns the full identifier of a private ipconfig of the nic +func getPrimaryIPConfigID(nicID string) string { + return fmt.Sprintf( + primaryIPConfigIDTemplate, + nicID) +} + +const TestResourceNameFormat = "%s-%d" +const TestVMResourceBaseName = "vm" +const TestASResourceBaseName = "as" + +func getTestResourceName(resourceBaseName string, index int) string { + return fmt.Sprintf(TestResourceNameFormat, resourceBaseName, index) +} + +func getVMName(vmIndex int) string { + return getTestResourceName(TestVMResourceBaseName, vmIndex) +} + +func getASName(az *Cloud, vmIndex int, numAS int) string { + asIndex := vmIndex % numAS + if asIndex == 0 { + return az.Config.PrimaryAvailabilitySetName + } + + return getTestResourceName(TestASResourceBaseName, asIndex) +} + +func getNICName(vmIndex int) string { + // test supporting on 1 nic per vm + return getVMName(vmIndex) +} + +type ClusterResources struct { + nodes []*v1.Node + availabilitySetNames []string +} + +func getClusterResources(az *Cloud, vmCount int, availabilitySetCount int) (clusterResources *ClusterResources) { + if vmCount < availabilitySetCount { + return nil + } + clusterResources = &ClusterResources{} + clusterResources.nodes = []*v1.Node{} + clusterResources.availabilitySetNames = []string{} + for vmIndex := 0; vmIndex < vmCount; vmIndex++ { + vmName := getVMName(vmIndex) + asName := getASName(az, vmIndex, availabilitySetCount) + clusterResources.availabilitySetNames = append(clusterResources.availabilitySetNames, asName) + + nicName := getNICName(vmIndex) + nicID := getNetworkInterfacesID(az.Config.SubscriptionID, az.Config.ResourceGroup, nicName) + primaryIPConfigID := getPrimaryIPConfigID(nicID) + isPrimary := true + newNIC := network.Interface{ + ID: &nicID, + Name: &nicName, + InterfacePropertiesFormat: &network.InterfacePropertiesFormat{ + IPConfigurations: &[]network.InterfaceIPConfiguration{ + { + ID: &primaryIPConfigID, + InterfaceIPConfigurationPropertiesFormat: &network.InterfaceIPConfigurationPropertiesFormat{ + PrivateIPAddress: &nicName, + Primary: &isPrimary, + }, + }, + }, + }, + } + az.InterfacesClient.CreateOrUpdate(az.Config.ResourceGroup, nicName, newNIC, nil) + + // create vm + asID := az.getAvailabilitySetID(asName) + newVM := compute.VirtualMachine{ 
+ Name: &vmName, + Location: &az.Config.Location, + VirtualMachineProperties: &compute.VirtualMachineProperties{ + AvailabilitySet: &compute.SubResource{ + ID: &asID, + }, + NetworkProfile: &compute.NetworkProfile{ + NetworkInterfaces: &[]compute.NetworkInterfaceReference{ + { + ID: &nicID, + }, + }, + }, + }, + } + + _, errChan := az.VirtualMachinesClient.CreateOrUpdate(az.Config.ResourceGroup, vmName, newVM, nil) + if err := <-errChan; err != nil { + } + // add to kubernetes + newNode := &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: vmName, + Labels: map[string]string{ + kubeletapis.LabelHostname: vmName, + }, + }, + } + clusterResources.nodes = append(clusterResources.nodes, newNode) + } + + return clusterResources } func getBackendPort(port int32) int32 { @@ -516,10 +998,17 @@ func getTestService(identifier string, proto v1.Protocol, requestedPorts ...int3 func getInternalTestService(identifier string, requestedPorts ...int32) v1.Service { svc := getTestService(identifier, v1.ProtocolTCP, requestedPorts...) svc.Annotations[ServiceAnnotationLoadBalancerInternal] = "true" - return svc } +func setLoadBalancerModeAnnotation(service *v1.Service, lbMode string) { + service.Annotations[ServiceAnnotationLoadBalancerMode] = lbMode +} + +func setLoadBalancerAutoModeAnnotation(service *v1.Service) { + setLoadBalancerModeAnnotation(service, ServiceAnnotationLoadBalancerAutoModeValue) +} + func getTestLoadBalancer(services ...v1.Service) network.LoadBalancer { rules := []network.LoadBalancingRule{} probes := []network.Probe{} @@ -563,7 +1052,7 @@ func getServiceSourceRanges(service *v1.Service) []string { return service.Spec.LoadBalancerSourceRanges } -func getTestSecurityGroup(services ...v1.Service) network.SecurityGroup { +func getTestSecurityGroup(az *Cloud, services ...v1.Service) *network.SecurityGroup { rules := []network.SecurityRule{} for _, service := range services { @@ -583,15 +1072,22 @@ func getTestSecurityGroup(services ...v1.Service) network.SecurityGroup { } sg := network.SecurityGroup{ + Name: &az.SecurityGroupName, SecurityGroupPropertiesFormat: &network.SecurityGroupPropertiesFormat{ SecurityRules: &rules, }, } - return sg + az.SecurityGroupsClient.CreateOrUpdate( + az.ResourceGroup, + az.SecurityGroupName, + sg, + nil) + + return &sg } -func validateLoadBalancer(t *testing.T, loadBalancer network.LoadBalancer, services ...v1.Service) { +func validateLoadBalancer(t *testing.T, loadBalancer *network.LoadBalancer, services ...v1.Service) { expectedRuleCount := 0 expectedFrontendIPCount := 0 expectedProbeCount := 0 @@ -718,7 +1214,34 @@ func describeFIPs(frontendIPs []network.FrontendIPConfiguration) string { return description } -func validateSecurityGroup(t *testing.T, securityGroup network.SecurityGroup, services ...v1.Service) { +func validatePublicIP(t *testing.T, publicIP *network.PublicIPAddress, service *v1.Service, wantLB bool) { + isInternal := requiresInternalLoadBalancer(service) + if isInternal || !wantLB { + if publicIP != nil { + t.Errorf("Expected publicIP resource to be nil, when it is an internal service or doesn't want LB") + } + return + } + + // For external service + if publicIP == nil { + t.Errorf("Expected publicIP resource exists, when it is not an internal service") + } + + if publicIP.Tags == nil || (*publicIP.Tags)["service"] == nil { + t.Errorf("Expected publicIP resource has tags[service]") + } + + serviceName := getServiceName(service) + if serviceName != *(*publicIP.Tags)["service"] { + t.Errorf("Expected publicIP resource has matching 
tags[service]") + } + // We cannot use service.Spec.LoadBalancerIP to compare with + // Public IP's IPAddress + // Becuase service properties are updated outside of cloudprovider code +} + +func validateSecurityGroup(t *testing.T, securityGroup *network.SecurityGroup, services ...v1.Service) { expectedRuleCount := 0 for _, svc := range services { for _, wantedRule := range svc.Spec.Ports { @@ -839,10 +1362,6 @@ func TestNewCloudFromJSON(t *testing.T) { "routeTableName": "--route-table-name--", "primaryAvailabilitySetName": "--primary-availability-set-name--", "cloudProviderBackoff": true, - "cloudProviderBackoffRetries": 6, - "cloudProviderBackoffExponent": 1.5, - "cloudProviderBackoffDuration": 5, - "cloudProviderBackoffJitter": 1.0, "cloudProviderRatelimit": true, "cloudProviderRateLimitQPS": 0.5, "cloudProviderRateLimitBucket": 5 @@ -1128,9 +1647,29 @@ func TestMetadataParsing(t *testing.T) { } } -func addTestSubnet(t *testing.T, svc *v1.Service) { +func addTestSubnet(t *testing.T, az *Cloud, svc *v1.Service) { if svc.Annotations[ServiceAnnotationLoadBalancerInternal] != "true" { t.Error("Subnet added to non-internal service") } - svc.Annotations[ServiceAnnotationLoadBalancerInternalSubnet] = "TestSubnet" + subName := svc.Annotations[ServiceAnnotationLoadBalancerInternalSubnet] + if subName == "" { + subName = az.SubnetName + } + + subnetID := fmt.Sprintf("/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Network/virtualNetworks/%s/subnets/%s", + az.SubscriptionID, + az.VnetResourceGroup, + az.VnetName, + subName) + + _, errChan := az.SubnetsClient.CreateOrUpdate(az.VnetResourceGroup, az.VnetName, subName, + network.Subnet{ + ID: &subnetID, + Name: &subName, + }, nil) + + if err := <-errChan; err != nil { + t.Errorf("Subnet cannot be created or update, %v", err) + } + svc.Annotations[ServiceAnnotationLoadBalancerInternalSubnet] = subName } diff --git a/pkg/cloudprovider/providers/azure/azure_util.go b/pkg/cloudprovider/providers/azure/azure_util.go index bfd3e08bce..3c98e4b08d 100644 --- a/pkg/cloudprovider/providers/azure/azure_util.go +++ b/pkg/cloudprovider/providers/azure/azure_util.go @@ -20,7 +20,9 @@ import ( "errors" "fmt" "hash/crc32" + "math" "regexp" + "sort" "strconv" "strings" @@ -31,6 +33,7 @@ import ( "github.com/Azure/azure-sdk-for-go/arm/network" "github.com/golang/glog" "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/sets" ) const ( @@ -44,6 +47,12 @@ const ( loadBalancerRuleIDTemplate = "/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Network/loadBalancers/%s/loadBalancingRules/%s" loadBalancerProbeIDTemplate = "/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Network/loadBalancers/%s/probes/%s" securityRuleIDTemplate = "/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Network/networkSecurityGroups/%s/securityRules/%s" + + // InternalLoadBalancerNameSuffix is load balancer posfix + InternalLoadBalancerNameSuffix = "-internal" + + // nodeLabelRole specifies the role of a node + nodeLabelRole = "kubernetes.io/role" ) var providerIDRE = regexp.MustCompile(`^` + CloudProviderName + `://(?:.*)/Microsoft.Compute/virtualMachines/(.+)$`) @@ -116,6 +125,197 @@ func (az *Cloud) getSecurityRuleID(securityRuleName string) string { securityRuleName) } +// returns the full identifier of a publicIPAddress. 
+func (az *Cloud) getpublicIPAddressID(pipName string) string { + return fmt.Sprintf( + publicIPAddressIDTemplate, + az.SubscriptionID, + az.ResourceGroup, + pipName) +} + +// select load balancer for the service in the cluster +func (az *Cloud) selectLoadBalancer(clusterName string, service *v1.Service, existingLBs *[]network.LoadBalancer, nodes []*v1.Node) (selectedLB *network.LoadBalancer, existsLb bool, err error) { + isInternal := requiresInternalLoadBalancer(service) + serviceName := getServiceName(service) + glog.V(3).Infof("selectLoadBalancer(%s): isInternal(%s) - start", serviceName, isInternal) + availabilitySetNames, err := az.getLoadBalancerAvailabilitySetNames(service, nodes) + if err != nil { + return nil, false, err + } + glog.Infof("selectLoadBalancer(%s): isInternal(%s) - availabilitysetsname %v", serviceName, isInternal, *availabilitySetNames) + mapExistingLBs := map[string]*network.LoadBalancer{} + for lbx := range *existingLBs { + lb := (*existingLBs)[lbx] + mapExistingLBs[*lb.Name] = &lb + } + selectedLBRuleCount := math.MaxInt32 + for asx := range *availabilitySetNames { + currASName := (*availabilitySetNames)[asx] + currLBName := az.getLoadBalancerName(clusterName, currASName, isInternal) + lb, ok := mapExistingLBs[currLBName] + if !ok { + // select this LB as this is a new LB and will have minimum rules + // create tmp lb struct to hold metadata for the new load-balancer + selectedLB = &network.LoadBalancer{ + Name: &currLBName, + Location: &az.Location, + LoadBalancerPropertiesFormat: &network.LoadBalancerPropertiesFormat{}, + } + + return selectedLB, false, nil + } + + lbRules := *lb.LoadBalancingRules + currLBRuleCount := 0 + if lbRules != nil { + currLBRuleCount = len(lbRules) + } + if currLBRuleCount < selectedLBRuleCount { + selectedLBRuleCount = currLBRuleCount + selectedLB = lb + } + } + + if selectedLB == nil { + glog.Errorf("selectLoadBalancer service (%s) - unable to find load balancer for selected availability sets %v", serviceName, *availabilitySetNames) + return nil, false, fmt.Errorf("selectLoadBalancer (%s)- unable to find load balancer for selected availability sets %v", serviceName, *availabilitySetNames) + } + // validate if the selected LB has not exceeded the MaximumLoadBalancerRuleCount + if az.Config.MaximumLoadBalancerRuleCount != 0 && selectedLBRuleCount >= az.Config.MaximumLoadBalancerRuleCount { + err = fmt.Errorf("selectLoadBalancer service (%s) - all available load balancers have exceeded maximum rule limit %d", serviceName, selectedLBRuleCount) + glog.Error(err) + return selectedLB, existsLb, err + } + + return selectedLB, existsLb, nil +} + +// getLoadBalancerAvailabilitySetNames selects all possible availability sets for +// service load balancer, if the service has no loadbalancer mode annotaion returns the +// primary availability set if service annotation for loadbalancer availability set +// exists then return the eligible a availability set +func (az *Cloud) getLoadBalancerAvailabilitySetNames(service *v1.Service, nodes []*v1.Node) (availabilitySetNames *[]string, err error) { + hasMode, isAuto, serviceASL := getServiceLoadBalancerMode(service) + if !hasMode { + // legacy load balancer auto mode load balancer. 
+ availabilitySetNames = &[]string{az.Config.PrimaryAvailabilitySetName} + return availabilitySetNames, nil + } + availabilitySetNames, err = az.getAgentPoolAvailabiliySets(nodes) + if err != nil { + return nil, err + } + if len(*availabilitySetNames) == 0 { + return nil, fmt.Errorf("No availability sets found for nodes, node count(%d)", len(nodes)) + } + // sort the list to have deterministic selection + sort.Strings(*availabilitySetNames) + if !isAuto { + if serviceASL == nil || len(serviceASL) == 0 { + return nil, fmt.Errorf("service annotation for LoadBalancerMode is empty, it should have __auto__ or availability sets value") + } + // validate availability set exists + var found bool + for sasx := range serviceASL { + for asx := range *availabilitySetNames { + if strings.EqualFold((*availabilitySetNames)[asx], serviceASL[sasx]) { + found = true + serviceASL[sasx] = (*availabilitySetNames)[asx] + break + } + } + if !found { + return nil, fmt.Errorf("availability set (%s) - not found", serviceASL[sasx]) + } + } + availabilitySetNames = &serviceASL + } + + return availabilitySetNames, nil +} + +// lists the virtual machines for for the resource group and then builds +// a list of availability sets that match the nodes available to k8s +func (az *Cloud) getAgentPoolAvailabiliySets(nodes []*v1.Node) (agentPoolAs *[]string, err error) { + vms, err := az.VirtualMachineClientListWithRetry() + if err != nil { + return nil, err + } + vmNameToAvailabilitySetID := make(map[string]string, len(vms)) + for vmx := range vms { + vm := vms[vmx] + if vm.AvailabilitySet != nil { + vmNameToAvailabilitySetID[*vm.Name] = *vm.AvailabilitySet.ID + } + } + availabilitySetIDs := sets.NewString() + agentPoolAs = &[]string{} + for nx := range nodes { + nodeName := (*nodes[nx]).Name + if isMasterNode(nodes[nx]) { + continue + } + asID, ok := vmNameToAvailabilitySetID[nodeName] + if !ok { + return nil, fmt.Errorf("Node (%s) - has no availability sets", nodeName) + } + if availabilitySetIDs.Has(asID) { + // already added in the list + continue + } + asName, err := getLastSegment(asID) + if err != nil { + glog.Errorf("az.getNodeAvailabilitySet(%s), getLastSegment(%s), err=%v", nodeName, asID, err) + return nil, err + } + // AvailabilitySet ID is currently upper cased in a indeterministic way + // We want to keep it lower case, before the ID get fixed + asName = strings.ToLower(asName) + + *agentPoolAs = append(*agentPoolAs, asName) + } + + return agentPoolAs, nil +} + +func (az *Cloud) mapLoadBalancerNameToAvailabilitySet(lbName string, clusterName string) (availabilitySetName string) { + availabilitySetName = strings.TrimSuffix(lbName, InternalLoadBalancerNameSuffix) + if strings.EqualFold(clusterName, lbName) { + availabilitySetName = az.Config.PrimaryAvailabilitySetName + } + + return availabilitySetName +} + +// For a load balancer, all frontend ip should reference either a subnet or publicIpAddress. +// Thus Azure do not allow mixed type (public and internal) load balancer. +// So we'd have a separate name for internal load balancer. +// This would be the name for Azure LoadBalancer resource. 
+func (az *Cloud) getLoadBalancerName(clusterName string, availabilitySetName string, isInternal bool) string { + lbNamePrefix := availabilitySetName + if strings.EqualFold(availabilitySetName, az.Config.PrimaryAvailabilitySetName) { + lbNamePrefix = clusterName + } + if isInternal { + return fmt.Sprintf("%s%s", lbNamePrefix, InternalLoadBalancerNameSuffix) + } + return lbNamePrefix +} + +// isMasterNode returns returns true is the node has a master role label. +// The master role is determined by looking for: +// * a kubernetes.io/role="master" label +func isMasterNode(node *v1.Node) bool { + for k, v := range node.Labels { + if k == nodeLabelRole && v == "master" { + return true + } + } + + return false +} + // returns the deepest child's identifier from a full identifier string. func getLastSegment(ID string) (string, error) { parts := strings.Split(ID, "/") @@ -179,16 +379,8 @@ func getPrimaryIPConfig(nic network.Interface) (*network.InterfaceIPConfiguratio return nil, fmt.Errorf("failed to determine the determine primary ipconfig. nicname=%q", *nic.Name) } -// For a load balancer, all frontend ip should reference either a subnet or publicIpAddress. -// Thus Azure do not allow mixed type (public and internal) load balancer. -// So we'd have a separate name for internal load balancer. -// This would be the name for Azure LoadBalancer resource. -func getLoadBalancerName(clusterName string, isInternal bool) string { - if isInternal { - return fmt.Sprintf("%s-internal", clusterName) - } - - return clusterName +func isInternalLoadBalancer(lb *network.LoadBalancer) bool { + return strings.HasSuffix(*lb.Name, InternalLoadBalancerNameSuffix) } func getBackendPoolName(clusterName string) string { diff --git a/pkg/cloudprovider/providers/azure/azure_wrap.go b/pkg/cloudprovider/providers/azure/azure_wrap.go index e9c06dc6fc..8bfa2ca81e 100644 --- a/pkg/cloudprovider/providers/azure/azure_wrap.go +++ b/pkg/cloudprovider/providers/azure/azure_wrap.go @@ -40,6 +40,19 @@ func checkResourceExistsFromError(err error) (bool, error) { return false, v } +// If it is StatusNotFound return nil, +// Otherwise, return what it is +func ignoreStatusNotFoundFromError(err error) error { + if err == nil { + return nil + } + v, ok := err.(autorest.DetailedError) + if ok && v.StatusCode == http.StatusNotFound { + return nil + } + return err +} + func (az *Cloud) getVirtualMachine(nodeName types.NodeName) (vm compute.VirtualMachine, exists bool, err error) { var realErr error @@ -103,7 +116,6 @@ func (az *Cloud) getSecurityGroup() (sg network.SecurityGroup, exists bool, err func (az *Cloud) getAzureLoadBalancer(name string) (lb network.LoadBalancer, exists bool, err error) { var realErr error - az.operationPollRateLimiter.Accept() glog.V(10).Infof("LoadBalancerClient.Get(%s): start", name) lb, err = az.LoadBalancerClient.Get(az.ResourceGroup, name, "") @@ -121,6 +133,25 @@ func (az *Cloud) getAzureLoadBalancer(name string) (lb network.LoadBalancer, exi return lb, exists, err } +func (az *Cloud) listLoadBalancers() (lbListResult network.LoadBalancerListResult, exists bool, err error) { + var realErr error + + az.operationPollRateLimiter.Accept() + glog.V(10).Infof("LoadBalancerClient.List(%s): start", az.ResourceGroup) + lbListResult, err = az.LoadBalancerClient.List(az.ResourceGroup) + glog.V(10).Infof("LoadBalancerClient.List(%s): end", az.ResourceGroup) + exists, realErr = checkResourceExistsFromError(err) + if realErr != nil { + return lbListResult, false, realErr + } + + if !exists { + return lbListResult, false, 
nil + } + + return lbListResult, exists, err +} + func (az *Cloud) getPublicIPAddress(name string) (pip network.PublicIPAddress, exists bool, err error) { var realErr error From 443339da0ad3ca6f05c6a143fc1a2f37cba1080c Mon Sep 17 00:00:00 2001 From: Jingtao Ren Date: Wed, 15 Nov 2017 09:41:13 -0800 Subject: [PATCH 12/33] fix documents, and correct typo --- .../providers/azure/azure_loadbalancer.go | 7 ++- .../providers/azure/azure_loadbalancer.md | 49 +++++++++++-------- .../providers/azure/azure_test.go | 20 ++++---- 3 files changed, 41 insertions(+), 35 deletions(-) diff --git a/pkg/cloudprovider/providers/azure/azure_loadbalancer.go b/pkg/cloudprovider/providers/azure/azure_loadbalancer.go index 2afb656830..54f6b7d08f 100644 --- a/pkg/cloudprovider/providers/azure/azure_loadbalancer.go +++ b/pkg/cloudprovider/providers/azure/azure_loadbalancer.go @@ -62,7 +62,6 @@ func (az *Cloud) GetLoadBalancer(clusterName string, service *v1.Service) (statu glog.V(5).Infof("getloadbalancer (cluster:%s) (service:%s)- IP doesn't exist in any of the lbs", clusterName, serviceName) return nil, false, fmt.Errorf("Service(%s) - Loadbalancer not found", serviceName) } - return status, true, nil } @@ -320,17 +319,17 @@ func (az *Cloud) ensurePublicIPExists(serviceName, pipName, domainNameLabel stri // This ensures load balancer exists and the frontend ip config is setup. // This also reconciles the Service's Ports with the LoadBalancer config. // This entails adding rules/probes for expected Ports and removing stale rules/ports. -// nodes only used if wantLB is true +// nodes only used if wantLb is true func (az *Cloud) reconcileLoadBalancer(clusterName string, service *v1.Service, nodes []*v1.Node, wantLb bool) (*network.LoadBalancer, error) { isInternal := requiresInternalLoadBalancer(service) serviceName := getServiceName(service) - glog.V(2).Infof("reconcileLoadBalancer(%s) - wantLB(%t): started", serviceName, wantLb) + glog.V(2).Infof("reconcileLoadBalancer(%s) - wantLb(%t): started", serviceName, wantLb) lb, _, _, err := az.getServiceLoadBalancer(service, clusterName, nodes, wantLb) if err != nil { return nil, err } lbName := *lb.Name - glog.V(2).Infof("reconcileLoadBalancer(%s): lb(%s) wantLB(%t) resolved load balancer name", serviceName, lbName, wantLb) + glog.V(2).Infof("reconcileLoadBalancer(%s): lb(%s) wantLb(%t) resolved load balancer name", serviceName, lbName, wantLb) lbFrontendIPConfigName := getFrontendIPConfigName(service, subnet(service)) lbFrontendIPConfigID := az.getFrontendIPConfigID(lbName, lbFrontendIPConfigName) lbBackendPoolName := getBackendPoolName(clusterName) diff --git a/pkg/cloudprovider/providers/azure/azure_loadbalancer.md b/pkg/cloudprovider/providers/azure/azure_loadbalancer.md index 84a77a6784..431056893f 100644 --- a/pkg/cloudprovider/providers/azure/azure_loadbalancer.md +++ b/pkg/cloudprovider/providers/azure/azure_loadbalancer.md @@ -1,8 +1,10 @@ # Azure LoadBalancer -The way azure define LoadBalancer is different with GCE or AWS. Azure's LB can have multiple frontend IP refs. The GCE and AWS can only allow one, if you want more, you better to have another LB. Because of the fact, Public IP is not part of the LB in Azure. NSG is not part of LB in Azure as well. However, you cannot delete them in parallel, Public IP can only be delete after LB's frontend IP ref is removed. +The way azure define LoadBalancer is different with GCE or AWS. Azure's LB can have multiple frontend IP refs. The GCE and AWS can only allow one, if you want more, you better to have another LB. 
Because of the fact, Public IP is not part of the LB in Azure. NSG is not part of LB in Azure either. However, you cannot delete them in parallel, Public IP can only be delete after LB's frontend IP ref is removed. -For different Azure Resources, such as LB, Public IP, NSG. They are the same tier azure resourceS. We need to make sure there is no connection in their own ensure loops. In another words, They would be eventually reconciled regardless of other resources' state. They should only depends on service state. +For different Azure Resources, such as LB, Public IP, NSG. They are the same tier azure resources. We need to make sure there is no connection in their own ensure loops. In another words, They would be eventually reconciled regardless of other resources' state. They should only depends on service state. + +Despite the ideal philosophy above, we have to face the reality. NSG depends on LB's frontend ip to adjust NSG rules. So when we want to reconcile NSG, the LB should contain the corresponding frontend ip config. And also, For Azure, we cannot afford to have more than 1 worker of service_controller. Because, different services could operate on the same LB, concurrent execution could result in conflict or unexpected result. For AWS and GCE, they apparently doesn't have the problem, they use one LB per service, no such conflict. @@ -14,24 +16,25 @@ Service Annotation for Auto and specific load balancer mode ## Introduce Functions -- reconcileLoadBalancer(lb network.LoadBalancer, clusterName string, service *v1.Service, nodes []*v1.Node, wantLB bool) (network.LoadBalancer, error) - - Go through lb's properties, update based on wantLB +- reconcileLoadBalancer(clusterName string, service *v1.Service, nodes []*v1.Node, wantLb bool) (*network.LoadBalancer, error) + - Go through lb's properties, update based on wantLb - If any change on the lb, no matter if the lb exists or not - Call az cloud to CreateOrUpdate on this lb, or Delete if nothing left - return lb, err -- reconcileSecurityGroup(sg network.SecurityGroup, clusterName string, service *v1.Service, wantLb bool) (network.SecurityGroup, error) - - Go though NSG' properties, update based on wantLB +- reconcileSecurityGroup(clusterName string, service *v1.Service, lbStatus *v1.LoadBalancerStatus, wantLb bool) (*network.SecurityGroup, error) + - Go though NSG' properties, update based on wantLb - If any change on the NSG, (the NSG should always exists) - Call az cloud to CreateOrUpdate on this NSG - return sg, err -- reconcilePublicIP(pipName string, clusterName string, service *v1.Service, wantLB bool) (error) - - if wantLB and external LB, - - ensure Azure Public IP resource is there - - when we ensure Public IP, it needs to be both Name and Tag match with the convention - - remove dangling Public IP that could have Name or Tag match with the service, but not both - - else, ensure Azure Public IP resource is not there +- reconcilePublicIP(clusterName string, service *v1.Service, wantLb bool) (*network.PublicIPAddress, error) + - List all the public ip in the resource group + - Make sure we only touch Public IP resources has tags[service] = "namespace/serviceName" + - skip for wantLb && !isInternal && pipName == desiredPipName + - delete other public ip resources if any + - if !isInternal && wantLb + - ensure Public IP with desiredPipName exists - getServiceLoadBalancer(service *v1.Service, clusterName string, nodes []*v1.Node, wantLb bool) (lb, status, exists, error) - gets the loadbalancer for the service if it already exits @@ 
-44,15 +47,19 @@ Service Annotation for Auto and specific load balancer mode ### GetLoadBalancer - Get LoadBalancer status, return status, error - - If not exist, ensure it is there + - return the load balancer status for this service + - it will not create or update or delete any resource ### EnsureLoadBalancer -- Reconcile LB's related but not owned resources, such as Public IP, NSG rules - - Call reconcileSecurityGroup(sg, clusterName, service, true) - - Call reconcilePublicIP(pipName, cluster, service, true) +- Reconcile LB for the fliped service + - Call reconcileLoadBalancer(clusterName, flipedService, nil, false/* wantLb */) +- Reconcile Public IP + - Call reconcilePublicIP(cluster, service, true) - Reconcile LB's related and owned resources, such as FrontEndIPConfig, Rules, Probe. - - Call reconcileLoadBalancer(lb, clusterName, service, nodes, true) + - Call reconcileLoadBalancer(clusterName, service, nodes, true /* wantLb */) +- Reconcile NSG rules, it need to be called after reconcileLB + - Call reconcileSecurityGroup(clusterName, service, lbStatus, true /* wantLb */) ### UpdateLoadBalancer @@ -61,8 +68,8 @@ Service Annotation for Auto and specific load balancer mode ### EnsureLoadBalancerDeleted - Reconcile NSG first, before reconcile LB, because SG need LB to be there - - Call reconcileSecurityGroup(sg, clusterName, service, false) + - Call reconcileSecurityGroup(clusterName, service, nil, false /* wantLb */) - Reconcile LB's related and owned resources, such as FrontEndIPConfig, Rules, Probe. - - Call reconcileLoadBalancer(lb, clusterName, service, nodes, false) -- Reconcile LB's related but not owned resources, such as Public IP - - Call reconcilePublicIP(pipName, cluster, service, false) \ No newline at end of file + - Call reconcileLoadBalancer(clusterName, service, nodes, false) +- Reconcile Public IP, public IP needs related LB reconciled first + - Call reconcilePublicIP(cluster, service, false) \ No newline at end of file diff --git a/pkg/cloudprovider/providers/azure/azure_test.go b/pkg/cloudprovider/providers/azure/azure_test.go index 3bbdda0e7b..521cde9bf3 100644 --- a/pkg/cloudprovider/providers/azure/azure_test.go +++ b/pkg/cloudprovider/providers/azure/azure_test.go @@ -727,13 +727,13 @@ func TestReconcilePublicIPWithNewService(t *testing.T) { az := getTestCloud() svc := getTestService("servicea", v1.ProtocolTCP, 80, 443) - pip, err := az.reconcilePublicIP(testClusterName, &svc, true /* wantLB*/) + pip, err := az.reconcilePublicIP(testClusterName, &svc, true /* wantLb*/) if err != nil { t.Errorf("Unexpected error: %q", err) } validatePublicIP(t, pip, &svc, true) - pip2, err := az.reconcilePublicIP(testClusterName, &svc, true /* wantLB */) + pip2, err := az.reconcilePublicIP(testClusterName, &svc, true /* wantLb */) if err != nil { t.Errorf("Unexpected error: %q", err) } @@ -748,7 +748,7 @@ func TestReconcilePublicIPRemoveService(t *testing.T) { az := getTestCloud() svc := getTestService("servicea", v1.ProtocolTCP, 80, 443) - pip, err := az.reconcilePublicIP(testClusterName, &svc, true /* wantLB*/) + pip, err := az.reconcilePublicIP(testClusterName, &svc, true /* wantLb*/) if err != nil { t.Errorf("Unexpected error: %q", err) } @@ -756,7 +756,7 @@ func TestReconcilePublicIPRemoveService(t *testing.T) { validatePublicIP(t, pip, &svc, true) // Remove the service - pip, err = az.reconcilePublicIP(testClusterName, &svc, false /* wantLB */) + pip, err = az.reconcilePublicIP(testClusterName, &svc, false /* wantLb */) if err != nil { t.Errorf("Unexpected error: %q", err) } 
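
The reconcilePublicIP contract described in azure_loadbalancer.md above (only touch public IPs whose tags[service] matches the owning "namespace/serviceName", keep the desired IP when an external service wants a load balancer, and delete any other IP owned by the service) reduces to an ownership check plus a keep/delete decision. A minimal sketch, assuming the helper names serviceOwnsPublicIP and shouldKeepPublicIP (they are illustrative, not functions from this patch) and reusing the tag layout exercised by validatePublicIP:

package azure

import (
	"github.com/Azure/azure-sdk-for-go/arm/network"
)

// serviceOwnsPublicIP reports whether a public IP resource carries the
// tags[service] = "namespace/serviceName" ownership tag for this service.
// Sketch only; the real logic lives inline in reconcilePublicIP.
func serviceOwnsPublicIP(pip network.PublicIPAddress, serviceName string) bool {
	if pip.Tags == nil {
		return false
	}
	owner, ok := (*pip.Tags)["service"]
	if !ok || owner == nil {
		return false
	}
	return *owner == serviceName
}

// shouldKeepPublicIP mirrors the keep/delete rule from the design doc:
// keep the desired public IP only when an external service wants a load
// balancer; every other IP owned by the service is stale and is deleted.
func shouldKeepPublicIP(pip network.PublicIPAddress, desiredPipName string, wantLb, isInternal bool) bool {
	return wantLb && !isInternal && pip.Name != nil && *pip.Name == desiredPipName
}

The split into two helpers is purely for illustration; the provider makes both decisions inside a single loop over the public IPs listed for the resource group.
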
@@ -768,7 +768,7 @@ func TestReconcilePublicIPWithInternalService(t *testing.T) { az := getTestCloud() svc := getInternalTestService("servicea", 80, 443) - pip, err := az.reconcilePublicIP(testClusterName, &svc, true /* wantLB*/) + pip, err := az.reconcilePublicIP(testClusterName, &svc, true /* wantLb*/) if err != nil { t.Errorf("Unexpected error: %q", err) } @@ -780,7 +780,7 @@ func TestReconcilePublicIPWithExternalAndInternalSwitch(t *testing.T) { az := getTestCloud() svc := getInternalTestService("servicea", 80, 443) - pip, err := az.reconcilePublicIP(testClusterName, &svc, true /* wantLB*/) + pip, err := az.reconcilePublicIP(testClusterName, &svc, true /* wantLb*/) if err != nil { t.Errorf("Unexpected error: %q", err) } @@ -788,14 +788,14 @@ func TestReconcilePublicIPWithExternalAndInternalSwitch(t *testing.T) { // Update to external service svcUpdated := getTestService("servicea", v1.ProtocolTCP, 80) - pip, err = az.reconcilePublicIP(testClusterName, &svcUpdated, true /* wantLB*/) + pip, err = az.reconcilePublicIP(testClusterName, &svcUpdated, true /* wantLb*/) if err != nil { t.Errorf("Unexpected error: %q", err) } validatePublicIP(t, pip, &svcUpdated, true) // Update to internal service again - pip, err = az.reconcilePublicIP(testClusterName, &svc, true /* wantLB*/) + pip, err = az.reconcilePublicIP(testClusterName, &svc, true /* wantLb*/) if err != nil { t.Errorf("Unexpected error: %q", err) } @@ -1214,9 +1214,9 @@ func describeFIPs(frontendIPs []network.FrontendIPConfiguration) string { return description } -func validatePublicIP(t *testing.T, publicIP *network.PublicIPAddress, service *v1.Service, wantLB bool) { +func validatePublicIP(t *testing.T, publicIP *network.PublicIPAddress, service *v1.Service, wantLb bool) { isInternal := requiresInternalLoadBalancer(service) - if isInternal || !wantLB { + if isInternal || !wantLb { if publicIP != nil { t.Errorf("Expected publicIP resource to be nil, when it is an internal service or doesn't want LB") } From 585dabc279c7b32e207f934ec0a072884e3d19ab Mon Sep 17 00:00:00 2001 From: Jingtao Ren Date: Wed, 15 Nov 2017 10:26:33 -0800 Subject: [PATCH 13/33] rename azure interfaces to conform with golang convention --- pkg/cloudprovider/providers/azure/azure.go | 30 +++++++++++++--------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/pkg/cloudprovider/providers/azure/azure.go b/pkg/cloudprovider/providers/azure/azure.go index a7cff34e77..dcff662f0f 100644 --- a/pkg/cloudprovider/providers/azure/azure.go +++ b/pkg/cloudprovider/providers/azure/azure.go @@ -119,19 +119,22 @@ type Config struct { MaximumLoadBalancerRuleCount int `json:"maximumLoadBalancerRuleCount"` } -type iVirtualMachinesClient interface { +// VirtualMachinesClient defines needed functions for azure network.VirtualMachinesClient +type VirtualMachinesClient interface { CreateOrUpdate(resourceGroupName string, VMName string, parameters compute.VirtualMachine, cancel <-chan struct{}) (<-chan compute.VirtualMachine, <-chan error) Get(resourceGroupName string, VMName string, expand compute.InstanceViewTypes) (result compute.VirtualMachine, err error) List(resourceGroupName string) (result compute.VirtualMachineListResult, err error) ListNextResults(lastResults compute.VirtualMachineListResult) (result compute.VirtualMachineListResult, err error) } -type iInterfacesClient interface { +// InterfacesClient defines needed functions for azure network.InterfacesClient +type InterfacesClient interface { CreateOrUpdate(resourceGroupName string, networkInterfaceName string, 
parameters network.Interface, cancel <-chan struct{}) (<-chan network.Interface, <-chan error) Get(resourceGroupName string, networkInterfaceName string, expand string) (result network.Interface, err error) } -type iLoadBalancersClient interface { +// LoadBalancersClient defines needed functions for azure network.LoadBalancersClient +type LoadBalancersClient interface { CreateOrUpdate(resourceGroupName string, loadBalancerName string, parameters network.LoadBalancer, cancel <-chan struct{}) (<-chan network.LoadBalancer, <-chan error) Delete(resourceGroupName string, loadBalancerName string, cancel <-chan struct{}) (<-chan autorest.Response, <-chan error) Get(resourceGroupName string, loadBalancerName string, expand string) (result network.LoadBalancer, err error) @@ -139,7 +142,8 @@ type iLoadBalancersClient interface { ListNextResults(lastResult network.LoadBalancerListResult) (result network.LoadBalancerListResult, err error) } -type iPublicIPAddressesClient interface { +// PublicIPAddressesClient defines needed functions for azure network.PublicIPAddressesClient +type PublicIPAddressesClient interface { CreateOrUpdate(resourceGroupName string, publicIPAddressName string, parameters network.PublicIPAddress, cancel <-chan struct{}) (<-chan network.PublicIPAddress, <-chan error) Delete(resourceGroupName string, publicIPAddressName string, cancel <-chan struct{}) (<-chan autorest.Response, <-chan error) Get(resourceGroupName string, publicIPAddressName string, expand string) (result network.PublicIPAddress, err error) @@ -147,14 +151,16 @@ type iPublicIPAddressesClient interface { ListNextResults(lastResults network.PublicIPAddressListResult) (result network.PublicIPAddressListResult, err error) } -type iSubnetsClient interface { +// SubnetsClient defines needed functions for azure network.SubnetsClient +type SubnetsClient interface { CreateOrUpdate(resourceGroupName string, virtualNetworkName string, subnetName string, subnetParameters network.Subnet, cancel <-chan struct{}) (<-chan network.Subnet, <-chan error) Delete(resourceGroupName string, virtualNetworkName string, subnetName string, cancel <-chan struct{}) (<-chan autorest.Response, <-chan error) Get(resourceGroupName string, virtualNetworkName string, subnetName string, expand string) (result network.Subnet, err error) List(resourceGroupName string, virtualNetworkName string) (result network.SubnetListResult, err error) } -type iSecurityGroupsClient interface { +// SecurityGroupsClient defines needed functions for azure network.SecurityGroupsClient +type SecurityGroupsClient interface { CreateOrUpdate(resourceGroupName string, networkSecurityGroupName string, parameters network.SecurityGroup, cancel <-chan struct{}) (<-chan network.SecurityGroup, <-chan error) Delete(resourceGroupName string, networkSecurityGroupName string, cancel <-chan struct{}) (<-chan autorest.Response, <-chan error) Get(resourceGroupName string, networkSecurityGroupName string, expand string) (result network.SecurityGroup, err error) @@ -166,13 +172,13 @@ type Cloud struct { Config Environment azure.Environment RoutesClient network.RoutesClient - SubnetsClient iSubnetsClient - InterfacesClient iInterfacesClient + SubnetsClient SubnetsClient + InterfacesClient InterfacesClient RouteTablesClient network.RouteTablesClient - LoadBalancerClient iLoadBalancersClient - PublicIPAddressesClient iPublicIPAddressesClient - SecurityGroupsClient iSecurityGroupsClient - VirtualMachinesClient iVirtualMachinesClient + LoadBalancerClient LoadBalancersClient + 
PublicIPAddressesClient PublicIPAddressesClient + SecurityGroupsClient SecurityGroupsClient + VirtualMachinesClient VirtualMachinesClient StorageAccountClient storage.AccountsClient DisksClient disk.DisksClient operationPollRateLimiter flowcontrol.RateLimiter From 408f7396183b8d0af8eb791ea37257b302800817 Mon Sep 17 00:00:00 2001 From: NIkhil Bhatia Date: Wed, 15 Nov 2017 12:52:59 -0800 Subject: [PATCH 14/33] code-review- add logs and comments (#11) add logs and comments & fix getMasterNode --- .../providers/azure/azure_util.go | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/pkg/cloudprovider/providers/azure/azure_util.go b/pkg/cloudprovider/providers/azure/azure_util.go index 3c98e4b08d..cdacf7568d 100644 --- a/pkg/cloudprovider/providers/azure/azure_util.go +++ b/pkg/cloudprovider/providers/azure/azure_util.go @@ -135,12 +135,16 @@ func (az *Cloud) getpublicIPAddressID(pipName string) string { } // select load balancer for the service in the cluster +// the selection algorithm selectes the the load balancer with currently has +// the minimum lb rules, there there are multiple LB's with same number of rules +// it selects the first one (sorted based on name) func (az *Cloud) selectLoadBalancer(clusterName string, service *v1.Service, existingLBs *[]network.LoadBalancer, nodes []*v1.Node) (selectedLB *network.LoadBalancer, existsLb bool, err error) { isInternal := requiresInternalLoadBalancer(service) serviceName := getServiceName(service) glog.V(3).Infof("selectLoadBalancer(%s): isInternal(%s) - start", serviceName, isInternal) availabilitySetNames, err := az.getLoadBalancerAvailabilitySetNames(service, nodes) if err != nil { + glog.Errorf("az.selectLoadBalancer: cluster (%s) service(%s) - az.getLoadBalancerAvailabilitySetNames failed, err=(%v)", clusterName, serviceName, err) return nil, false, err } glog.Infof("selectLoadBalancer(%s): isInternal(%s) - availabilitysetsname %v", serviceName, isInternal, *availabilitySetNames) @@ -198,15 +202,17 @@ func (az *Cloud) selectLoadBalancer(clusterName string, service *v1.Service, exi func (az *Cloud) getLoadBalancerAvailabilitySetNames(service *v1.Service, nodes []*v1.Node) (availabilitySetNames *[]string, err error) { hasMode, isAuto, serviceASL := getServiceLoadBalancerMode(service) if !hasMode { - // legacy load balancer auto mode load balancer. 
+ // no mode specified in service annotation default to PrimaryAvailabilitySetName availabilitySetNames = &[]string{az.Config.PrimaryAvailabilitySetName} return availabilitySetNames, nil } availabilitySetNames, err = az.getAgentPoolAvailabiliySets(nodes) if err != nil { + glog.Errorf("az.getLoadBalancerAvailabilitySetNames - getAgentPoolAvailabiliySets failed err=(%v)", err) return nil, err } if len(*availabilitySetNames) == 0 { + glog.Errorf("az.getLoadBalancerAvailabilitySetNames - No availability sets found for nodes in the cluster, node count(%d)", len(nodes)) return nil, fmt.Errorf("No availability sets found for nodes, node count(%d)", len(nodes)) } // sort the list to have deterministic selection @@ -226,6 +232,7 @@ func (az *Cloud) getLoadBalancerAvailabilitySetNames(service *v1.Service, nodes } } if !found { + glog.Errorf("az.getLoadBalancerAvailabilitySetNames - Availability set (%s) in service annotation not found", serviceASL[sasx]) return nil, fmt.Errorf("availability set (%s) - not found", serviceASL[sasx]) } } @@ -240,6 +247,7 @@ func (az *Cloud) getLoadBalancerAvailabilitySetNames(service *v1.Service, nodes func (az *Cloud) getAgentPoolAvailabiliySets(nodes []*v1.Node) (agentPoolAs *[]string, err error) { vms, err := az.VirtualMachineClientListWithRetry() if err != nil { + glog.Errorf("az.getNodeAvailabilitySet - VirtualMachineClientListWithRetry failed, err=%v", err) return nil, err } vmNameToAvailabilitySetID := make(map[string]string, len(vms)) @@ -258,6 +266,7 @@ func (az *Cloud) getAgentPoolAvailabiliySets(nodes []*v1.Node) (agentPoolAs *[]s } asID, ok := vmNameToAvailabilitySetID[nodeName] if !ok { + glog.Errorf("az.getNodeAvailabilitySet - Node(%s) has no availability sets", nodeName) return nil, fmt.Errorf("Node (%s) - has no availability sets", nodeName) } if availabilitySetIDs.Has(asID) { @@ -266,7 +275,7 @@ func (az *Cloud) getAgentPoolAvailabiliySets(nodes []*v1.Node) (agentPoolAs *[]s } asName, err := getLastSegment(asID) if err != nil { - glog.Errorf("az.getNodeAvailabilitySet(%s), getLastSegment(%s), err=%v", nodeName, asID, err) + glog.Errorf("az.getNodeAvailabilitySet - Node (%s)- getLastSegment(%s), err=%v", nodeName, asID, err) return nil, err } // AvailabilitySet ID is currently upper cased in a indeterministic way @@ -307,10 +316,8 @@ func (az *Cloud) getLoadBalancerName(clusterName string, availabilitySetName str // The master role is determined by looking for: // * a kubernetes.io/role="master" label func isMasterNode(node *v1.Node) bool { - for k, v := range node.Labels { - if k == nodeLabelRole && v == "master" { - return true - } + if val, ok := node.Labels[nodeLabelRole]; ok && val == "master" { + return true } return false From 69abfa676d91b54e1956051781010a0b749628f4 Mon Sep 17 00:00:00 2001 From: Jingtao Ren Date: Wed, 15 Nov 2017 17:34:09 -0800 Subject: [PATCH 15/33] naming, comment, typo correction --- .../providers/azure/azure_backoff.go | 44 +++++++++---------- .../providers/azure/azure_loadbalancer.go | 33 +++++--------- .../providers/azure/azure_loadbalancer.md | 4 +- .../providers/azure/azure_test.go | 17 ++++--- 4 files changed, 44 insertions(+), 54 deletions(-) diff --git a/pkg/cloudprovider/providers/azure/azure_backoff.go b/pkg/cloudprovider/providers/azure/azure_backoff.go index 32f3a5c051..6988d3c4ed 100644 --- a/pkg/cloudprovider/providers/azure/azure_backoff.go +++ b/pkg/cloudprovider/providers/azure/azure_backoff.go @@ -26,10 +26,10 @@ import ( "k8s.io/apimachinery/pkg/types" ) -// getorCreateRequestBackoff returns a new Backoff 
object steps = 1 +// getOrCreateRequestBackoff returns a new Backoff object steps = 1 // This is to make sure that the requested command executes // at least once -func (az *Cloud) getorCreateRequestBackoff() (resourceRequestBackoff wait.Backoff) { +func (az *Cloud) getOrCreateRequestBackoff() (resourceRequestBackoff wait.Backoff) { if az.CloudProviderBackoff { return az.resourceRequestBackoff } @@ -44,7 +44,7 @@ func (az *Cloud) getorCreateRequestBackoff() (resourceRequestBackoff wait.Backof func (az *Cloud) GetVirtualMachineWithRetry(name types.NodeName) (compute.VirtualMachine, bool, error) { var machine compute.VirtualMachine var exists bool - err := wait.ExponentialBackoff(az.getorCreateRequestBackoff(), func() (bool, error) { + err := wait.ExponentialBackoff(az.getOrCreateRequestBackoff(), func() (bool, error) { var retryErr error machine, exists, retryErr = az.getVirtualMachine(name) if retryErr != nil { @@ -60,7 +60,7 @@ func (az *Cloud) GetVirtualMachineWithRetry(name types.NodeName) (compute.Virtua // VirtualMachineClientGetWithRetry invokes az.VirtualMachinesClient.Get with exponential backoff retry func (az *Cloud) VirtualMachineClientGetWithRetry(resourceGroup, vmName string, types compute.InstanceViewTypes) (compute.VirtualMachine, error) { var machine compute.VirtualMachine - err := wait.ExponentialBackoff(az.getorCreateRequestBackoff(), func() (bool, error) { + err := wait.ExponentialBackoff(az.getOrCreateRequestBackoff(), func() (bool, error) { var retryErr error az.operationPollRateLimiter.Accept() machine, retryErr = az.VirtualMachinesClient.Get(resourceGroup, vmName, types) @@ -78,7 +78,7 @@ func (az *Cloud) VirtualMachineClientGetWithRetry(resourceGroup, vmName string, func (az *Cloud) VirtualMachineClientListWithRetry() ([]compute.VirtualMachine, error) { allNodes := []compute.VirtualMachine{} var result compute.VirtualMachineListResult - err := wait.ExponentialBackoff(az.getorCreateRequestBackoff(), func() (bool, error) { + err := wait.ExponentialBackoff(az.getOrCreateRequestBackoff(), func() (bool, error) { var retryErr error az.operationPollRateLimiter.Accept() glog.V(10).Infof("VirtualMachinesClient.List(%v): start", az.ResourceGroup) @@ -103,7 +103,7 @@ func (az *Cloud) VirtualMachineClientListWithRetry() ([]compute.VirtualMachine, appendResults = false // follow the next link to get all the vms for resource group if result.NextLink != nil { - err := wait.ExponentialBackoff(az.getorCreateRequestBackoff(), func() (bool, error) { + err := wait.ExponentialBackoff(az.getOrCreateRequestBackoff(), func() (bool, error) { var retryErr error az.operationPollRateLimiter.Accept() glog.V(10).Infof("VirtualMachinesClient.ListNextResults(%v): start", az.ResourceGroup) @@ -130,7 +130,7 @@ func (az *Cloud) VirtualMachineClientListWithRetry() ([]compute.VirtualMachine, // GetIPForMachineWithRetry invokes az.getIPForMachine with exponential backoff retry func (az *Cloud) GetIPForMachineWithRetry(name types.NodeName) (string, error) { var ip string - err := wait.ExponentialBackoff(az.getorCreateRequestBackoff(), func() (bool, error) { + err := wait.ExponentialBackoff(az.getOrCreateRequestBackoff(), func() (bool, error) { var retryErr error ip, retryErr = az.getIPForMachine(name) if retryErr != nil { @@ -145,7 +145,7 @@ func (az *Cloud) GetIPForMachineWithRetry(name types.NodeName) (string, error) { // CreateOrUpdateSGWithRetry invokes az.SecurityGroupsClient.CreateOrUpdate with exponential backoff retry func (az *Cloud) CreateOrUpdateSGWithRetry(sg network.SecurityGroup) error { - 
return wait.ExponentialBackoff(az.getorCreateRequestBackoff(), func() (bool, error) { + return wait.ExponentialBackoff(az.getOrCreateRequestBackoff(), func() (bool, error) { az.operationPollRateLimiter.Accept() glog.V(10).Infof("SecurityGroupsClient.CreateOrUpdate(%s): start", *sg.Name) respChan, errChan := az.SecurityGroupsClient.CreateOrUpdate(az.ResourceGroup, *sg.Name, sg, nil) @@ -158,7 +158,7 @@ func (az *Cloud) CreateOrUpdateSGWithRetry(sg network.SecurityGroup) error { // CreateOrUpdateLBWithRetry invokes az.LoadBalancerClient.CreateOrUpdate with exponential backoff retry func (az *Cloud) CreateOrUpdateLBWithRetry(lb network.LoadBalancer) error { - return wait.ExponentialBackoff(az.getorCreateRequestBackoff(), func() (bool, error) { + return wait.ExponentialBackoff(az.getOrCreateRequestBackoff(), func() (bool, error) { az.operationPollRateLimiter.Accept() glog.V(10).Infof("LoadBalancerClient.CreateOrUpdate(%s): start", *lb.Name) respChan, errChan := az.LoadBalancerClient.CreateOrUpdate(az.ResourceGroup, *lb.Name, lb, nil) @@ -169,12 +169,12 @@ func (az *Cloud) CreateOrUpdateLBWithRetry(lb network.LoadBalancer) error { }) } -// ListLBWithRetry invokes az.VirtualMachinesClient.List with exponential backoff retry +// ListLBWithRetry invokes az.LoadBalancerClient.List with exponential backoff retry func (az *Cloud) ListLBWithRetry() ([]network.LoadBalancer, error) { allLBs := []network.LoadBalancer{} var result network.LoadBalancerListResult - err := wait.ExponentialBackoff(az.getorCreateRequestBackoff(), func() (bool, error) { + err := wait.ExponentialBackoff(az.getOrCreateRequestBackoff(), func() (bool, error) { var retryErr error az.operationPollRateLimiter.Accept() glog.V(10).Infof("LoadBalancerClient.List(%v): start", az.ResourceGroup) @@ -200,7 +200,7 @@ func (az *Cloud) ListLBWithRetry() ([]network.LoadBalancer, error) { // follow the next link to get all the vms for resource group if result.NextLink != nil { - err := wait.ExponentialBackoff(az.getorCreateRequestBackoff(), func() (bool, error) { + err := wait.ExponentialBackoff(az.getOrCreateRequestBackoff(), func() (bool, error) { var retryErr error az.operationPollRateLimiter.Accept() glog.V(10).Infof("LoadBalancerClient.ListNextResults(%v): start", az.ResourceGroup) @@ -229,7 +229,7 @@ func (az *Cloud) ListLBWithRetry() ([]network.LoadBalancer, error) { func (az *Cloud) ListPIPWithRetry() ([]network.PublicIPAddress, error) { allPIPs := []network.PublicIPAddress{} var result network.PublicIPAddressListResult - err := wait.ExponentialBackoff(az.getorCreateRequestBackoff(), func() (bool, error) { + err := wait.ExponentialBackoff(az.getOrCreateRequestBackoff(), func() (bool, error) { var retryErr error az.operationPollRateLimiter.Accept() glog.V(10).Infof("PublicIPAddressesClient.List(%v): start", az.ResourceGroup) @@ -255,7 +255,7 @@ func (az *Cloud) ListPIPWithRetry() ([]network.PublicIPAddress, error) { // follow the next link to get all the vms for resource group if result.NextLink != nil { - err := wait.ExponentialBackoff(az.getorCreateRequestBackoff(), func() (bool, error) { + err := wait.ExponentialBackoff(az.getOrCreateRequestBackoff(), func() (bool, error) { var retryErr error az.operationPollRateLimiter.Accept() glog.V(10).Infof("PublicIPAddressesClient.ListNextResults(%v): start", az.ResourceGroup) @@ -282,7 +282,7 @@ func (az *Cloud) ListPIPWithRetry() ([]network.PublicIPAddress, error) { // CreateOrUpdatePIPWithRetry invokes az.PublicIPAddressesClient.CreateOrUpdate with exponential backoff retry func (az *Cloud) 
CreateOrUpdatePIPWithRetry(pip network.PublicIPAddress) error { - return wait.ExponentialBackoff(az.getorCreateRequestBackoff(), func() (bool, error) { + return wait.ExponentialBackoff(az.getOrCreateRequestBackoff(), func() (bool, error) { az.operationPollRateLimiter.Accept() glog.V(10).Infof("PublicIPAddressesClient.CreateOrUpdate(%s): start", *pip.Name) respChan, errChan := az.PublicIPAddressesClient.CreateOrUpdate(az.ResourceGroup, *pip.Name, pip, nil) @@ -295,7 +295,7 @@ func (az *Cloud) CreateOrUpdatePIPWithRetry(pip network.PublicIPAddress) error { // CreateOrUpdateInterfaceWithRetry invokes az.PublicIPAddressesClient.CreateOrUpdate with exponential backoff retry func (az *Cloud) CreateOrUpdateInterfaceWithRetry(nic network.Interface) error { - return wait.ExponentialBackoff(az.getorCreateRequestBackoff(), func() (bool, error) { + return wait.ExponentialBackoff(az.getOrCreateRequestBackoff(), func() (bool, error) { az.operationPollRateLimiter.Accept() glog.V(10).Infof("InterfacesClient.CreateOrUpdate(%s): start", *nic.Name) respChan, errChan := az.InterfacesClient.CreateOrUpdate(az.ResourceGroup, *nic.Name, nic, nil) @@ -308,7 +308,7 @@ func (az *Cloud) CreateOrUpdateInterfaceWithRetry(nic network.Interface) error { // DeletePublicIPWithRetry invokes az.PublicIPAddressesClient.Delete with exponential backoff retry func (az *Cloud) DeletePublicIPWithRetry(pipName string) error { - return wait.ExponentialBackoff(az.getorCreateRequestBackoff(), func() (bool, error) { + return wait.ExponentialBackoff(az.getOrCreateRequestBackoff(), func() (bool, error) { az.operationPollRateLimiter.Accept() glog.V(10).Infof("PublicIPAddressesClient.Delete(%s): start", pipName) respChan, errChan := az.PublicIPAddressesClient.Delete(az.ResourceGroup, pipName, nil) @@ -321,7 +321,7 @@ func (az *Cloud) DeletePublicIPWithRetry(pipName string) error { // DeleteLBWithRetry invokes az.LoadBalancerClient.Delete with exponential backoff retry func (az *Cloud) DeleteLBWithRetry(lbName string) error { - return wait.ExponentialBackoff(az.getorCreateRequestBackoff(), func() (bool, error) { + return wait.ExponentialBackoff(az.getOrCreateRequestBackoff(), func() (bool, error) { az.operationPollRateLimiter.Accept() glog.V(10).Infof("LoadBalancerClient.Delete(%s): start", lbName) respChan, errChan := az.LoadBalancerClient.Delete(az.ResourceGroup, lbName, nil) @@ -334,7 +334,7 @@ func (az *Cloud) DeleteLBWithRetry(lbName string) error { // CreateOrUpdateRouteTableWithRetry invokes az.RouteTablesClient.CreateOrUpdate with exponential backoff retry func (az *Cloud) CreateOrUpdateRouteTableWithRetry(routeTable network.RouteTable) error { - return wait.ExponentialBackoff(az.getorCreateRequestBackoff(), func() (bool, error) { + return wait.ExponentialBackoff(az.getOrCreateRequestBackoff(), func() (bool, error) { az.operationPollRateLimiter.Accept() glog.V(10).Infof("RouteTablesClient.CreateOrUpdate(%s): start", *routeTable.Name) respChan, errChan := az.RouteTablesClient.CreateOrUpdate(az.ResourceGroup, az.RouteTableName, routeTable, nil) @@ -347,7 +347,7 @@ func (az *Cloud) CreateOrUpdateRouteTableWithRetry(routeTable network.RouteTable // CreateOrUpdateRouteWithRetry invokes az.RoutesClient.CreateOrUpdate with exponential backoff retry func (az *Cloud) CreateOrUpdateRouteWithRetry(route network.Route) error { - return wait.ExponentialBackoff(az.getorCreateRequestBackoff(), func() (bool, error) { + return wait.ExponentialBackoff(az.getOrCreateRequestBackoff(), func() (bool, error) { az.operationPollRateLimiter.Accept() 
glog.V(10).Infof("RoutesClient.CreateOrUpdate(%s): start", *route.Name) respChan, errChan := az.RoutesClient.CreateOrUpdate(az.ResourceGroup, az.RouteTableName, *route.Name, route, nil) @@ -360,7 +360,7 @@ func (az *Cloud) CreateOrUpdateRouteWithRetry(route network.Route) error { // DeleteRouteWithRetry invokes az.RoutesClient.Delete with exponential backoff retry func (az *Cloud) DeleteRouteWithRetry(routeName string) error { - return wait.ExponentialBackoff(az.getorCreateRequestBackoff(), func() (bool, error) { + return wait.ExponentialBackoff(az.getOrCreateRequestBackoff(), func() (bool, error) { az.operationPollRateLimiter.Accept() glog.V(10).Infof("RoutesClient.Delete(%s): start", az.RouteTableName) respChan, errChan := az.RoutesClient.Delete(az.ResourceGroup, az.RouteTableName, routeName, nil) @@ -373,7 +373,7 @@ func (az *Cloud) DeleteRouteWithRetry(routeName string) error { // CreateOrUpdateVMWithRetry invokes az.VirtualMachinesClient.CreateOrUpdate with exponential backoff retry func (az *Cloud) CreateOrUpdateVMWithRetry(vmName string, newVM compute.VirtualMachine) error { - return wait.ExponentialBackoff(az.getorCreateRequestBackoff(), func() (bool, error) { + return wait.ExponentialBackoff(az.getOrCreateRequestBackoff(), func() (bool, error) { az.operationPollRateLimiter.Accept() glog.V(10).Infof("VirtualMachinesClient.CreateOrUpdate(%s): start", vmName) respChan, errChan := az.VirtualMachinesClient.CreateOrUpdate(az.ResourceGroup, vmName, newVM, nil) diff --git a/pkg/cloudprovider/providers/azure/azure_loadbalancer.go b/pkg/cloudprovider/providers/azure/azure_loadbalancer.go index 54f6b7d08f..9e52f4a5de 100644 --- a/pkg/cloudprovider/providers/azure/azure_loadbalancer.go +++ b/pkg/cloudprovider/providers/azure/azure_loadbalancer.go @@ -79,8 +79,8 @@ func (az *Cloud) EnsureLoadBalancer(clusterName string, service *v1.Service, nod // Here we'll firstly ensure service do not lie in the opposite LB. 
serviceName := getServiceName(service) glog.V(5).Infof("ensureloadbalancer(%s): START clusterName=%q", serviceName, clusterName) - flipedService := flipServiceInternalAnnotation(service) - if _, err := az.reconcileLoadBalancer(clusterName, flipedService, nil, false /* wantLb */); err != nil { + flippedService := flipServiceInternalAnnotation(service) + if _, err := az.reconcileLoadBalancer(clusterName, flippedService, nil, false /* wantLb */); err != nil { return nil, err } @@ -136,7 +136,7 @@ func (az *Cloud) EnsureLoadBalancerDeleted(clusterName string, service *v1.Servi return nil } -// getServiceLoadBalancer gets the loadbalancer for the service if it already exits +// getServiceLoadBalancer gets the loadbalancer for the service if it already exists // If wantLb is TRUE then -it selects a new load balancer // In case the selected load balancer does not exists it returns network.LoadBalancer struct // with added metadata (such as name, location) and existsLB set to FALSE @@ -258,9 +258,11 @@ func (az *Cloud) determinePublicIPName(clusterName string, service *v1.Service) func flipServiceInternalAnnotation(service *v1.Service) *v1.Service { copyService := service.DeepCopy() - if _, ok := copyService.Annotations[ServiceAnnotationLoadBalancerInternal]; ok { + if v, ok := copyService.Annotations[ServiceAnnotationLoadBalancerInternal]; ok && v == "true" { + // If it is internal now, we make it external by remove the annotation delete(copyService.Annotations, ServiceAnnotationLoadBalancerInternal) } else { + // If it is external now, we make it internal copyService.Annotations[ServiceAnnotationLoadBalancerInternal] = "true" } return copyService @@ -628,8 +630,8 @@ func (az *Cloud) reconcileLoadBalancer(clusterName string, service *v1.Service, // If it is not exist, and no change to that, we don't CreateOrUpdate LB if dirtyLb { if lb.FrontendIPConfigurations == nil || len(*lb.FrontendIPConfigurations) == 0 { - // When FrontendIPConfigurations is empty, we need to delete the Azure LoadBalancer resource itself - // Because delete all FrontendIPConfigurations in LB is not supported, we have to delete the LB itself + // When FrontendIPConfigurations is empty, we need to delete the Azure load balancer resource itself, + // because an Azure load balancer cannot have an empty FrontendIPConfigurations collection glog.V(3).Infof("delete(%s): lb(%s) - deleting; no remaining frontendipconfigs", serviceName, lbName) az.operationPollRateLimiter.Accept() @@ -718,14 +720,6 @@ func (az *Cloud) reconcileSecurityGroup(clusterName string, service *v1.Service, return nil, err } - az.operationPollRateLimiter.Accept() - glog.V(10).Infof("SecurityGroupsClient.Get(%q): start", az.SecurityGroupName) - sg, err = az.SecurityGroupsClient.Get(az.ResourceGroup, az.SecurityGroupName, "") - glog.V(10).Infof("SecurityGroupsClient.Get(%q): end", az.SecurityGroupName) - if err != nil { - return nil, err - } - destinationIPAddress := "" if wantLb { // Get lbIP since we make up NSG rules based on ingress IP @@ -846,7 +840,6 @@ func (az *Cloud) reconcileSecurityGroup(clusterName string, service *v1.Service, } // This reconciles the PublicIP resources similar to how the LB is reconciled. -// This entails adding required, missing SecurityRules and removing stale rules. 
func (az *Cloud) reconcilePublicIP(clusterName string, service *v1.Service, wantLb bool) (*network.PublicIPAddress, error) { isInternal := requiresInternalLoadBalancer(service) serviceName := getServiceName(service) @@ -869,9 +862,7 @@ func (az *Cloud) reconcilePublicIP(clusterName string, service *v1.Service, want if wantLb && !isInternal && pipName == desiredPipName { // This is the only case we should preserve the // Public ip resource with match service tag - // We could do nothing here, we will ensure that out of the loop } else { - // We use tag to decide which IP should be removed glog.V(2).Infof("ensure(%s): pip(%s) - deleting", serviceName, pipName) az.operationPollRateLimiter.Accept() glog.V(10).Infof("PublicIPAddressesClient.Delete(%q): start", pipName) @@ -899,12 +890,12 @@ func (az *Cloud) reconcilePublicIP(clusterName string, service *v1.Service, want if !isInternal && wantLb { // Confirm desired public ip resource exists - var rpip *network.PublicIPAddress + var pip *network.PublicIPAddress domainNameLabel := getPublicIPLabel(service) - if rpip, err = az.ensurePublicIPExists(serviceName, desiredPipName, domainNameLabel); err != nil { + if pip, err = az.ensurePublicIPExists(serviceName, desiredPipName, domainNameLabel); err != nil { return nil, err } - return rpip, nil + return pip, nil } return nil, nil } @@ -972,7 +963,7 @@ func (az *Cloud) ensureHostInPool(serviceName string, nodeName types.NodeName, b expectedAvailabilitySetName := az.getAvailabilitySetID(availabilitySetName) if machine.AvailabilitySet == nil || !strings.EqualFold(*machine.AvailabilitySet.ID, expectedAvailabilitySetName) { glog.V(3).Infof( - "nicupdate(%s): skipping nic (%s) since it is not in the availabilitSet(%s)", + "nicupdate(%s): skipping nic (%s) since it is not in the availabilitySet(%s)", serviceName, nicName, availabilitySetName) return nil } diff --git a/pkg/cloudprovider/providers/azure/azure_loadbalancer.md b/pkg/cloudprovider/providers/azure/azure_loadbalancer.md index 431056893f..05a560b75b 100644 --- a/pkg/cloudprovider/providers/azure/azure_loadbalancer.md +++ b/pkg/cloudprovider/providers/azure/azure_loadbalancer.md @@ -37,7 +37,7 @@ Service Annotation for Auto and specific load balancer mode - ensure Public IP with desiredPipName exists - getServiceLoadBalancer(service *v1.Service, clusterName string, nodes []*v1.Node, wantLb bool) (lb, status, exists, error) - - gets the loadbalancer for the service if it already exits + - gets the loadbalancer for the service if it already exists - If wantLb is TRUE then -it selects a new load balancer, the selction helps distribute the services across load balancers - In case the selected load balancer does not exists it returns network.LoadBalancer struct with added metadata (such as name, location) and existsLB set to FALSE - By default - cluster default LB is returned @@ -52,7 +52,7 @@ Service Annotation for Auto and specific load balancer mode ### EnsureLoadBalancer -- Reconcile LB for the fliped service +- Reconcile LB for the flipped service - Call reconcileLoadBalancer(clusterName, flipedService, nil, false/* wantLb */) - Reconcile Public IP - Call reconcilePublicIP(cluster, service, true) diff --git a/pkg/cloudprovider/providers/azure/azure_test.go b/pkg/cloudprovider/providers/azure/azure_test.go index 521cde9bf3..8d6343d18f 100644 --- a/pkg/cloudprovider/providers/azure/azure_test.go +++ b/pkg/cloudprovider/providers/azure/azure_test.go @@ -127,12 +127,12 @@ func testLoadBalancerServiceDefaultModeSelection(t *testing.T, isInternal bool) 
ruleCount := len(*lb.LoadBalancingRules) if ruleCount != index { - t.Errorf("lb rule could should be equal to nuber of services deployed, expected (%d) Found (%d)", index, ruleCount) + t.Errorf("lb rule count should be equal to nuber of services deployed, expected (%d) Found (%d)", index, ruleCount) } } } -// Validate even distribution of external services across load balances +// Validate even distribution of external services across load balancers // based on number of availability sets func testLoadBalancerServiceAutoModeSelection(t *testing.T, isInternal bool) { az := getTestCloud() @@ -173,8 +173,7 @@ func testLoadBalancerServiceAutoModeSelection(t *testing.T, isInternal bool) { maxRules := 0 minRules := serviceCount - for x := range *result.Value { - lb := (*result.Value)[x] + for _, lb := range *result.Value { ruleCount := len(*lb.LoadBalancingRules) if ruleCount < minRules { minRules = ruleCount @@ -737,7 +736,7 @@ func TestReconcilePublicIPWithNewService(t *testing.T) { if err != nil { t.Errorf("Unexpected error: %q", err) } - validatePublicIP(t, pip, &svc, true) + validatePublicIP(t, pip2, &svc, true) if pip.Name != pip2.Name || pip.PublicIPAddressPropertiesFormat.IPAddress != pip2.PublicIPAddressPropertiesFormat.IPAddress { t.Errorf("We should get the exact same public ip resource after a second reconcile") @@ -814,7 +813,7 @@ func getTestCloud() (az *Cloud) { SubnetName: "subnet", SecurityGroupName: "nsg", RouteTableName: "rt", - PrimaryAvailabilitySetName: "asName", + PrimaryAvailabilitySetName: "as", MaximumLoadBalancerRuleCount: 250, }, } @@ -832,8 +831,8 @@ func getTestCloud() (az *Cloud) { const networkInterfacesIDTemplate = "/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Network/networkInterfaces/%s" const primaryIPConfigIDTemplate = "%s/ipConfigurations/ipconfig" -// returns the full identifier of a publicIPAddress. -func getNetworkInterfacesID(subscriptionID string, resourceGroupName, nicName string) string { +// returns the full identifier of Network Interface. 
+func getNetworkInterfaceID(subscriptionID string, resourceGroupName, nicName string) string { return fmt.Sprintf( networkInterfacesIDTemplate, subscriptionID, @@ -892,7 +891,7 @@ func getClusterResources(az *Cloud, vmCount int, availabilitySetCount int) (clus clusterResources.availabilitySetNames = append(clusterResources.availabilitySetNames, asName) nicName := getNICName(vmIndex) - nicID := getNetworkInterfacesID(az.Config.SubscriptionID, az.Config.ResourceGroup, nicName) + nicID := getNetworkInterfaceID(az.Config.SubscriptionID, az.Config.ResourceGroup, nicName) primaryIPConfigID := getPrimaryIPConfigID(nicID) isPrimary := true newNIC := network.Interface{ From e8c65f713009b9b6429611d70021b7eed1489d6f Mon Sep 17 00:00:00 2001 From: NIkhil Bhatia Date: Thu, 16 Nov 2017 10:23:21 -0800 Subject: [PATCH 16/33] address more code review comments --- .../providers/azure/azure_backoff.go | 43 +++---- .../providers/azure/azure_loadbalancer.go | 116 +++++++++++++++--- .../providers/azure/azure_test.go | 52 ++++---- .../providers/azure/azure_util.go | 86 ++----------- 4 files changed, 161 insertions(+), 136 deletions(-) diff --git a/pkg/cloudprovider/providers/azure/azure_backoff.go b/pkg/cloudprovider/providers/azure/azure_backoff.go index 6988d3c4ed..3947e912a3 100644 --- a/pkg/cloudprovider/providers/azure/azure_backoff.go +++ b/pkg/cloudprovider/providers/azure/azure_backoff.go @@ -26,10 +26,11 @@ import ( "k8s.io/apimachinery/pkg/types" ) -// getOrCreateRequestBackoff returns a new Backoff object steps = 1 +// requestBackoff if backoff is disabled in cloud provider it +// returns a new Backoff object steps = 1 // This is to make sure that the requested command executes // at least once -func (az *Cloud) getOrCreateRequestBackoff() (resourceRequestBackoff wait.Backoff) { +func (az *Cloud) requestBackoff() (resourceRequestBackoff wait.Backoff) { if az.CloudProviderBackoff { return az.resourceRequestBackoff } @@ -44,7 +45,7 @@ func (az *Cloud) getOrCreateRequestBackoff() (resourceRequestBackoff wait.Backof func (az *Cloud) GetVirtualMachineWithRetry(name types.NodeName) (compute.VirtualMachine, bool, error) { var machine compute.VirtualMachine var exists bool - err := wait.ExponentialBackoff(az.getOrCreateRequestBackoff(), func() (bool, error) { + err := wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) { var retryErr error machine, exists, retryErr = az.getVirtualMachine(name) if retryErr != nil { @@ -60,7 +61,7 @@ func (az *Cloud) GetVirtualMachineWithRetry(name types.NodeName) (compute.Virtua // VirtualMachineClientGetWithRetry invokes az.VirtualMachinesClient.Get with exponential backoff retry func (az *Cloud) VirtualMachineClientGetWithRetry(resourceGroup, vmName string, types compute.InstanceViewTypes) (compute.VirtualMachine, error) { var machine compute.VirtualMachine - err := wait.ExponentialBackoff(az.getOrCreateRequestBackoff(), func() (bool, error) { + err := wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) { var retryErr error az.operationPollRateLimiter.Accept() machine, retryErr = az.VirtualMachinesClient.Get(resourceGroup, vmName, types) @@ -78,7 +79,7 @@ func (az *Cloud) VirtualMachineClientGetWithRetry(resourceGroup, vmName string, func (az *Cloud) VirtualMachineClientListWithRetry() ([]compute.VirtualMachine, error) { allNodes := []compute.VirtualMachine{} var result compute.VirtualMachineListResult - err := wait.ExponentialBackoff(az.getOrCreateRequestBackoff(), func() (bool, error) { + err := wait.ExponentialBackoff(az.requestBackoff(), 
func() (bool, error) { var retryErr error az.operationPollRateLimiter.Accept() glog.V(10).Infof("VirtualMachinesClient.List(%v): start", az.ResourceGroup) @@ -103,7 +104,7 @@ func (az *Cloud) VirtualMachineClientListWithRetry() ([]compute.VirtualMachine, appendResults = false // follow the next link to get all the vms for resource group if result.NextLink != nil { - err := wait.ExponentialBackoff(az.getOrCreateRequestBackoff(), func() (bool, error) { + err := wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) { var retryErr error az.operationPollRateLimiter.Accept() glog.V(10).Infof("VirtualMachinesClient.ListNextResults(%v): start", az.ResourceGroup) @@ -130,7 +131,7 @@ func (az *Cloud) VirtualMachineClientListWithRetry() ([]compute.VirtualMachine, // GetIPForMachineWithRetry invokes az.getIPForMachine with exponential backoff retry func (az *Cloud) GetIPForMachineWithRetry(name types.NodeName) (string, error) { var ip string - err := wait.ExponentialBackoff(az.getOrCreateRequestBackoff(), func() (bool, error) { + err := wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) { var retryErr error ip, retryErr = az.getIPForMachine(name) if retryErr != nil { @@ -145,7 +146,7 @@ func (az *Cloud) GetIPForMachineWithRetry(name types.NodeName) (string, error) { // CreateOrUpdateSGWithRetry invokes az.SecurityGroupsClient.CreateOrUpdate with exponential backoff retry func (az *Cloud) CreateOrUpdateSGWithRetry(sg network.SecurityGroup) error { - return wait.ExponentialBackoff(az.getOrCreateRequestBackoff(), func() (bool, error) { + return wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) { az.operationPollRateLimiter.Accept() glog.V(10).Infof("SecurityGroupsClient.CreateOrUpdate(%s): start", *sg.Name) respChan, errChan := az.SecurityGroupsClient.CreateOrUpdate(az.ResourceGroup, *sg.Name, sg, nil) @@ -158,7 +159,7 @@ func (az *Cloud) CreateOrUpdateSGWithRetry(sg network.SecurityGroup) error { // CreateOrUpdateLBWithRetry invokes az.LoadBalancerClient.CreateOrUpdate with exponential backoff retry func (az *Cloud) CreateOrUpdateLBWithRetry(lb network.LoadBalancer) error { - return wait.ExponentialBackoff(az.getOrCreateRequestBackoff(), func() (bool, error) { + return wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) { az.operationPollRateLimiter.Accept() glog.V(10).Infof("LoadBalancerClient.CreateOrUpdate(%s): start", *lb.Name) respChan, errChan := az.LoadBalancerClient.CreateOrUpdate(az.ResourceGroup, *lb.Name, lb, nil) @@ -174,7 +175,7 @@ func (az *Cloud) ListLBWithRetry() ([]network.LoadBalancer, error) { allLBs := []network.LoadBalancer{} var result network.LoadBalancerListResult - err := wait.ExponentialBackoff(az.getOrCreateRequestBackoff(), func() (bool, error) { + err := wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) { var retryErr error az.operationPollRateLimiter.Accept() glog.V(10).Infof("LoadBalancerClient.List(%v): start", az.ResourceGroup) @@ -200,7 +201,7 @@ func (az *Cloud) ListLBWithRetry() ([]network.LoadBalancer, error) { // follow the next link to get all the vms for resource group if result.NextLink != nil { - err := wait.ExponentialBackoff(az.getOrCreateRequestBackoff(), func() (bool, error) { + err := wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) { var retryErr error az.operationPollRateLimiter.Accept() glog.V(10).Infof("LoadBalancerClient.ListNextResults(%v): start", az.ResourceGroup) @@ -229,7 +230,7 @@ func (az *Cloud) ListLBWithRetry() ([]network.LoadBalancer, error) { func (az 
*Cloud) ListPIPWithRetry() ([]network.PublicIPAddress, error) { allPIPs := []network.PublicIPAddress{} var result network.PublicIPAddressListResult - err := wait.ExponentialBackoff(az.getOrCreateRequestBackoff(), func() (bool, error) { + err := wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) { var retryErr error az.operationPollRateLimiter.Accept() glog.V(10).Infof("PublicIPAddressesClient.List(%v): start", az.ResourceGroup) @@ -255,7 +256,7 @@ func (az *Cloud) ListPIPWithRetry() ([]network.PublicIPAddress, error) { // follow the next link to get all the vms for resource group if result.NextLink != nil { - err := wait.ExponentialBackoff(az.getOrCreateRequestBackoff(), func() (bool, error) { + err := wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) { var retryErr error az.operationPollRateLimiter.Accept() glog.V(10).Infof("PublicIPAddressesClient.ListNextResults(%v): start", az.ResourceGroup) @@ -282,7 +283,7 @@ func (az *Cloud) ListPIPWithRetry() ([]network.PublicIPAddress, error) { // CreateOrUpdatePIPWithRetry invokes az.PublicIPAddressesClient.CreateOrUpdate with exponential backoff retry func (az *Cloud) CreateOrUpdatePIPWithRetry(pip network.PublicIPAddress) error { - return wait.ExponentialBackoff(az.getOrCreateRequestBackoff(), func() (bool, error) { + return wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) { az.operationPollRateLimiter.Accept() glog.V(10).Infof("PublicIPAddressesClient.CreateOrUpdate(%s): start", *pip.Name) respChan, errChan := az.PublicIPAddressesClient.CreateOrUpdate(az.ResourceGroup, *pip.Name, pip, nil) @@ -295,7 +296,7 @@ func (az *Cloud) CreateOrUpdatePIPWithRetry(pip network.PublicIPAddress) error { // CreateOrUpdateInterfaceWithRetry invokes az.PublicIPAddressesClient.CreateOrUpdate with exponential backoff retry func (az *Cloud) CreateOrUpdateInterfaceWithRetry(nic network.Interface) error { - return wait.ExponentialBackoff(az.getOrCreateRequestBackoff(), func() (bool, error) { + return wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) { az.operationPollRateLimiter.Accept() glog.V(10).Infof("InterfacesClient.CreateOrUpdate(%s): start", *nic.Name) respChan, errChan := az.InterfacesClient.CreateOrUpdate(az.ResourceGroup, *nic.Name, nic, nil) @@ -308,7 +309,7 @@ func (az *Cloud) CreateOrUpdateInterfaceWithRetry(nic network.Interface) error { // DeletePublicIPWithRetry invokes az.PublicIPAddressesClient.Delete with exponential backoff retry func (az *Cloud) DeletePublicIPWithRetry(pipName string) error { - return wait.ExponentialBackoff(az.getOrCreateRequestBackoff(), func() (bool, error) { + return wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) { az.operationPollRateLimiter.Accept() glog.V(10).Infof("PublicIPAddressesClient.Delete(%s): start", pipName) respChan, errChan := az.PublicIPAddressesClient.Delete(az.ResourceGroup, pipName, nil) @@ -321,7 +322,7 @@ func (az *Cloud) DeletePublicIPWithRetry(pipName string) error { // DeleteLBWithRetry invokes az.LoadBalancerClient.Delete with exponential backoff retry func (az *Cloud) DeleteLBWithRetry(lbName string) error { - return wait.ExponentialBackoff(az.getOrCreateRequestBackoff(), func() (bool, error) { + return wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) { az.operationPollRateLimiter.Accept() glog.V(10).Infof("LoadBalancerClient.Delete(%s): start", lbName) respChan, errChan := az.LoadBalancerClient.Delete(az.ResourceGroup, lbName, nil) @@ -334,7 +335,7 @@ func (az *Cloud) DeleteLBWithRetry(lbName 
string) error { // CreateOrUpdateRouteTableWithRetry invokes az.RouteTablesClient.CreateOrUpdate with exponential backoff retry func (az *Cloud) CreateOrUpdateRouteTableWithRetry(routeTable network.RouteTable) error { - return wait.ExponentialBackoff(az.getOrCreateRequestBackoff(), func() (bool, error) { + return wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) { az.operationPollRateLimiter.Accept() glog.V(10).Infof("RouteTablesClient.CreateOrUpdate(%s): start", *routeTable.Name) respChan, errChan := az.RouteTablesClient.CreateOrUpdate(az.ResourceGroup, az.RouteTableName, routeTable, nil) @@ -347,7 +348,7 @@ func (az *Cloud) CreateOrUpdateRouteTableWithRetry(routeTable network.RouteTable // CreateOrUpdateRouteWithRetry invokes az.RoutesClient.CreateOrUpdate with exponential backoff retry func (az *Cloud) CreateOrUpdateRouteWithRetry(route network.Route) error { - return wait.ExponentialBackoff(az.getOrCreateRequestBackoff(), func() (bool, error) { + return wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) { az.operationPollRateLimiter.Accept() glog.V(10).Infof("RoutesClient.CreateOrUpdate(%s): start", *route.Name) respChan, errChan := az.RoutesClient.CreateOrUpdate(az.ResourceGroup, az.RouteTableName, *route.Name, route, nil) @@ -360,7 +361,7 @@ func (az *Cloud) CreateOrUpdateRouteWithRetry(route network.Route) error { // DeleteRouteWithRetry invokes az.RoutesClient.Delete with exponential backoff retry func (az *Cloud) DeleteRouteWithRetry(routeName string) error { - return wait.ExponentialBackoff(az.getOrCreateRequestBackoff(), func() (bool, error) { + return wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) { az.operationPollRateLimiter.Accept() glog.V(10).Infof("RoutesClient.Delete(%s): start", az.RouteTableName) respChan, errChan := az.RoutesClient.Delete(az.ResourceGroup, az.RouteTableName, routeName, nil) @@ -373,7 +374,7 @@ func (az *Cloud) DeleteRouteWithRetry(routeName string) error { // CreateOrUpdateVMWithRetry invokes az.VirtualMachinesClient.CreateOrUpdate with exponential backoff retry func (az *Cloud) CreateOrUpdateVMWithRetry(vmName string, newVM compute.VirtualMachine) error { - return wait.ExponentialBackoff(az.getOrCreateRequestBackoff(), func() (bool, error) { + return wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) { az.operationPollRateLimiter.Accept() glog.V(10).Infof("VirtualMachinesClient.CreateOrUpdate(%s): start", vmName) respChan, errChan := az.VirtualMachinesClient.CreateOrUpdate(az.ResourceGroup, vmName, newVM, nil) diff --git a/pkg/cloudprovider/providers/azure/azure_loadbalancer.go b/pkg/cloudprovider/providers/azure/azure_loadbalancer.go index 9e52f4a5de..72ad6cfca6 100644 --- a/pkg/cloudprovider/providers/azure/azure_loadbalancer.go +++ b/pkg/cloudprovider/providers/azure/azure_loadbalancer.go @@ -18,11 +18,13 @@ package azure import ( "fmt" + "math" "strconv" "strings" "k8s.io/api/core/v1" utilerrors "k8s.io/apimachinery/pkg/util/errors" + "k8s.io/apimachinery/pkg/util/sets" serviceapi "k8s.io/kubernetes/pkg/api/v1/service" "github.com/Azure/azure-sdk-for-go/arm/compute" @@ -41,6 +43,13 @@ const ServiceAnnotationLoadBalancerInternalSubnet = "service.beta.kubernetes.io/ // ServiceAnnotationLoadBalancerMode is the annotation used on the service to specify the // Azure load balancer selection based on availability sets +// There are currently three possible load balancer selection modes : +// 1. 
Default mode - service has no annotation ("service.beta.kubernetes.io/azure-load-balancer-mode") +// In this case the Loadbalancer of the primary Availability set is selected +// 2. "__auto__" mode - service is annotated with __auto__ value, this when loadbalancer from any availability set +// is selected which has the miinimum rules associated with it. +// 3. "as1,as2" mode - this is when the laod balancer from the specified availability sets is selected that has the +// miinimum rules associated with it. const ServiceAnnotationLoadBalancerMode = "service.beta.kubernetes.io/azure-load-balancer-mode" // ServiceAnnotationLoadBalancerAutoModeValue the annotation used on the service to specify the @@ -146,20 +155,21 @@ func (az *Cloud) getServiceLoadBalancer(service *v1.Service, clusterName string, var defaultLB *network.LoadBalancer defaultLBName := az.getLoadBalancerName(clusterName, az.Config.PrimaryAvailabilitySetName, isInternal) - lbs, err := az.ListLBWithRetry() + existingLBs, err := az.ListLBWithRetry() if err != nil { return nil, nil, false, err } - if lbs != nil { - for lbx := range lbs { - lb := &(lbs[lbx]) - if strings.EqualFold(*lb.Name, defaultLBName) { - defaultLB = lb + + // check if the service already has a load balancer + if existingLBs != nil { + for _, existingLB := range existingLBs { + if strings.EqualFold(*existingLB.Name, defaultLBName) { + defaultLB = &existingLB } - if isInternalLoadBalancer(lb) != isInternal { + if isInternalLoadBalancer(&existingLB) != isInternal { continue } - status, err = az.getServiceLoadBalancerStatus(service, lb) + status, err = az.getServiceLoadBalancerStatus(service, &existingLB) if err != nil { return nil, nil, false, err } @@ -168,19 +178,22 @@ func (az *Cloud) getServiceLoadBalancer(service *v1.Service, clusterName string, continue } - return lb, status, true, nil + return &existingLB, status, true, nil } } + // service does not have a load balancer, select one if wantLb { // select new load balancer for service - lb, exists, err = az.selectLoadBalancer(clusterName, service, &lbs, nodes) + selectedLB, exists, err := az.selectLoadBalancer(clusterName, service, &existingLBs, nodes) if err != nil { return nil, nil, false, err } - return lb, nil, exists, err + return selectedLB, nil, exists, err } + + // create a default LB with meta data if not present if defaultLB == nil { defaultLB = &network.LoadBalancer{ Name: &defaultLBName, @@ -192,6 +205,66 @@ func (az *Cloud) getServiceLoadBalancer(service *v1.Service, clusterName string, return defaultLB, nil, false, nil } +// select load balancer for the service in the cluster +// the selection algorithm selectes the the load balancer with currently has +// the minimum lb rules, there there are multiple LB's with same number of rules +// it selects the first one (sorted based on name) +func (az *Cloud) selectLoadBalancer(clusterName string, service *v1.Service, existingLBs *[]network.LoadBalancer, nodes []*v1.Node) (selectedLB *network.LoadBalancer, existsLb bool, err error) { + isInternal := requiresInternalLoadBalancer(service) + serviceName := getServiceName(service) + glog.V(3).Infof("selectLoadBalancer(%s): isInternal(%s) - start", serviceName, isInternal) + availabilitySetNames, err := az.getLoadBalancerAvailabilitySetNames(service, nodes) + if err != nil { + glog.Errorf("az.selectLoadBalancer: cluster(%s) service(%s) isInternal(%t) - az.getLoadBalancerAvailabilitySetNames failed, err=(%v)", clusterName, serviceName, isInternal, err) + return nil, false, err + } + 
glog.Infof("selectLoadBalancer: cluster(%s) service(%s) isInternal(%t) - availabilitysetsnames %v", clusterName, serviceName, isInternal, *availabilitySetNames) + mapExistingLBs := map[string]network.LoadBalancer{} + for _, lb := range *existingLBs { + mapExistingLBs[*lb.Name] = lb + } + selectedLBRuleCount := math.MaxInt32 + for _, currASName := range *availabilitySetNames { + currLBName := az.getLoadBalancerName(clusterName, currASName, isInternal) + lb, exists := mapExistingLBs[currLBName] + if !exists { + // select this LB as this is a new LB and will have minimum rules + // create tmp lb struct to hold metadata for the new load-balancer + selectedLB = &network.LoadBalancer{ + Name: &currLBName, + Location: &az.Location, + LoadBalancerPropertiesFormat: &network.LoadBalancerPropertiesFormat{}, + } + + return selectedLB, false, nil + } + + lbRules := *lb.LoadBalancingRules + currLBRuleCount := 0 + if lbRules != nil { + currLBRuleCount = len(lbRules) + } + if currLBRuleCount < selectedLBRuleCount { + selectedLBRuleCount = currLBRuleCount + selectedLB = &lb + } + } + + if selectedLB == nil { + err = fmt.Errorf("selectLoadBalancer: cluster(%s) service(%s) isInternal(%t) - unable to find load balancer for selected availability sets %v", clusterName, serviceName, isInternal, *availabilitySetNames) + glog.Error(err) + return nil, false, err + } + // validate if the selected LB has not exceeded the MaximumLoadBalancerRuleCount + if az.Config.MaximumLoadBalancerRuleCount != 0 && selectedLBRuleCount >= az.Config.MaximumLoadBalancerRuleCount { + err = fmt.Errorf("selectLoadBalancer: cluster(%s) service(%s) isInternal(%t) - all available load balancers have exceeded maximum rule limit %d, availabilitysetnames (%v)", clusterName, serviceName, isInternal, selectedLBRuleCount, *availabilitySetNames) + glog.Error(err) + return selectedLB, existsLb, err + } + + return selectedLB, existsLb, nil +} + func (az *Cloud) getServiceLoadBalancerStatus(service *v1.Service, lb *network.LoadBalancer) (status *v1.LoadBalancerStatus, err error) { if lb == nil { glog.V(10).Infof("getServiceLoadBalancerStatus lb is nil") @@ -1043,15 +1116,26 @@ func subnet(service *v1.Service) *string { return nil } -func getServiceLoadBalancerMode(service *v1.Service) (hasMode bool, isAuto bool, asl []string) { +// getServiceLoadBalancerMode parses the mode value +// if the value is __auto__ it returns isAuto = TRUE +// if anything else it returns the unique availability set names after triming spaces +func getServiceLoadBalancerMode(service *v1.Service) (hasMode bool, isAuto bool, availabilitySetNames []string) { mode, hasMode := service.Annotations[ServiceAnnotationLoadBalancerMode] + mode = strings.TrimSpace(mode) isAuto = strings.EqualFold(mode, ServiceAnnotationLoadBalancerAutoModeValue) if !isAuto { - asTagList := strings.TrimSpace(mode) - // Break up list of "AS1,AS2" - asl = strings.Split(asTagList, ",") + availabilitySetParsedList := strings.Split(mode, ",") + + // Trim the availability set names and remove duplicates + // e.g. 
{"AS1"," AS2", "AS3", "AS3"} => {"AS1", "AS2", "AS3"} + availabilitySetNameSet := sets.NewString() + for _, v := range availabilitySetParsedList { + availabilitySetNameSet.Insert(strings.TrimSpace(v)) + } + + availabilitySetNames = availabilitySetNameSet.List() } - return hasMode, isAuto, asl + return hasMode, isAuto, availabilitySetNames } diff --git a/pkg/cloudprovider/providers/azure/azure_test.go b/pkg/cloudprovider/providers/azure/azure_test.go index 8d6343d18f..73af15642c 100644 --- a/pkg/cloudprovider/providers/azure/azure_test.go +++ b/pkg/cloudprovider/providers/azure/azure_test.go @@ -19,6 +19,7 @@ package azure import ( "encoding/json" "fmt" + "math" "net/http" "net/http/httptest" "reflect" @@ -161,10 +162,8 @@ func testLoadBalancerServiceAutoModeSelection(t *testing.T, isInternal bool) { t.Errorf("Unexpected error: %s", svcName) } - expectedNumOfLB := index % availabilitySetCount - if index >= availabilitySetCount { - expectedNumOfLB = availabilitySetCount - } + // expected is MIN(index, availabilitySetCount) + expectedNumOfLB := int(math.Min(float64(index), float64(availabilitySetCount))) result, _ := az.LoadBalancerClient.List(az.Config.ResourceGroup) lbCount := len(*result.Value) if lbCount != expectedNumOfLB { @@ -192,6 +191,9 @@ func testLoadBalancerServiceAutoModeSelection(t *testing.T, isInternal bool) { // Validate availability set selection of services across load balancers // based on provided availability sets through service annotation +// The scenario is that there are 4 availability sets in the agent pool but the +// services will be assigned load balancers that are part of the provided availability sets +// specified in service annotation func testLoadBalancerServicesSpecifiedSelection(t *testing.T, isInternal bool) { az := getTestCloud() const vmCount = 8 @@ -201,8 +203,8 @@ func testLoadBalancerServicesSpecifiedSelection(t *testing.T, isInternal bool) { clusterResources := getClusterResources(az, vmCount, availabilitySetCount) getTestSecurityGroup(az) - selectedAvailabilitySetName1 := getASName(az, 1, availabilitySetCount) - selectedAvailabilitySetName2 := getASName(az, 2, availabilitySetCount) + selectedAvailabilitySetName1 := getAvailabilitySetName(az, 1, availabilitySetCount) + selectedAvailabilitySetName2 := getAvailabilitySetName(az, 2, availabilitySetCount) for index := 1; index <= serviceCount; index++ { svcName := fmt.Sprintf("service-%d", index) var svc v1.Service @@ -223,10 +225,8 @@ func testLoadBalancerServicesSpecifiedSelection(t *testing.T, isInternal bool) { t.Errorf("Unexpected error: %s", svcName) } - expectedNumOfLB := index % 2 - if index >= 2 { - expectedNumOfLB = 2 - } + // expected is MIN(index, 2) + expectedNumOfLB := int(math.Min(float64(index), float64(2))) result, _ := az.LoadBalancerClient.List(az.Config.ResourceGroup) lbCount := len(*result.Value) if lbCount != expectedNumOfLB { @@ -263,14 +263,12 @@ func testLoadBalancerMaxRulesServices(t *testing.T, isInternal bool) { t.Errorf("Unexpected error: %s", svcName) } - expectedNumOfLB := index % az.Config.MaximumLoadBalancerRuleCount - if index >= az.Config.MaximumLoadBalancerRuleCount { - expectedNumOfLB = az.Config.MaximumLoadBalancerRuleCount - } + // expected is MIN(index, az.Config.MaximumLoadBalancerRuleCount) + expectedNumOfLBRules := int(math.Min(float64(index), float64(az.Config.MaximumLoadBalancerRuleCount))) result, _ := az.LoadBalancerClient.List(az.Config.ResourceGroup) lbCount := len(*result.Value) - if lbCount != expectedNumOfLB { - t.Errorf("Unexpected number of LB's: 
Expected (%d) Found (%d)", expectedNumOfLB, lbCount) + if lbCount != expectedNumOfLBRules { + t.Errorf("Unexpected number of LB's: Expected (%d) Found (%d)", expectedNumOfLBRules, lbCount) } } @@ -286,11 +284,15 @@ func testLoadBalancerMaxRulesServices(t *testing.T, isInternal bool) { _, err := az.EnsureLoadBalancer(testClusterName, &svc, clusterResources.nodes) if err == nil { t.Errorf("Expect any new service to fail as max limit in lb has reached") + } else { + expectedErrMessageSubString := "all available load balancers have exceeded maximum rule limit" + if !strings.Contains(err.Error(), expectedErrMessageSubString) { + t.Errorf("Error message returned is not expected, expected sub string=%s, actual error message=%v", expectedErrMessageSubString, err) + } } } -// Validate even distribution of external services across load balances -// based on number of availability sets +// Validate service deletion in lb auto selection mode func testLoadBalancerServiceAutoModeDeleteSelection(t *testing.T, isInternal bool) { az := getTestCloud() const vmCount = 8 @@ -331,10 +333,8 @@ func testLoadBalancerServiceAutoModeDeleteSelection(t *testing.T, isInternal boo setLoadBalancerAutoModeAnnotation(&svc) - expectedNumOfLB := index % availabilitySetCount - if index >= availabilitySetCount { - expectedNumOfLB = availabilitySetCount - } + // expected is MIN(index, availabilitySetCount) + expectedNumOfLB := int(math.Min(float64(index), float64(availabilitySetCount))) result, _ := az.LoadBalancerClient.List(az.Config.ResourceGroup) lbCount := len(*result.Value) if lbCount != expectedNumOfLB { @@ -859,7 +859,7 @@ func getVMName(vmIndex int) string { return getTestResourceName(TestVMResourceBaseName, vmIndex) } -func getASName(az *Cloud, vmIndex int, numAS int) string { +func getAvailabilitySetName(az *Cloud, vmIndex int, numAS int) string { asIndex := vmIndex % numAS if asIndex == 0 { return az.Config.PrimaryAvailabilitySetName @@ -868,8 +868,10 @@ func getASName(az *Cloud, vmIndex int, numAS int) string { return getTestResourceName(TestASResourceBaseName, asIndex) } +// test supporting on 1 nic per vm +// we really dont care about the name of the nic +// just using the vm name for testing purposes func getNICName(vmIndex int) string { - // test supporting on 1 nic per vm return getVMName(vmIndex) } @@ -887,7 +889,7 @@ func getClusterResources(az *Cloud, vmCount int, availabilitySetCount int) (clus clusterResources.availabilitySetNames = []string{} for vmIndex := 0; vmIndex < vmCount; vmIndex++ { vmName := getVMName(vmIndex) - asName := getASName(az, vmIndex, availabilitySetCount) + asName := getAvailabilitySetName(az, vmIndex, availabilitySetCount) clusterResources.availabilitySetNames = append(clusterResources.availabilitySetNames, asName) nicName := getNICName(vmIndex) diff --git a/pkg/cloudprovider/providers/azure/azure_util.go b/pkg/cloudprovider/providers/azure/azure_util.go index cdacf7568d..04ff821e76 100644 --- a/pkg/cloudprovider/providers/azure/azure_util.go +++ b/pkg/cloudprovider/providers/azure/azure_util.go @@ -20,7 +20,6 @@ import ( "errors" "fmt" "hash/crc32" - "math" "regexp" "sort" "strconv" @@ -134,73 +133,12 @@ func (az *Cloud) getpublicIPAddressID(pipName string) string { pipName) } -// select load balancer for the service in the cluster -// the selection algorithm selectes the the load balancer with currently has -// the minimum lb rules, there there are multiple LB's with same number of rules -// it selects the first one (sorted based on name) -func (az *Cloud) 
selectLoadBalancer(clusterName string, service *v1.Service, existingLBs *[]network.LoadBalancer, nodes []*v1.Node) (selectedLB *network.LoadBalancer, existsLb bool, err error) { - isInternal := requiresInternalLoadBalancer(service) - serviceName := getServiceName(service) - glog.V(3).Infof("selectLoadBalancer(%s): isInternal(%s) - start", serviceName, isInternal) - availabilitySetNames, err := az.getLoadBalancerAvailabilitySetNames(service, nodes) - if err != nil { - glog.Errorf("az.selectLoadBalancer: cluster (%s) service(%s) - az.getLoadBalancerAvailabilitySetNames failed, err=(%v)", clusterName, serviceName, err) - return nil, false, err - } - glog.Infof("selectLoadBalancer(%s): isInternal(%s) - availabilitysetsname %v", serviceName, isInternal, *availabilitySetNames) - mapExistingLBs := map[string]*network.LoadBalancer{} - for lbx := range *existingLBs { - lb := (*existingLBs)[lbx] - mapExistingLBs[*lb.Name] = &lb - } - selectedLBRuleCount := math.MaxInt32 - for asx := range *availabilitySetNames { - currASName := (*availabilitySetNames)[asx] - currLBName := az.getLoadBalancerName(clusterName, currASName, isInternal) - lb, ok := mapExistingLBs[currLBName] - if !ok { - // select this LB as this is a new LB and will have minimum rules - // create tmp lb struct to hold metadata for the new load-balancer - selectedLB = &network.LoadBalancer{ - Name: &currLBName, - Location: &az.Location, - LoadBalancerPropertiesFormat: &network.LoadBalancerPropertiesFormat{}, - } - - return selectedLB, false, nil - } - - lbRules := *lb.LoadBalancingRules - currLBRuleCount := 0 - if lbRules != nil { - currLBRuleCount = len(lbRules) - } - if currLBRuleCount < selectedLBRuleCount { - selectedLBRuleCount = currLBRuleCount - selectedLB = lb - } - } - - if selectedLB == nil { - glog.Errorf("selectLoadBalancer service (%s) - unable to find load balancer for selected availability sets %v", serviceName, *availabilitySetNames) - return nil, false, fmt.Errorf("selectLoadBalancer (%s)- unable to find load balancer for selected availability sets %v", serviceName, *availabilitySetNames) - } - // validate if the selected LB has not exceeded the MaximumLoadBalancerRuleCount - if az.Config.MaximumLoadBalancerRuleCount != 0 && selectedLBRuleCount >= az.Config.MaximumLoadBalancerRuleCount { - err = fmt.Errorf("selectLoadBalancer service (%s) - all available load balancers have exceeded maximum rule limit %d", serviceName, selectedLBRuleCount) - glog.Error(err) - return selectedLB, existsLb, err - } - - return selectedLB, existsLb, nil -} - // getLoadBalancerAvailabilitySetNames selects all possible availability sets for // service load balancer, if the service has no loadbalancer mode annotaion returns the // primary availability set if service annotation for loadbalancer availability set // exists then return the eligible a availability set func (az *Cloud) getLoadBalancerAvailabilitySetNames(service *v1.Service, nodes []*v1.Node) (availabilitySetNames *[]string, err error) { - hasMode, isAuto, serviceASL := getServiceLoadBalancerMode(service) + hasMode, isAuto, serviceAvailabilitySetNames := getServiceLoadBalancerMode(service) if !hasMode { // no mode specified in service annotation default to PrimaryAvailabilitySetName availabilitySetNames = &[]string{az.Config.PrimaryAvailabilitySetName} @@ -218,25 +156,25 @@ func (az *Cloud) getLoadBalancerAvailabilitySetNames(service *v1.Service, nodes // sort the list to have deterministic selection sort.Strings(*availabilitySetNames) if !isAuto { - if serviceASL == nil || 
len(serviceASL) == 0 { + if serviceAvailabilitySetNames == nil || len(serviceAvailabilitySetNames) == 0 { return nil, fmt.Errorf("service annotation for LoadBalancerMode is empty, it should have __auto__ or availability sets value") } // validate availability set exists var found bool - for sasx := range serviceASL { + for sasx := range serviceAvailabilitySetNames { for asx := range *availabilitySetNames { - if strings.EqualFold((*availabilitySetNames)[asx], serviceASL[sasx]) { + if strings.EqualFold((*availabilitySetNames)[asx], serviceAvailabilitySetNames[sasx]) { found = true - serviceASL[sasx] = (*availabilitySetNames)[asx] + serviceAvailabilitySetNames[sasx] = (*availabilitySetNames)[asx] break } } if !found { - glog.Errorf("az.getLoadBalancerAvailabilitySetNames - Availability set (%s) in service annotation not found", serviceASL[sasx]) - return nil, fmt.Errorf("availability set (%s) - not found", serviceASL[sasx]) + glog.Errorf("az.getLoadBalancerAvailabilitySetNames - Availability set (%s) in service annotation not found", serviceAvailabilitySetNames[sasx]) + return nil, fmt.Errorf("availability set (%s) - not found", serviceAvailabilitySetNames[sasx]) } } - availabilitySetNames = &serviceASL + availabilitySetNames = &serviceAvailabilitySetNames } return availabilitySetNames, nil @@ -244,7 +182,7 @@ func (az *Cloud) getLoadBalancerAvailabilitySetNames(service *v1.Service, nodes // lists the virtual machines for for the resource group and then builds // a list of availability sets that match the nodes available to k8s -func (az *Cloud) getAgentPoolAvailabiliySets(nodes []*v1.Node) (agentPoolAs *[]string, err error) { +func (az *Cloud) getAgentPoolAvailabiliySets(nodes []*v1.Node) (agentPoolAvailabilitySets *[]string, err error) { vms, err := az.VirtualMachineClientListWithRetry() if err != nil { glog.Errorf("az.getNodeAvailabilitySet - VirtualMachineClientListWithRetry failed, err=%v", err) @@ -258,7 +196,7 @@ func (az *Cloud) getAgentPoolAvailabiliySets(nodes []*v1.Node) (agentPoolAs *[]s } } availabilitySetIDs := sets.NewString() - agentPoolAs = &[]string{} + agentPoolAvailabilitySets = &[]string{} for nx := range nodes { nodeName := (*nodes[nx]).Name if isMasterNode(nodes[nx]) { @@ -282,10 +220,10 @@ func (az *Cloud) getAgentPoolAvailabiliySets(nodes []*v1.Node) (agentPoolAs *[]s // We want to keep it lower case, before the ID get fixed asName = strings.ToLower(asName) - *agentPoolAs = append(*agentPoolAs, asName) + *agentPoolAvailabilitySets = append(*agentPoolAvailabilitySets, asName) } - return agentPoolAs, nil + return agentPoolAvailabilitySets, nil } func (az *Cloud) mapLoadBalancerNameToAvailabilitySet(lbName string, clusterName string) (availabilitySetName string) { From 88aab6f67b6ed288db3d0785e7c06587bca9d438 Mon Sep 17 00:00:00 2001 From: Jingtao Ren Date: Thu, 16 Nov 2017 10:55:36 -0800 Subject: [PATCH 17/33] fix azure bazel BUILD --- pkg/cloudprovider/providers/azure/BUILD | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pkg/cloudprovider/providers/azure/BUILD b/pkg/cloudprovider/providers/azure/BUILD index c1d7bb6d73..acd41bdd71 100644 --- a/pkg/cloudprovider/providers/azure/BUILD +++ b/pkg/cloudprovider/providers/azure/BUILD @@ -49,6 +49,7 @@ go_library( "//vendor/k8s.io/api/core/v1:go_default_library", "//vendor/k8s.io/apimachinery/pkg/types:go_default_library", "//vendor/k8s.io/apimachinery/pkg/util/errors:go_default_library", + "//vendor/k8s.io/apimachinery/pkg/util/sets:go_default_library", "//vendor/k8s.io/apimachinery/pkg/util/wait:go_default_library", 
"//vendor/k8s.io/client-go/util/flowcontrol:go_default_library", ], @@ -61,10 +62,14 @@ go_test( library = ":go_default_library", deps = [ "//pkg/api/v1/service:go_default_library", + "//pkg/kubelet/apis:go_default_library", + "//vendor/github.com/Azure/azure-sdk-for-go/arm/compute:go_default_library", "//vendor/github.com/Azure/azure-sdk-for-go/arm/network:go_default_library", "//vendor/github.com/Azure/go-autorest/autorest/to:go_default_library", "//vendor/k8s.io/api/core/v1:go_default_library", + "//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library", "//vendor/k8s.io/apimachinery/pkg/types:go_default_library", + "//vendor/k8s.io/client-go/util/flowcontrol:go_default_library", ], ) From faec1d7f463bc5ed573454a5d5cf6617fbc0f1d1 Mon Sep 17 00:00:00 2001 From: Jingtao Ren Date: Thu, 16 Nov 2017 11:18:18 -0800 Subject: [PATCH 18/33] for error case, return nil for SG --- pkg/cloudprovider/providers/azure/azure_loadbalancer.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/cloudprovider/providers/azure/azure_loadbalancer.go b/pkg/cloudprovider/providers/azure/azure_loadbalancer.go index 72ad6cfca6..80b40050eb 100644 --- a/pkg/cloudprovider/providers/azure/azure_loadbalancer.go +++ b/pkg/cloudprovider/providers/azure/azure_loadbalancer.go @@ -798,7 +798,7 @@ func (az *Cloud) reconcileSecurityGroup(clusterName string, service *v1.Service, // Get lbIP since we make up NSG rules based on ingress IP lbIP := &lbStatus.Ingress[0].IP if lbIP == nil { - return &sg, fmt.Errorf("No load balancer IP for setting up security rules for service %s", service.Name) + return nil, fmt.Errorf("No load balancer IP for setting up security rules for service %s", service.Name) } destinationIPAddress = *lbIP } From 6b36a70d7995b337d82b4ca938a1d5679f6e9ba2 Mon Sep 17 00:00:00 2001 From: Jingtao Ren Date: Thu, 16 Nov 2017 11:25:51 -0800 Subject: [PATCH 19/33] fix fake name convention --- .../providers/azure/azure_fakes.go | 24 +++++++++---------- .../providers/azure/azure_test.go | 4 ++-- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/pkg/cloudprovider/providers/azure/azure_fakes.go b/pkg/cloudprovider/providers/azure/azure_fakes.go index 0351f4efaa..45e3eee904 100644 --- a/pkg/cloudprovider/providers/azure/azure_fakes.go +++ b/pkg/cloudprovider/providers/azure/azure_fakes.go @@ -264,20 +264,20 @@ func (fAPC fakeAzurePIPClient) List(resourceGroupName string) (result network.Pu return result, nil } -type fakeInterfacesClient struct { +type fakeAzureInterfacesClient struct { mutex *sync.Mutex FakeStore map[string]map[string]network.Interface } -func NewFakeInterfacesClient() fakeInterfacesClient { - fIC := fakeInterfacesClient{} +func NewFakeAzureInterfacesClient() fakeAzureInterfacesClient { + fIC := fakeAzureInterfacesClient{} fIC.FakeStore = make(map[string]map[string]network.Interface) fIC.mutex = &sync.Mutex{} return fIC } -func (fIC fakeInterfacesClient) CreateOrUpdate(resourceGroupName string, networkInterfaceName string, parameters network.Interface, cancel <-chan struct{}) (<-chan network.Interface, <-chan error) { +func (fIC fakeAzureInterfacesClient) CreateOrUpdate(resourceGroupName string, networkInterfaceName string, parameters network.Interface, cancel <-chan struct{}) (<-chan network.Interface, <-chan error) { fIC.mutex.Lock() defer fIC.mutex.Unlock() resultChan := make(chan network.Interface, 1) @@ -300,7 +300,7 @@ func (fIC fakeInterfacesClient) CreateOrUpdate(resourceGroupName string, network return resultChan, errChan } -func (fIC fakeInterfacesClient) 
Get(resourceGroupName string, networkInterfaceName string, expand string) (result network.Interface, err error) { +func (fIC fakeAzureInterfacesClient) Get(resourceGroupName string, networkInterfaceName string, expand string) (result network.Interface, err error) { fIC.mutex.Lock() defer fIC.mutex.Unlock() if _, ok := fIC.FakeStore[resourceGroupName]; ok { @@ -314,19 +314,19 @@ func (fIC fakeInterfacesClient) Get(resourceGroupName string, networkInterfaceNa } } -type fakeVirtualMachinesClient struct { +type fakeAzureVirtualMachinesClient struct { mutex *sync.Mutex FakeStore map[string]map[string]compute.VirtualMachine } -func NewFakeVirtualMachinesClient() fakeVirtualMachinesClient { - fVMC := fakeVirtualMachinesClient{} +func NewFakeAzureVirtualMachinesClient() fakeAzureVirtualMachinesClient { + fVMC := fakeAzureVirtualMachinesClient{} fVMC.FakeStore = make(map[string]map[string]compute.VirtualMachine) fVMC.mutex = &sync.Mutex{} return fVMC } -func (fVMC fakeVirtualMachinesClient) CreateOrUpdate(resourceGroupName string, VMName string, parameters compute.VirtualMachine, cancel <-chan struct{}) (<-chan compute.VirtualMachine, <-chan error) { +func (fVMC fakeAzureVirtualMachinesClient) CreateOrUpdate(resourceGroupName string, VMName string, parameters compute.VirtualMachine, cancel <-chan struct{}) (<-chan compute.VirtualMachine, <-chan error) { fVMC.mutex.Lock() defer fVMC.mutex.Unlock() resultChan := make(chan compute.VirtualMachine, 1) @@ -348,7 +348,7 @@ func (fVMC fakeVirtualMachinesClient) CreateOrUpdate(resourceGroupName string, V return resultChan, errChan } -func (fVMC fakeVirtualMachinesClient) Get(resourceGroupName string, VMName string, expand compute.InstanceViewTypes) (result compute.VirtualMachine, err error) { +func (fVMC fakeAzureVirtualMachinesClient) Get(resourceGroupName string, VMName string, expand compute.InstanceViewTypes) (result compute.VirtualMachine, err error) { fVMC.mutex.Lock() defer fVMC.mutex.Unlock() if _, ok := fVMC.FakeStore[resourceGroupName]; ok { @@ -362,7 +362,7 @@ func (fVMC fakeVirtualMachinesClient) Get(resourceGroupName string, VMName strin } } -func (fVMC fakeVirtualMachinesClient) List(resourceGroupName string) (result compute.VirtualMachineListResult, err error) { +func (fVMC fakeAzureVirtualMachinesClient) List(resourceGroupName string) (result compute.VirtualMachineListResult, err error) { fVMC.mutex.Lock() defer fVMC.mutex.Unlock() var value []compute.VirtualMachine @@ -378,7 +378,7 @@ func (fVMC fakeVirtualMachinesClient) List(resourceGroupName string) (result com result.Value = &value return result, nil } -func (fVMC fakeVirtualMachinesClient) ListNextResults(lastResults compute.VirtualMachineListResult) (result compute.VirtualMachineListResult, err error) { +func (fVMC fakeAzureVirtualMachinesClient) ListNextResults(lastResults compute.VirtualMachineListResult) (result compute.VirtualMachineListResult, err error) { fVMC.mutex.Lock() defer fVMC.mutex.Unlock() return compute.VirtualMachineListResult{}, nil diff --git a/pkg/cloudprovider/providers/azure/azure_test.go b/pkg/cloudprovider/providers/azure/azure_test.go index 73af15642c..51d9fa12a1 100644 --- a/pkg/cloudprovider/providers/azure/azure_test.go +++ b/pkg/cloudprovider/providers/azure/azure_test.go @@ -822,8 +822,8 @@ func getTestCloud() (az *Cloud) { az.PublicIPAddressesClient = NewFakeAzurePIPClient(az.Config.SubscriptionID) az.SubnetsClient = NewFakeAzureSubnetsClient() az.SecurityGroupsClient = NewFakeAzureNSGClient() - az.VirtualMachinesClient = NewFakeVirtualMachinesClient() - 
az.InterfacesClient = NewFakeInterfacesClient() + az.VirtualMachinesClient = NewFakeAzureVirtualMachinesClient() + az.InterfacesClient = NewFakeAzureInterfacesClient() return az } From 83f18ca3f0bace5db472555e6c493c5f5500e868 Mon Sep 17 00:00:00 2001 From: Jingtao Ren Date: Thu, 16 Nov 2017 11:33:48 -0800 Subject: [PATCH 20/33] refactor fake Delete function --- .../providers/azure/azure_fakes.go | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/pkg/cloudprovider/providers/azure/azure_fakes.go b/pkg/cloudprovider/providers/azure/azure_fakes.go index 45e3eee904..755452ac72 100644 --- a/pkg/cloudprovider/providers/azure/azure_fakes.go +++ b/pkg/cloudprovider/providers/azure/azure_fakes.go @@ -71,9 +71,9 @@ func (fLBC fakeAzureLBClient) Delete(resourceGroupName string, loadBalancerName close(respChan) close(errChan) }() - if _, ok := fLBC.FakeStore[resourceGroupName]; ok { - if _, ok := fLBC.FakeStore[resourceGroupName][loadBalancerName]; ok { - delete(fLBC.FakeStore[resourceGroupName], loadBalancerName) + if rgLBs, ok := fLBC.FakeStore[resourceGroupName]; ok { + if _, ok := rgLBs[loadBalancerName]; ok { + delete(rgLBs, loadBalancerName) resp.Response = &http.Response{ StatusCode: http.StatusAccepted, } @@ -207,9 +207,9 @@ func (fAPC fakeAzurePIPClient) Delete(resourceGroupName string, publicIPAddressN close(respChan) close(errChan) }() - if _, ok := fAPC.FakeStore[resourceGroupName]; ok { - if _, ok := fAPC.FakeStore[resourceGroupName][publicIPAddressName]; ok { - delete(fAPC.FakeStore[resourceGroupName], publicIPAddressName) + if rgPIPs, ok := fAPC.FakeStore[resourceGroupName]; ok { + if _, ok := rgPIPs[publicIPAddressName]; ok { + delete(rgPIPs, publicIPAddressName) resp.Response = &http.Response{ StatusCode: http.StatusAccepted, } @@ -434,9 +434,9 @@ func (fASC fakeAzureSubnetsClient) Delete(resourceGroupName string, virtualNetwo }() rgVnet := strings.Join([]string{resourceGroupName, virtualNetworkName}, "AND") - if _, ok := fASC.FakeStore[rgVnet]; ok { - if _, ok := fASC.FakeStore[rgVnet][subnetName]; ok { - delete(fASC.FakeStore[rgVnet], subnetName) + if rgSubnets, ok := fASC.FakeStore[rgVnet]; ok { + if _, ok := rgSubnets[subnetName]; ok { + delete(rgSubnets, subnetName) resp.Response = &http.Response{ StatusCode: http.StatusAccepted, } @@ -532,9 +532,9 @@ func (fNSG fakeAzureNSGClient) Delete(resourceGroupName string, networkSecurityG close(respChan) close(errChan) }() - if _, ok := fNSG.FakeStore[resourceGroupName]; ok { - if _, ok := fNSG.FakeStore[resourceGroupName][networkSecurityGroupName]; ok { - delete(fNSG.FakeStore[resourceGroupName], networkSecurityGroupName) + if rgSGs, ok := fNSG.FakeStore[resourceGroupName]; ok { + if _, ok := rgSGs[networkSecurityGroupName]; ok { + delete(rgSGs, networkSecurityGroupName) resp.Response = &http.Response{ StatusCode: http.StatusAccepted, } From 1bf1c0d4d5811cd459b3d9a2a7e977ded8ee82c3 Mon Sep 17 00:00:00 2001 From: Jingtao Ren Date: Thu, 16 Nov 2017 12:29:55 -0800 Subject: [PATCH 21/33] add azure_fakes.go Boilerplate header --- pkg/cloudprovider/providers/azure/azure_fakes.go | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/pkg/cloudprovider/providers/azure/azure_fakes.go b/pkg/cloudprovider/providers/azure/azure_fakes.go index 755452ac72..b5dbb1798b 100644 --- a/pkg/cloudprovider/providers/azure/azure_fakes.go +++ b/pkg/cloudprovider/providers/azure/azure_fakes.go @@ -1,3 +1,19 @@ +/* +Copyright 2017 The Kubernetes Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package azure import ( From c3050e3ab4bf1a7b24ba7778b4436a893219d85c Mon Sep 17 00:00:00 2001 From: Jingtao Ren Date: Thu, 16 Nov 2017 13:23:45 -0800 Subject: [PATCH 22/33] make newFake* functions unexported --- pkg/cloudprovider/providers/azure/azure_fakes.go | 12 ++++++------ pkg/cloudprovider/providers/azure/azure_test.go | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/pkg/cloudprovider/providers/azure/azure_fakes.go b/pkg/cloudprovider/providers/azure/azure_fakes.go index b5dbb1798b..c96c8cd869 100644 --- a/pkg/cloudprovider/providers/azure/azure_fakes.go +++ b/pkg/cloudprovider/providers/azure/azure_fakes.go @@ -36,7 +36,7 @@ type fakeAzureLBClient struct { FakeStore map[string]map[string]network.LoadBalancer } -func NewFakeAzureLBClient() fakeAzureLBClient { +func newFakeAzureLBClient() fakeAzureLBClient { fLBC := fakeAzureLBClient{} fLBC.FakeStore = make(map[string]map[string]network.LoadBalancer) fLBC.mutex = &sync.Mutex{} @@ -166,7 +166,7 @@ func getpublicIPAddressID(subscriptionID string, resourceGroupName, pipName stri pipName) } -func NewFakeAzurePIPClient(subscriptionID string) fakeAzurePIPClient { +func newFakeAzurePIPClient(subscriptionID string) fakeAzurePIPClient { fAPC := fakeAzurePIPClient{} fAPC.FakeStore = make(map[string]map[string]network.PublicIPAddress) fAPC.SubscriptionID = subscriptionID @@ -285,7 +285,7 @@ type fakeAzureInterfacesClient struct { FakeStore map[string]map[string]network.Interface } -func NewFakeAzureInterfacesClient() fakeAzureInterfacesClient { +func newFakeAzureInterfacesClient() fakeAzureInterfacesClient { fIC := fakeAzureInterfacesClient{} fIC.FakeStore = make(map[string]map[string]network.Interface) fIC.mutex = &sync.Mutex{} @@ -335,7 +335,7 @@ type fakeAzureVirtualMachinesClient struct { FakeStore map[string]map[string]compute.VirtualMachine } -func NewFakeAzureVirtualMachinesClient() fakeAzureVirtualMachinesClient { +func newFakeAzureVirtualMachinesClient() fakeAzureVirtualMachinesClient { fVMC := fakeAzureVirtualMachinesClient{} fVMC.FakeStore = make(map[string]map[string]compute.VirtualMachine) fVMC.mutex = &sync.Mutex{} @@ -405,7 +405,7 @@ type fakeAzureSubnetsClient struct { FakeStore map[string]map[string]network.Subnet } -func NewFakeAzureSubnetsClient() fakeAzureSubnetsClient { +func newFakeAzureSubnetsClient() fakeAzureSubnetsClient { fASC := fakeAzureSubnetsClient{} fASC.FakeStore = make(map[string]map[string]network.Subnet) fASC.mutex = &sync.Mutex{} @@ -506,7 +506,7 @@ type fakeAzureNSGClient struct { FakeStore map[string]map[string]network.SecurityGroup } -func NewFakeAzureNSGClient() fakeAzureNSGClient { +func newFakeAzureNSGClient() fakeAzureNSGClient { fNSG := fakeAzureNSGClient{} fNSG.FakeStore = make(map[string]map[string]network.SecurityGroup) fNSG.mutex = &sync.Mutex{} diff --git a/pkg/cloudprovider/providers/azure/azure_test.go b/pkg/cloudprovider/providers/azure/azure_test.go index 51d9fa12a1..07279227f3 100644 --- 
a/pkg/cloudprovider/providers/azure/azure_test.go +++ b/pkg/cloudprovider/providers/azure/azure_test.go @@ -818,12 +818,12 @@ func getTestCloud() (az *Cloud) { }, } az.operationPollRateLimiter = flowcontrol.NewTokenBucketRateLimiter(100, 100) - az.LoadBalancerClient = NewFakeAzureLBClient() - az.PublicIPAddressesClient = NewFakeAzurePIPClient(az.Config.SubscriptionID) - az.SubnetsClient = NewFakeAzureSubnetsClient() - az.SecurityGroupsClient = NewFakeAzureNSGClient() - az.VirtualMachinesClient = NewFakeAzureVirtualMachinesClient() - az.InterfacesClient = NewFakeAzureInterfacesClient() + az.LoadBalancerClient = newFakeAzureLBClient() + az.PublicIPAddressesClient = newFakeAzurePIPClient(az.Config.SubscriptionID) + az.SubnetsClient = newFakeAzureSubnetsClient() + az.SecurityGroupsClient = newFakeAzureNSGClient() + az.VirtualMachinesClient = newFakeAzureVirtualMachinesClient() + az.InterfacesClient = newFakeAzureInterfacesClient() return az } From ff961163aaa1994e2cf61d3d8c820ade2cf7c0b6 Mon Sep 17 00:00:00 2001 From: Jingtao Ren Date: Thu, 16 Nov 2017 15:04:08 -0800 Subject: [PATCH 23/33] clean up retry logic, since we try at least once --- .../providers/azure/azure_fakes.go | 18 +++ .../providers/azure/azure_instances.go | 15 +-- .../providers/azure/azure_loadbalancer.go | 106 +++++------------- 3 files changed, 48 insertions(+), 91 deletions(-) diff --git a/pkg/cloudprovider/providers/azure/azure_fakes.go b/pkg/cloudprovider/providers/azure/azure_fakes.go index c96c8cd869..862627c450 100644 --- a/pkg/cloudprovider/providers/azure/azure_fakes.go +++ b/pkg/cloudprovider/providers/azure/azure_fakes.go @@ -70,6 +70,9 @@ func (fLBC fakeAzureLBClient) CreateOrUpdate(resourceGroupName string, loadBalan } fLBC.FakeStore[resourceGroupName][loadBalancerName] = parameters result = fLBC.FakeStore[resourceGroupName][loadBalancerName] + result.Response.Response = &http.Response{ + StatusCode: http.StatusOK, + } err = nil return resultChan, errChan } @@ -206,6 +209,9 @@ func (fAPC fakeAzurePIPClient) CreateOrUpdate(resourceGroupName string, publicIP fAPC.FakeStore[resourceGroupName][publicIPAddressName] = parameters result = fAPC.FakeStore[resourceGroupName][publicIPAddressName] + result.Response.Response = &http.Response{ + StatusCode: http.StatusOK, + } err = nil return resultChan, errChan } @@ -311,6 +317,9 @@ func (fIC fakeAzureInterfacesClient) CreateOrUpdate(resourceGroupName string, ne } fIC.FakeStore[resourceGroupName][networkInterfaceName] = parameters result = fIC.FakeStore[resourceGroupName][networkInterfaceName] + result.Response.Response = &http.Response{ + StatusCode: http.StatusOK, + } err = nil return resultChan, errChan @@ -360,6 +369,9 @@ func (fVMC fakeAzureVirtualMachinesClient) CreateOrUpdate(resourceGroupName stri } fVMC.FakeStore[resourceGroupName][VMName] = parameters result = fVMC.FakeStore[resourceGroupName][VMName] + result.Response.Response = &http.Response{ + StatusCode: http.StatusOK, + } err = nil return resultChan, errChan } @@ -431,6 +443,9 @@ func (fASC fakeAzureSubnetsClient) CreateOrUpdate(resourceGroupName string, virt } fASC.FakeStore[rgVnet][subnetName] = subnetParameters result = fASC.FakeStore[rgVnet][subnetName] + result.Response.Response = &http.Response{ + StatusCode: http.StatusOK, + } err = nil return resultChan, errChan } @@ -531,6 +546,9 @@ func (fNSG fakeAzureNSGClient) CreateOrUpdate(resourceGroupName string, networkS } fNSG.FakeStore[resourceGroupName][networkSecurityGroupName] = parameters result = 
fNSG.FakeStore[resourceGroupName][networkSecurityGroupName] + result.Response.Response = &http.Response{ + StatusCode: http.StatusOK, + } err = nil return resultChan, errChan } diff --git a/pkg/cloudprovider/providers/azure/azure_instances.go b/pkg/cloudprovider/providers/azure/azure_instances.go index fe9ed07ae0..bde33ab323 100644 --- a/pkg/cloudprovider/providers/azure/azure_instances.go +++ b/pkg/cloudprovider/providers/azure/azure_instances.go @@ -48,19 +48,10 @@ func (az *Cloud) NodeAddresses(name types.NodeName) ([]v1.NodeAddress, error) { } return addresses, nil } - ip, err := az.getIPForMachine(name) + ip, err := az.GetIPForMachineWithRetry(name) if err != nil { - if az.CloudProviderBackoff { - glog.V(2).Infof("NodeAddresses(%s) backing off", name) - ip, err = az.GetIPForMachineWithRetry(name) - if err != nil { - glog.V(2).Infof("NodeAddresses(%s) abort backoff", name) - return nil, err - } - } else { - glog.Errorf("error: az.NodeAddresses, az.getIPForMachine(%s), err=%v", name, err) - return nil, err - } + glog.V(2).Infof("NodeAddresses(%s) abort backoff", name) + return nil, err } return []v1.NodeAddress{ diff --git a/pkg/cloudprovider/providers/azure/azure_loadbalancer.go b/pkg/cloudprovider/providers/azure/azure_loadbalancer.go index 80b40050eb..41d44e9a7b 100644 --- a/pkg/cloudprovider/providers/azure/azure_loadbalancer.go +++ b/pkg/cloudprovider/providers/azure/azure_loadbalancer.go @@ -363,22 +363,13 @@ func (az *Cloud) ensurePublicIPExists(serviceName, pipName, domainNameLabel stri pip.Tags = &map[string]*string{"service": &serviceName} glog.V(3).Infof("ensure(%s): pip(%s) - creating", serviceName, *pip.Name) az.operationPollRateLimiter.Accept() - glog.V(10).Infof("PublicIPAddressesClient.CreateOrUpdate(%q): start", *pip.Name) - respChan, errChan := az.PublicIPAddressesClient.CreateOrUpdate(az.ResourceGroup, *pip.Name, pip, nil) - resp := <-respChan - err = <-errChan - glog.V(10).Infof("PublicIPAddressesClient.CreateOrUpdate(%q): end", *pip.Name) - if az.CloudProviderBackoff && shouldRetryAPIRequest(resp.Response, err) { - glog.V(2).Infof("ensure(%s) backing off: pip(%s) - creating", serviceName, *pip.Name) - retryErr := az.CreateOrUpdatePIPWithRetry(pip) - if retryErr != nil { - glog.V(2).Infof("ensure(%s) abort backoff: pip(%s) - creating", serviceName, *pip.Name) - err = retryErr - } - } + glog.V(10).Infof("CreateOrUpdatePIPWithRetry(%q): start", *pip.Name) + err = az.CreateOrUpdatePIPWithRetry(pip) if err != nil { + glog.V(2).Infof("ensure(%s) abort backoff: pip(%s) - creating", serviceName, *pip.Name) return nil, err } + glog.V(10).Infof("CreateOrUpdatePIPWithRetry(%q): end", *pip.Name) az.operationPollRateLimiter.Accept() glog.V(10).Infof("PublicIPAddressesClient.Get(%q): start", *pip.Name) @@ -709,39 +700,17 @@ func (az *Cloud) reconcileLoadBalancer(clusterName string, service *v1.Service, az.operationPollRateLimiter.Accept() glog.V(10).Infof("LoadBalancerClient.Delete(%q): start", lbName) - respChan, errChan := az.LoadBalancerClient.Delete(az.ResourceGroup, lbName, nil) - resp := <-respChan - err := <-errChan - glog.V(10).Infof("LoadBalancerClient.Delete(%q): end", lbName) - if az.CloudProviderBackoff && shouldRetryAPIRequest(resp, err) { - glog.V(2).Infof("delete(%s) backing off: lb(%s) - deleting; no remaining frontendipconfigs", serviceName, lbName) - retryErr := az.DeleteLBWithRetry(lbName) - if retryErr != nil { - err = retryErr - glog.V(2).Infof("delete(%s) abort backoff: lb(%s) - deleting; no remaining frontendipconfigs", serviceName, lbName) - } - } + err := 
az.DeleteLBWithRetry(lbName) if err != nil { + glog.V(2).Infof("delete(%s) abort backoff: lb(%s) - deleting; no remaining frontendipconfigs", serviceName, lbName) return nil, err } - + glog.V(10).Infof("LoadBalancerClient.Delete(%q): end", lbName) } else { glog.V(3).Infof("ensure(%s): lb(%s) - updating", serviceName, lbName) - az.operationPollRateLimiter.Accept() - glog.V(10).Infof("LoadBalancerClient.CreateOrUpdate(%q): start", lbName) - respChan, errChan := az.LoadBalancerClient.CreateOrUpdate(az.ResourceGroup, lbName, *lb, nil) - resp := <-respChan - err := <-errChan - glog.V(10).Infof("LoadBalancerClient.CreateOrUpdate(%q): end", lbName) - if az.CloudProviderBackoff && shouldRetryAPIRequest(resp.Response, err) { - glog.V(2).Infof("ensure(%s) backing off: lb(%s) - updating", serviceName, lbName) - retryErr := az.CreateOrUpdateLBWithRetry(*lb) - if retryErr != nil { - glog.V(2).Infof("ensure(%s) abort backoff: lb(%s) - updating", serviceName, lbName) - return nil, retryErr - } - } + err := az.CreateOrUpdateLBWithRetry(*lb) if err != nil { + glog.V(2).Infof("ensure(%s) abort backoff: lb(%s) - updating", serviceName, lbName) return nil, err } } @@ -892,22 +861,13 @@ func (az *Cloud) reconcileSecurityGroup(clusterName string, service *v1.Service, sg.SecurityRules = &updatedRules glog.V(3).Infof("ensure(%s): sg(%s) - updating", serviceName, *sg.Name) az.operationPollRateLimiter.Accept() - glog.V(10).Infof("SecurityGroupsClient.CreateOrUpdate(%q): start", *sg.Name) - respChan, errChan := az.SecurityGroupsClient.CreateOrUpdate(az.ResourceGroup, *sg.Name, sg, nil) - resp := <-respChan - err := <-errChan - glog.V(10).Infof("SecurityGroupsClient.CreateOrUpdate(%q): end", *sg.Name) - if az.CloudProviderBackoff && shouldRetryAPIRequest(resp.Response, err) { - glog.V(2).Infof("ensure(%s) backing off: sg(%s) - updating", serviceName, *sg.Name) - retryErr := az.CreateOrUpdateSGWithRetry(sg) - if retryErr != nil { - glog.V(2).Infof("ensure(%s) abort backoff: sg(%s) - updating", serviceName, *sg.Name) - return nil, retryErr - } - } + glog.V(10).Infof("CreateOrUpdateSGWithRetry(%q): start", *sg.Name) + err := az.CreateOrUpdateSGWithRetry(sg) if err != nil { + glog.V(2).Infof("ensure(%s) abort backoff: sg(%s) - updating", serviceName, *sg.Name) return nil, err } + glog.V(10).Infof("CreateOrUpdateSGWithRetry(%q): end", *sg.Name) } return &sg, nil } @@ -938,22 +898,18 @@ func (az *Cloud) reconcilePublicIP(clusterName string, service *v1.Service, want } else { glog.V(2).Infof("ensure(%s): pip(%s) - deleting", serviceName, pipName) az.operationPollRateLimiter.Accept() - glog.V(10).Infof("PublicIPAddressesClient.Delete(%q): start", pipName) - resp, deleteErrChan := az.PublicIPAddressesClient.Delete(az.ResourceGroup, pipName, nil) - deleteErr := <-deleteErrChan - glog.V(10).Infof("PublicIPAddressesClient.Delete(%q): end", pipName) // response not read yet... 
- if az.CloudProviderBackoff && shouldRetryAPIRequest(<-resp, deleteErr) { - glog.V(2).Infof("ensure(%s) backing off: pip(%s) - deleting", serviceName, pipName) - retryErr := az.DeletePublicIPWithRetry(pipName) - if retryErr != nil { - glog.V(2).Infof("ensure(%s) abort backoff: pip(%s) - deleting", serviceName, pipName) - return nil, retryErr - } + glog.V(10).Infof("DeletePublicIPWithRetry(%q): start", pipName) + err = az.DeletePublicIPWithRetry(pipName) + if err != nil { + glog.V(2).Infof("ensure(%s) abort backoff: pip(%s) - deleting", serviceName, pipName) + // We let err to pass through + // It may be ignorable } + glog.V(10).Infof("DeletePublicIPWithRetry(%q): end", pipName) // response not read yet... - deleteErr = ignoreStatusNotFoundFromError(deleteErr) - if deleteErr != nil { - return nil, deleteErr + err = ignoreStatusNotFoundFromError(err) + if err != nil { + return nil, err } glog.V(2).Infof("ensure(%s): pip(%s) - finished", serviceName, pipName) } @@ -1007,20 +963,12 @@ func (az *Cloud) ensureHostInPool(serviceName string, nodeName types.NodeName, b vmName := mapNodeNameToVMName(nodeName) az.operationPollRateLimiter.Accept() glog.V(10).Infof("VirtualMachinesClient.Get(%q): start", vmName) - machine, err := az.VirtualMachinesClient.Get(az.ResourceGroup, vmName, "") - glog.V(10).Infof("VirtualMachinesClient.Get(%q): end", vmName) + machine, err := az.VirtualMachineClientGetWithRetry(az.ResourceGroup, vmName, "") if err != nil { - if az.CloudProviderBackoff { - glog.V(2).Infof("ensureHostInPool(%s, %s, %s) backing off", serviceName, nodeName, backendPoolID) - machine, err = az.VirtualMachineClientGetWithRetry(az.ResourceGroup, vmName, "") - if err != nil { - glog.V(2).Infof("ensureHostInPool(%s, %s, %s) abort backoff", serviceName, nodeName, backendPoolID) - return err - } - } else { - return err - } + glog.V(2).Infof("ensureHostInPool(%s, %s, %s) abort backoff", serviceName, nodeName, backendPoolID) + return err } + glog.V(10).Infof("VirtualMachinesClient.Get(%q): end", vmName) primaryNicID, err := getPrimaryInterfaceID(machine) if err != nil { From 1b9b3fd7c774576e460abe84fef980a60599a83e Mon Sep 17 00:00:00 2001 From: Jingtao Ren Date: Thu, 16 Nov 2017 16:09:08 -0800 Subject: [PATCH 24/33] assign random ip instead of hard code --- pkg/cloudprovider/providers/azure/azure_fakes.go | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/pkg/cloudprovider/providers/azure/azure_fakes.go b/pkg/cloudprovider/providers/azure/azure_fakes.go index 862627c450..98f35b1c57 100644 --- a/pkg/cloudprovider/providers/azure/azure_fakes.go +++ b/pkg/cloudprovider/providers/azure/azure_fakes.go @@ -64,7 +64,9 @@ func (fLBC fakeAzureLBClient) CreateOrUpdate(resourceGroupName string, loadBalan if parameters.FrontendIPConfigurations != nil { for idx, config := range *parameters.FrontendIPConfigurations { if config.PrivateIPAllocationMethod == network.Dynamic { - (*parameters.FrontendIPConfigurations)[idx].PrivateIPAddress = to.StringPtr("10.0.0.19") + // Here we randomly assign an ip as private ip + // It dosen't smart enough to know whether it is in the subnet's range + (*parameters.FrontendIPConfigurations)[idx].PrivateIPAddress = getRandomIPPtr() } } } @@ -202,9 +204,7 @@ func (fAPC fakeAzurePIPClient) CreateOrUpdate(resourceGroupName string, publicIP if parameters.PublicIPAddressPropertiesFormat != nil && parameters.PublicIPAddressPropertiesFormat.PublicIPAllocationMethod == network.Static { // assign ip - rand.Seed(time.Now().UnixNano()) - randomIP := 
fmt.Sprintf("%d.%d.%d.%d", rand.Intn(256), rand.Intn(256), rand.Intn(256), rand.Intn(256)) - parameters.IPAddress = &randomIP + parameters.IPAddress = getRandomIPPtr() } fAPC.FakeStore[resourceGroupName][publicIPAddressName] = parameters @@ -616,3 +616,8 @@ func (fNSG fakeAzureNSGClient) List(resourceGroupName string) (result network.Se result.Value = &value return result, nil } + +func getRandomIPPtr() *string { + rand.Seed(time.Now().UnixNano()) + return to.StringPtr(fmt.Sprintf("%d.%d.%d.%d", rand.Intn(256), rand.Intn(256), rand.Intn(256), rand.Intn(256))) +} From 839e7f4c38ad3887add2eaca7fe54d504a36b395 Mon Sep 17 00:00:00 2001 From: Jingtao Ren Date: Thu, 16 Nov 2017 16:32:02 -0800 Subject: [PATCH 25/33] add test for flipServiceInternalAnnotation --- .../providers/azure/azure_test.go | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/pkg/cloudprovider/providers/azure/azure_test.go b/pkg/cloudprovider/providers/azure/azure_test.go index 07279227f3..811cdec45c 100644 --- a/pkg/cloudprovider/providers/azure/azure_test.go +++ b/pkg/cloudprovider/providers/azure/azure_test.go @@ -40,6 +40,30 @@ import ( var testClusterName = "testCluster" +// Test flipServiceInternalAnnotation +func TestFlipServiceInternalAnnotation(t *testing.T) { + svc := getTestService("servicea", v1.ProtocolTCP, 80) + svcUpdated := flipServiceInternalAnnotation(&svc) + if !requiresInternalLoadBalancer(svcUpdated) { + t.Errorf("Expected svc to be an internal service") + } + svcUpdated = flipServiceInternalAnnotation(svcUpdated) + if requiresInternalLoadBalancer(svcUpdated) { + t.Errorf("Expected svc to be an external service") + } + + svc2 := getInternalTestService("serviceb", 8081) + svc2Updated := flipServiceInternalAnnotation(&svc2) + if requiresInternalLoadBalancer(svc2Updated) { + t.Errorf("Expected svc to be an external service") + } + + svc2Updated = flipServiceInternalAnnotation(svc2Updated) + if !requiresInternalLoadBalancer(svc2Updated) { + t.Errorf("Expected svc to be an internal service") + } +} + // Test additional of a new service/port. func TestAddPort(t *testing.T) { az := getTestCloud() From 422dac5d9be0efc59f8918293a282eb8c72163ca Mon Sep 17 00:00:00 2001 From: itowlson Date: Fri, 17 Nov 2017 14:05:51 +1300 Subject: [PATCH 26/33] Option to consolidate Azure NSG rules for services (#13) * Option to consolidate Azure NSG rules for services * Fixed panic checking for service on other Azure LB --- .../providers/azure/azure_loadbalancer.go | 336 ++++++- .../providers/azure/azure_test.go | 903 +++++++++++++++++- .../providers/azure/azure_util.go | 4 + 3 files changed, 1198 insertions(+), 45 deletions(-) diff --git a/pkg/cloudprovider/providers/azure/azure_loadbalancer.go b/pkg/cloudprovider/providers/azure/azure_loadbalancer.go index 41d44e9a7b..a4eee6e7d6 100644 --- a/pkg/cloudprovider/providers/azure/azure_loadbalancer.go +++ b/pkg/cloudprovider/providers/azure/azure_loadbalancer.go @@ -59,6 +59,14 @@ const ServiceAnnotationLoadBalancerAutoModeValue = "__auto__" // ServiceAnnotationDNSLabelName annotation speficying the DNS label name for the service. const ServiceAnnotationDNSLabelName = "service.beta.kubernetes.io/azure-dns-label-name" +// ServiceAnnotationSharedSecurityRule is the annotation used on the service +// to specify that the service should be exposed using an Azure security rule +// that may be shared with other service, trading specificity of rules for an +// increase in the number of services that can be exposed. 
This relies on the +// Azure "augmented security rules" feature which at the time of writing is in +// preview and available only in certain regions. +const ServiceAnnotationSharedSecurityRule = "service.beta.kubernetes.io/azure-shared-securityrule" + // GetLoadBalancer returns whether the specified load balancer exists, and // if so, what its status is. func (az *Cloud) GetLoadBalancer(clusterName string, service *v1.Service) (status *v1.LoadBalancerStatus, exists bool, err error) { @@ -107,7 +115,12 @@ func (az *Cloud) EnsureLoadBalancer(clusterName string, service *v1.Service, nod return nil, err } - if _, err := az.reconcileSecurityGroup(clusterName, service, lbStatus, true /* wantLb */); err != nil { + var serviceIP *string + if lbStatus != nil && len(lbStatus.Ingress) > 0 { + serviceIP = &lbStatus.Ingress[0].IP + } + glog.V(10).Infof("Calling reconcileSecurityGroup from EnsureLoadBalancer for %s with IP %s, wantLb = true", service.Name, logSafe(serviceIP)) + if _, err := az.reconcileSecurityGroup(clusterName, service, serviceIP, true /* wantLb */); err != nil { return nil, err } @@ -127,9 +140,17 @@ func (az *Cloud) UpdateLoadBalancer(clusterName string, service *v1.Service, nod // have multiple underlying components, meaning a Get could say that the LB // doesn't exist even if some part of it is still laying around. func (az *Cloud) EnsureLoadBalancerDeleted(clusterName string, service *v1.Service) error { + isInternal := requiresInternalLoadBalancer(service) serviceName := getServiceName(service) glog.V(5).Infof("delete(%s): START clusterName=%q", serviceName, clusterName) - if _, err := az.reconcileSecurityGroup(clusterName, service, nil, false /* wantLb */); err != nil { + + serviceIPToCleanup, err := az.findServiceIPAddress(clusterName, service, isInternal) + if err != nil { + return err + } + + glog.V(10).Infof("Calling reconcileSecurityGroup from EnsureLoadBalancerDeleted for %s with IP %s, wantLb = false", service.Name, serviceIPToCleanup) + if _, err := az.reconcileSecurityGroup(clusterName, service, &serviceIPToCleanup, false /* wantLb */); err != nil { return err } @@ -331,6 +352,9 @@ func (az *Cloud) determinePublicIPName(clusterName string, service *v1.Service) func flipServiceInternalAnnotation(service *v1.Service) *v1.Service { copyService := service.DeepCopy() + if copyService.Annotations == nil { + copyService.Annotations = map[string]string{} + } if v, ok := copyService.Annotations[ServiceAnnotationLoadBalancerInternal]; ok && v == "true" { // If it is internal now, we make it external by remove the annotation delete(copyService.Annotations, ServiceAnnotationLoadBalancerInternal) @@ -341,6 +365,25 @@ func flipServiceInternalAnnotation(service *v1.Service) *v1.Service { return copyService } +func (az *Cloud) findServiceIPAddress(clusterName string, service *v1.Service, isInternalLb bool) (string, error) { + if len(service.Spec.LoadBalancerIP) > 0 { + return service.Spec.LoadBalancerIP, nil + } + + lbStatus, existsLb, err := az.GetLoadBalancer(clusterName, service) + if err != nil { + return "", err + } + if !existsLb { + return "", fmt.Errorf("Expected to find an IP address for service %s but did not", service.Name) + } + if len(lbStatus.Ingress) < 1 { + return "", fmt.Errorf("Expected to find an IP address for service %s but it had no ingresses", service.Name) + } + + return lbStatus.Ingress[0].IP, nil +} + func (az *Cloud) ensurePublicIPExists(serviceName, pipName, domainNameLabel string) (*network.PublicIPAddress, error) { pip, existsPip, err := 
az.getPublicIPAddress(pipName) if err != nil { @@ -744,16 +787,19 @@ func (az *Cloud) reconcileLoadBalancer(clusterName string, service *v1.Service, // This reconciles the Network Security Group similar to how the LB is reconciled. // This entails adding required, missing SecurityRules and removing stale rules. -func (az *Cloud) reconcileSecurityGroup(clusterName string, service *v1.Service, lbStatus *v1.LoadBalancerStatus, wantLb bool) (*network.SecurityGroup, error) { +func (az *Cloud) reconcileSecurityGroup(clusterName string, service *v1.Service, lbIP *string, wantLb bool) (*network.SecurityGroup, error) { serviceName := getServiceName(service) - glog.V(5).Infof("ensure(%s): START clusterName=%q lbName=%q", serviceName, clusterName) + glog.V(5).Infof("reconcileSecurityGroup(%s): START clusterName=%q lbName=%q", serviceName, clusterName) - var ports []v1.ServicePort - if wantLb { - ports = service.Spec.Ports - } else { + ports := service.Spec.Ports + if ports == nil { + if useSharedSecurityRule(service) { + glog.V(2).Infof("Attempting to reconcile security group for service %s, but service uses shared rule and we don't know which port it's for", service.Name) + return nil, fmt.Errorf("No port info for reconciling shared rule for service %s", service.Name) + } ports = []v1.ServicePort{} } + az.operationPollRateLimiter.Accept() glog.V(10).Infof("SecurityGroupsClient.Get(%q): start", az.SecurityGroupName) sg, err := az.SecurityGroupsClient.Get(az.ResourceGroup, az.SecurityGroupName, "") @@ -763,12 +809,10 @@ func (az *Cloud) reconcileSecurityGroup(clusterName string, service *v1.Service, } destinationIPAddress := "" - if wantLb { - // Get lbIP since we make up NSG rules based on ingress IP - lbIP := &lbStatus.Ingress[0].IP - if lbIP == nil { - return nil, fmt.Errorf("No load balancer IP for setting up security rules for service %s", service.Name) - } + if wantLb && lbIP == nil { + return nil, fmt.Errorf("No load balancer IP for setting up security rules for service %s", service.Name) + } + if lbIP != nil { destinationIPAddress = *lbIP } if destinationIPAddress == "" { @@ -789,38 +833,52 @@ func (az *Cloud) reconcileSecurityGroup(clusterName string, service *v1.Service, sourceAddressPrefixes = append(sourceAddressPrefixes, ip.String()) } } - expectedSecurityRules := make([]network.SecurityRule, len(ports)*len(sourceAddressPrefixes)) + expectedSecurityRules := []network.SecurityRule{} - for i, port := range ports { - _, securityProto, _, err := getProtocolsFromKubernetesProtocol(port.Protocol) - if err != nil { - return nil, err - } - for j := range sourceAddressPrefixes { - ix := i*len(sourceAddressPrefixes) + j - securityRuleName := getSecurityRuleName(service, port, sourceAddressPrefixes[j]) - expectedSecurityRules[ix] = network.SecurityRule{ - Name: to.StringPtr(securityRuleName), - SecurityRulePropertiesFormat: &network.SecurityRulePropertiesFormat{ - Protocol: *securityProto, - SourcePortRange: to.StringPtr("*"), - DestinationPortRange: to.StringPtr(strconv.Itoa(int(port.Port))), - SourceAddressPrefix: to.StringPtr(sourceAddressPrefixes[j]), - DestinationAddressPrefix: to.StringPtr(destinationIPAddress), - Access: network.SecurityRuleAccessAllow, - Direction: network.SecurityRuleDirectionInbound, - }, + if wantLb { + expectedSecurityRules = make([]network.SecurityRule, len(ports)*len(sourceAddressPrefixes)) + + for i, port := range ports { + _, securityProto, _, err := getProtocolsFromKubernetesProtocol(port.Protocol) + if err != nil { + return nil, err + } + for j := range 
sourceAddressPrefixes { + ix := i*len(sourceAddressPrefixes) + j + securityRuleName := getSecurityRuleName(service, port, sourceAddressPrefixes[j]) + expectedSecurityRules[ix] = network.SecurityRule{ + Name: to.StringPtr(securityRuleName), + SecurityRulePropertiesFormat: &network.SecurityRulePropertiesFormat{ + Protocol: *securityProto, + SourcePortRange: to.StringPtr("*"), + DestinationPortRange: to.StringPtr(strconv.Itoa(int(port.Port))), + SourceAddressPrefix: to.StringPtr(sourceAddressPrefixes[j]), + DestinationAddressPrefix: to.StringPtr(destinationIPAddress), + Access: network.SecurityRuleAccessAllow, + Direction: network.SecurityRuleDirectionInbound, + }, + } } } } + for _, r := range expectedSecurityRules { + glog.V(10).Infof("Expecting security rule for %s: %s:%s -> %s:%s", service.Name, *r.SourceAddressPrefix, *r.SourcePortRange, *r.DestinationAddressPrefix, *r.DestinationPortRange) + } + // update security rules dirtySg := false var updatedRules []network.SecurityRule if sg.SecurityRules != nil { updatedRules = *sg.SecurityRules } - // update security rules: remove unwanted + + for _, r := range updatedRules { + glog.V(10).Infof("Existing security rule while processing %s: %s:%s -> %s:%s", service.Name, logSafe(r.SourceAddressPrefix), logSafe(r.SourcePortRange), logSafeCollection(r.DestinationAddressPrefix, r.DestinationAddressPrefixes), logSafe(r.DestinationPortRange)) + } + + // update security rules: remove unwanted rules that belong privately + // to this service for i := len(updatedRules) - 1; i >= 0; i-- { existingRule := updatedRules[i] if serviceOwnsRule(service, *existingRule.Name) { @@ -837,6 +895,50 @@ func (az *Cloud) reconcileSecurityGroup(clusterName string, service *v1.Service, } } } + // update security rules: if the service uses a shared rule and is being deleted, + // then remove it from the shared rule + if useSharedSecurityRule(service) && !wantLb { + for _, port := range ports { + for _, sourceAddressPrefix := range sourceAddressPrefixes { + sharedRuleName := getSecurityRuleName(service, port, sourceAddressPrefix) + sharedIndex, sharedRule, sharedRuleFound := findSecurityRuleByName(updatedRules, sharedRuleName) + if !sharedRuleFound { + glog.V(4).Infof("Expected to find shared rule %s for service %s being deleted, but did not", sharedRuleName, service.Name) + return nil, fmt.Errorf("Expected to find shared rule %s for service %s being deleted, but did not", sharedRuleName, service.Name) + } + if sharedRule.DestinationAddressPrefixes == nil { + glog.V(4).Infof("Expected to have array of destinations in shared rule for service %s being deleted, but did not", service.Name) + return nil, fmt.Errorf("Expected to have array of destinations in shared rule for service %s being deleted, but did not", service.Name) + } + existingPrefixes := *sharedRule.DestinationAddressPrefixes + addressIndex, found := findIndex(existingPrefixes, destinationIPAddress) + if !found { + glog.V(4).Infof("Expected to find destination address %s in shared rule %s for service %s being deleted, but did not", destinationIPAddress, sharedRuleName, service.Name) + return nil, fmt.Errorf("Expected to find destination address %s in shared rule %s for service %s being deleted, but did not", destinationIPAddress, sharedRuleName, service.Name) + } + if len(existingPrefixes) == 1 { + updatedRules = append(updatedRules[:sharedIndex], updatedRules[sharedIndex+1:]...) + } else { + newDestinations := append(existingPrefixes[:addressIndex], existingPrefixes[addressIndex+1:]...) 
+ sharedRule.DestinationAddressPrefixes = &newDestinations + updatedRules[sharedIndex] = sharedRule + } + dirtySg = true + } + } + } + + // update security rules: prepare rules for consolidation + for index, rule := range updatedRules { + if allowsConsolidation(rule) { + updatedRules[index] = makeConsolidatable(rule) + } + } + for index, rule := range expectedSecurityRules { + if allowsConsolidation(rule) { + expectedSecurityRules[index] = makeConsolidatable(rule) + } + } // update security rules: add needed for _, expectedRule := range expectedSecurityRules { foundRule := false @@ -844,6 +946,11 @@ func (az *Cloud) reconcileSecurityGroup(clusterName string, service *v1.Service, glog.V(10).Infof("reconcile(%s)(%t): sg rule(%s) - already exists", serviceName, wantLb, *expectedRule.Name) foundRule = true } + if foundRule && allowsConsolidation(expectedRule) { + index, _ := findConsolidationCandidate(updatedRules, expectedRule) + updatedRules[index] = consolidate(updatedRules[index], expectedRule) + dirtySg = true + } if !foundRule { glog.V(10).Infof("reconcile(%s)(%t): sg rule(%s) - adding", serviceName, wantLb, *expectedRule.Name) @@ -857,6 +964,11 @@ func (az *Cloud) reconcileSecurityGroup(clusterName string, service *v1.Service, dirtySg = true } } + + for _, r := range updatedRules { + glog.V(10).Infof("Updated security rule while processing %s: %s:%s -> %s:%s", service.Name, logSafe(r.SourceAddressPrefix), logSafe(r.SourcePortRange), logSafeCollection(r.DestinationAddressPrefix, r.DestinationAddressPrefixes), logSafe(r.DestinationPortRange)) + } + if dirtySg { sg.SecurityRules = &updatedRules glog.V(3).Infof("ensure(%s): sg(%s) - updating", serviceName, *sg.Name) @@ -865,6 +977,14 @@ func (az *Cloud) reconcileSecurityGroup(clusterName string, service *v1.Service, err := az.CreateOrUpdateSGWithRetry(sg) if err != nil { glog.V(2).Infof("ensure(%s) abort backoff: sg(%s) - updating", serviceName, *sg.Name) + // TODO (Nov 2017): remove when augmented security rules are out of preview + // we could try to parse the response but it's not worth it for bridging a preview + errorDescription := err.Error() + if strings.Contains(errorDescription, "SubscriptionNotRegisteredForFeature") && strings.Contains(errorDescription, "Microsoft.Network/AllowAccessRuleExtendedProperties") { + sharedRuleError := fmt.Errorf("Shared security rules are not available in this Azure region. 
Details: %v", errorDescription) + return nil, sharedRuleError + } + // END TODO return nil, err } glog.V(10).Infof("CreateOrUpdateSGWithRetry(%q): end", *sg.Name) @@ -872,6 +992,144 @@ func (az *Cloud) reconcileSecurityGroup(clusterName string, service *v1.Service, return &sg, nil } +func logSafe(s *string) string { + if s == nil { + return "(nil)" + } + return *s +} + +func logSafeCollection(s *string, strs *[]string) string { + if s == nil { + if strs == nil { + return "(nil)" + } + return "[" + strings.Join(*strs, ",") + "]" + } + return *s +} + +func findSecurityRuleByName(rules []network.SecurityRule, ruleName string) (int, network.SecurityRule, bool) { + for index, rule := range rules { + if rule.Name != nil && strings.EqualFold(*rule.Name, ruleName) { + return index, rule, true + } + } + return 0, network.SecurityRule{}, false +} + +func findIndex(strs []string, s string) (int, bool) { + for index, str := range strs { + if strings.EqualFold(str, s) { + return index, true + } + } + return 0, false +} + +func allowsConsolidation(rule network.SecurityRule) bool { + return strings.HasPrefix(*rule.Name, "shared") +} + +func findConsolidationCandidate(rules []network.SecurityRule, rule network.SecurityRule) (int, bool) { + for index, r := range rules { + if allowsConsolidation(r) { + if strings.EqualFold(*r.Name, *rule.Name) { + return index, true + } + } + } + + return 0, false +} + +func makeConsolidatable(rule network.SecurityRule) network.SecurityRule { + return network.SecurityRule{ + Name: rule.Name, + SecurityRulePropertiesFormat: &network.SecurityRulePropertiesFormat{ + Priority: rule.Priority, + Protocol: rule.Protocol, + SourcePortRange: rule.SourcePortRange, + SourcePortRanges: rule.SourcePortRanges, + DestinationPortRange: rule.DestinationPortRange, + DestinationPortRanges: rule.DestinationPortRanges, + SourceAddressPrefix: rule.SourceAddressPrefix, + SourceAddressPrefixes: rule.SourceAddressPrefixes, + DestinationAddressPrefixes: collectionOrSingle(rule.DestinationAddressPrefixes, rule.DestinationAddressPrefix), + Access: rule.Access, + Direction: rule.Direction, + }, + } +} + +func consolidate(existingRule network.SecurityRule, newRule network.SecurityRule) network.SecurityRule { + destinations := appendElements(existingRule.SecurityRulePropertiesFormat.DestinationAddressPrefixes, newRule.DestinationAddressPrefix, newRule.DestinationAddressPrefixes) + destinations = deduplicate(destinations) // there are transient conditions during controller startup where it tries to add a service that is already added + + return network.SecurityRule{ + Name: existingRule.Name, + SecurityRulePropertiesFormat: &network.SecurityRulePropertiesFormat{ + Priority: existingRule.Priority, + Protocol: existingRule.Protocol, + SourcePortRange: existingRule.SourcePortRange, + SourcePortRanges: existingRule.SourcePortRanges, + DestinationPortRange: existingRule.DestinationPortRange, + DestinationPortRanges: existingRule.DestinationPortRanges, + SourceAddressPrefix: existingRule.SourceAddressPrefix, + SourceAddressPrefixes: existingRule.SourceAddressPrefixes, + DestinationAddressPrefixes: destinations, + Access: existingRule.Access, + Direction: existingRule.Direction, + }, + } +} + +func collectionOrSingle(collection *[]string, s *string) *[]string { + if collection != nil && len(*collection) > 0 { + return collection + } + if s == nil { + return &[]string{} + } + return &[]string{*s} +} + +func appendElements(collection *[]string, appendString *string, appendStrings *[]string) *[]string { + 
newCollection := []string{} + + if collection != nil { + newCollection = append(newCollection, *collection...) + } + if appendString != nil { + newCollection = append(newCollection, *appendString) + } + if appendStrings != nil { + newCollection = append(newCollection, *appendStrings...) + } + + return &newCollection +} + +func deduplicate(collection *[]string) *[]string { + if collection == nil { + return nil + } + + seen := map[string]bool{} + result := make([]string, 0, len(*collection)) + + for _, v := range *collection { + if seen[v] == true { + // skip this element + } else { + seen[v] = true + result = append(result, v) + } + } + + return &result +} + // This reconciles the PublicIP resources similar to how the LB is reconciled. func (az *Cloud) reconcilePublicIP(clusterName string, service *v1.Service, wantLb bool) (*network.PublicIPAddress, error) { isInternal := requiresInternalLoadBalancer(service) @@ -1087,3 +1345,11 @@ func getServiceLoadBalancerMode(service *v1.Service) (hasMode bool, isAuto bool, return hasMode, isAuto, availabilitySetNames } + +func useSharedSecurityRule(service *v1.Service) bool { + if l, ok := service.Annotations[ServiceAnnotationSharedSecurityRule]; ok { + return l == "true" + } + + return false +} diff --git a/pkg/cloudprovider/providers/azure/azure_test.go b/pkg/cloudprovider/providers/azure/azure_test.go index 811cdec45c..db6fac8d3f 100644 --- a/pkg/cloudprovider/providers/azure/azure_test.go +++ b/pkg/cloudprovider/providers/azure/azure_test.go @@ -660,7 +660,7 @@ func TestReconcileSecurityGroupNewServiceAddsPort(t *testing.T) { lb, _ := az.reconcileLoadBalancer(testClusterName, &svc1, clusterResources.nodes, true) lbStatus, _ := az.getServiceLoadBalancerStatus(&svc1, lb) - sg, err := az.reconcileSecurityGroup(testClusterName, &svc1, lbStatus, true /* wantLb */) + sg, err := az.reconcileSecurityGroup(testClusterName, &svc1, &lbStatus.Ingress[0].IP, true /* wantLb */) if err != nil { t.Errorf("Unexpected error: %q", err) } @@ -677,7 +677,7 @@ func TestReconcileSecurityGroupNewInternalServiceAddsPort(t *testing.T) { lb, _ := az.reconcileLoadBalancer(testClusterName, &svc1, clusterResources.nodes, true) lbStatus, _ := az.getServiceLoadBalancerStatus(&svc1, lb) - sg, err := az.reconcileSecurityGroup(testClusterName, &svc1, lbStatus, true /* wantLb */) + sg, err := az.reconcileSecurityGroup(testClusterName, &svc1, &lbStatus.Ingress[0].IP, true /* wantLb */) if err != nil { t.Errorf("Unexpected error: %q", err) } @@ -699,7 +699,7 @@ func TestReconcileSecurityGroupRemoveService(t *testing.T) { sg := getTestSecurityGroup(az, service1, service2) validateSecurityGroup(t, sg, service1, service2) - sg, err := az.reconcileSecurityGroup(testClusterName, &service1, lbStatus, false /* wantLb */) + sg, err := az.reconcileSecurityGroup(testClusterName, &service1, &lbStatus.Ingress[0].IP, false /* wantLb */) if err != nil { t.Errorf("Unexpected error: %q", err) } @@ -717,7 +717,7 @@ func TestReconcileSecurityGroupRemoveServiceRemovesPort(t *testing.T) { lb, _ := az.reconcileLoadBalancer(testClusterName, &svc, clusterResources.nodes, true) lbStatus, _ := az.getServiceLoadBalancerStatus(&svc, lb) - sg, err := az.reconcileSecurityGroup(testClusterName, &svcUpdated, lbStatus, true /* wantLb */) + sg, err := az.reconcileSecurityGroup(testClusterName, &svcUpdated, &lbStatus.Ingress[0].IP, true /* wantLb */) if err != nil { t.Errorf("Unexpected error: %q", err) } @@ -738,7 +738,7 @@ func TestReconcileSecurityWithSourceRanges(t *testing.T) { lb, _ := 
az.reconcileLoadBalancer(testClusterName, &svc, clusterResources.nodes, true) lbStatus, _ := az.getServiceLoadBalancerStatus(&svc, lb) - sg, err := az.reconcileSecurityGroup(testClusterName, &svc, lbStatus, true /* wantLb */) + sg, err := az.reconcileSecurityGroup(testClusterName, &svc, &lbStatus.Ingress[0].IP, true /* wantLb */) if err != nil { t.Errorf("Unexpected error: %q", err) } @@ -1266,19 +1266,73 @@ func validatePublicIP(t *testing.T, publicIP *network.PublicIPAddress, service * // Becuase service properties are updated outside of cloudprovider code } +func contains(ruleValues []string, targetValue string) bool { + for _, ruleValue := range ruleValues { + if strings.EqualFold(ruleValue, targetValue) { + return true + } + } + return false +} + +func securityRuleMatches(serviceSourceRange string, servicePort v1.ServicePort, serviceIP string, securityRule network.SecurityRule) error { + ruleSource := securityRule.SourceAddressPrefixes + if ruleSource == nil || len(*ruleSource) == 0 { + if securityRule.SourceAddressPrefix == nil { + ruleSource = &[]string{} + } else { + ruleSource = &[]string{*securityRule.SourceAddressPrefix} + } + } + + rulePorts := securityRule.DestinationPortRanges + if rulePorts == nil || len(*rulePorts) == 0 { + if securityRule.DestinationPortRange == nil { + rulePorts = &[]string{} + } else { + rulePorts = &[]string{*securityRule.DestinationPortRange} + } + } + + ruleDestination := securityRule.DestinationAddressPrefixes + if ruleDestination == nil || len(*ruleDestination) == 0 { + if securityRule.DestinationAddressPrefix == nil { + ruleDestination = &[]string{} + } else { + ruleDestination = &[]string{*securityRule.DestinationAddressPrefix} + } + } + + if !contains(*ruleSource, serviceSourceRange) { + return fmt.Errorf("Rule does not contain source %s", serviceSourceRange) + } + + if !contains(*rulePorts, fmt.Sprintf("%d", servicePort.Port)) { + return fmt.Errorf("Rule does not contain port %d", servicePort.Port) + } + + if serviceIP != "" && !contains(*ruleDestination, serviceIP) { + return fmt.Errorf("Rule does not contain destination %s", serviceIP) + } + + return nil +} + func validateSecurityGroup(t *testing.T, securityGroup *network.SecurityGroup, services ...v1.Service) { - expectedRuleCount := 0 + seenRules := make(map[string]string) for _, svc := range services { for _, wantedRule := range svc.Spec.Ports { sources := getServiceSourceRanges(&svc) for _, source := range sources { wantedRuleName := getSecurityRuleName(&svc, wantedRule, source) - expectedRuleCount++ + seenRules[wantedRuleName] = wantedRuleName foundRule := false for _, actualRule := range *securityGroup.SecurityRules { - if strings.EqualFold(*actualRule.Name, wantedRuleName) && - *actualRule.SourceAddressPrefix == source && - *actualRule.DestinationPortRange == fmt.Sprintf("%d", wantedRule.Port) { + if strings.EqualFold(*actualRule.Name, wantedRuleName) { + err := securityRuleMatches(source, wantedRule, svc.Spec.LoadBalancerIP, actualRule) + if err != nil { + t.Errorf("Found matching security rule %q but properties were incorrect: %v", wantedRuleName, err) + } foundRule = true break } @@ -1291,6 +1345,7 @@ func validateSecurityGroup(t *testing.T, securityGroup *network.SecurityGroup, s } lenRules := len(*securityGroup.SecurityRules) + expectedRuleCount := len(seenRules) if lenRules != expectedRuleCount { t.Errorf("Expected the loadbalancer to have %d rules. 
Found %d.\n", expectedRuleCount, lenRules) } @@ -1698,3 +1753,831 @@ func addTestSubnet(t *testing.T, az *Cloud, svc *v1.Service) { } svc.Annotations[ServiceAnnotationLoadBalancerInternalSubnet] = subName } + +func TestIfServiceSpecifiesSharedRuleAndRuleDoesNotExistItIsCreated(t *testing.T) { + az := getTestCloud() + svc := getTestService("servicesr", v1.ProtocolTCP, 80) + svc.Spec.LoadBalancerIP = "192.168.77.88" + svc.Annotations[ServiceAnnotationSharedSecurityRule] = "true" + + sg := getTestSecurityGroup(az) + + sg, err := az.reconcileSecurityGroup(testClusterName, &svc, to.StringPtr(svc.Spec.LoadBalancerIP), true) + if err != nil { + t.Errorf("Unexpected error: %q", err) + } + + validateSecurityGroup(t, sg, svc) + + expectedRuleName := "shared-TCP-80-Internet" + _, securityRule, ruleFound := findSecurityRuleByName(*sg.SecurityRules, expectedRuleName) + if !ruleFound { + t.Fatalf("Expected security rule %q but it was not present", expectedRuleName) + } + + err = securityRuleMatches("Internet", v1.ServicePort{Port: 80}, "192.168.77.88", securityRule) + if err != nil { + t.Errorf("Shared rule was not updated with new service IP: %v", err) + } + + if securityRule.Priority == nil { + t.Errorf("Shared rule %s had no priority", expectedRuleName) + } + + if securityRule.Access != network.SecurityRuleAccessAllow { + t.Errorf("Shared rule %s did not have Allow access", expectedRuleName) + } + + if securityRule.Direction != network.SecurityRuleDirectionInbound { + t.Errorf("Shared rule %s did not have Inbound direction", expectedRuleName) + } +} + +func TestIfServiceSpecifiesSharedRuleAndRuleExistsThenTheServicesPortAndAddressAreAdded(t *testing.T) { + az := getTestCloud() + svc := getTestService("servicesr", v1.ProtocolTCP, 80) + svc.Spec.LoadBalancerIP = "192.168.77.88" + svc.Annotations[ServiceAnnotationSharedSecurityRule] = "true" + + expectedRuleName := "shared-TCP-80-Internet" + + sg := getTestSecurityGroup(az) + sg.SecurityRules = &[]network.SecurityRule{ + { + Name: &expectedRuleName, + SecurityRulePropertiesFormat: &network.SecurityRulePropertiesFormat{ + Protocol: network.SecurityRuleProtocolTCP, + SourcePortRange: to.StringPtr("*"), + SourceAddressPrefix: to.StringPtr("Internet"), + DestinationPortRange: to.StringPtr("80"), + DestinationAddressPrefix: to.StringPtr("192.168.33.44"), + Access: network.SecurityRuleAccessAllow, + Direction: network.SecurityRuleDirectionInbound, + }, + }, + } + + sg, err := az.reconcileSecurityGroup(testClusterName, &svc, to.StringPtr(svc.Spec.LoadBalancerIP), true) + if err != nil { + t.Errorf("Unexpected error: %q", err) + } + + validateSecurityGroup(t, sg, svc) + + _, securityRule, ruleFound := findSecurityRuleByName(*sg.SecurityRules, expectedRuleName) + if !ruleFound { + t.Fatalf("Expected security rule %q but it was not present", expectedRuleName) + } + + expectedDestinationIPCount := 2 + if len(*securityRule.DestinationAddressPrefixes) != expectedDestinationIPCount { + t.Errorf("Shared rule should have had %d destination IP addresses but had %d", expectedDestinationIPCount, len(*securityRule.DestinationAddressPrefixes)) + } + + err = securityRuleMatches("Internet", v1.ServicePort{Port: 80}, "192.168.33.44", securityRule) + if err != nil { + t.Errorf("Shared rule no longer matched other service IP: %v", err) + } + + err = securityRuleMatches("Internet", v1.ServicePort{Port: 80}, "192.168.77.88", securityRule) + if err != nil { + t.Errorf("Shared rule was not updated with new service IP: %v", err) + } +} + +func 
TestIfServicesSpecifySharedRuleButDifferentPortsThenSeparateRulesAreCreated(t *testing.T) { + az := getTestCloud() + + svc1 := getTestService("servicesr1", v1.ProtocolTCP, 4444) + svc1.Spec.LoadBalancerIP = "192.168.77.88" + svc1.Annotations[ServiceAnnotationSharedSecurityRule] = "true" + + svc2 := getTestService("servicesr2", v1.ProtocolTCP, 8888) + svc2.Spec.LoadBalancerIP = "192.168.33.44" + svc2.Annotations[ServiceAnnotationSharedSecurityRule] = "true" + + expectedRuleName1 := "shared-TCP-4444-Internet" + expectedRuleName2 := "shared-TCP-8888-Internet" + + sg := getTestSecurityGroup(az) + + sg, err := az.reconcileSecurityGroup(testClusterName, &svc1, to.StringPtr(svc1.Spec.LoadBalancerIP), true) + if err != nil { + t.Errorf("Unexpected error adding svc1: %q", err) + } + + sg, err = az.reconcileSecurityGroup(testClusterName, &svc2, to.StringPtr(svc2.Spec.LoadBalancerIP), true) + if err != nil { + t.Errorf("Unexpected error adding svc2: %q", err) + } + + validateSecurityGroup(t, sg, svc1, svc2) + + _, securityRule1, rule1Found := findSecurityRuleByName(*sg.SecurityRules, expectedRuleName1) + if !rule1Found { + t.Fatalf("Expected security rule %q but it was not present", expectedRuleName1) + } + + _, securityRule2, rule2Found := findSecurityRuleByName(*sg.SecurityRules, expectedRuleName2) + if !rule2Found { + t.Fatalf("Expected security rule %q but it was not present", expectedRuleName2) + } + + expectedDestinationIPCount1 := 1 + if len(*securityRule1.DestinationAddressPrefixes) != expectedDestinationIPCount1 { + t.Errorf("Shared rule %s should have had %d destination IP addresses but had %d", expectedRuleName1, expectedDestinationIPCount1, len(*securityRule1.DestinationAddressPrefixes)) + } + + err = securityRuleMatches("Internet", v1.ServicePort{Port: 4444}, "192.168.77.88", securityRule1) + if err != nil { + t.Errorf("Shared rule %s did not match service IP: %v", expectedRuleName1, err) + } + + err = securityRuleMatches("Internet", v1.ServicePort{Port: 8888}, "192.168.33.44", securityRule1) + if err == nil { + t.Errorf("Shared rule %s matched wrong service's port and IP", expectedRuleName1) + } + + expectedDestinationIPCount2 := 1 + if len(*securityRule2.DestinationAddressPrefixes) != expectedDestinationIPCount2 { + t.Errorf("Shared rule %s should have had %d destination IP addresses but had %d", expectedRuleName2, expectedDestinationIPCount2, len(*securityRule2.DestinationAddressPrefixes)) + } + + err = securityRuleMatches("Internet", v1.ServicePort{Port: 8888}, "192.168.33.44", securityRule2) + if err != nil { + t.Errorf("Shared rule %s did not match service IP: %v", expectedRuleName2, err) + } + + err = securityRuleMatches("Internet", v1.ServicePort{Port: 4444}, "192.168.77.88", securityRule2) + if err == nil { + t.Errorf("Shared rule %s matched wrong service's port and IP", expectedRuleName2) + } +} + +func TestIfServicesSpecifySharedRuleButDifferentProtocolsThenSeparateRulesAreCreated(t *testing.T) { + az := getTestCloud() + + svc1 := getTestService("servicesr1", v1.ProtocolTCP, 4444) + svc1.Spec.LoadBalancerIP = "192.168.77.88" + svc1.Annotations[ServiceAnnotationSharedSecurityRule] = "true" + + svc2 := getTestService("servicesr2", v1.ProtocolUDP, 4444) + svc2.Spec.LoadBalancerIP = "192.168.77.88" + svc2.Annotations[ServiceAnnotationSharedSecurityRule] = "true" + + expectedRuleName1 := "shared-TCP-4444-Internet" + expectedRuleName2 := "shared-UDP-4444-Internet" + + sg := getTestSecurityGroup(az) + + sg, err := az.reconcileSecurityGroup(testClusterName, &svc1, 
to.StringPtr(svc1.Spec.LoadBalancerIP), true) + if err != nil { + t.Errorf("Unexpected error adding svc1: %q", err) + } + + sg, err = az.reconcileSecurityGroup(testClusterName, &svc2, to.StringPtr(svc2.Spec.LoadBalancerIP), true) + if err != nil { + t.Errorf("Unexpected error adding svc2: %q", err) + } + + validateSecurityGroup(t, sg, svc1, svc2) + + _, securityRule1, rule1Found := findSecurityRuleByName(*sg.SecurityRules, expectedRuleName1) + if !rule1Found { + t.Fatalf("Expected security rule %q but it was not present", expectedRuleName1) + } + + _, securityRule2, rule2Found := findSecurityRuleByName(*sg.SecurityRules, expectedRuleName2) + if !rule2Found { + t.Fatalf("Expected security rule %q but it was not present", expectedRuleName2) + } + + expectedDestinationIPCount1 := 1 + if len(*securityRule1.DestinationAddressPrefixes) != expectedDestinationIPCount1 { + t.Errorf("Shared rule %s should have had %d destination IP addresses but had %d", expectedRuleName1, expectedDestinationIPCount1, len(*securityRule1.DestinationAddressPrefixes)) + } + + err = securityRuleMatches("Internet", v1.ServicePort{Port: 4444}, "192.168.77.88", securityRule1) + if err != nil { + t.Errorf("Shared rule %s did not match service IP: %v", expectedRuleName1, err) + } + + if securityRule1.Protocol != network.SecurityRuleProtocolTCP { + t.Errorf("Shared rule %s should have been %s but was %s", expectedRuleName1, network.SecurityRuleProtocolTCP, securityRule1.Protocol) + } + + expectedDestinationIPCount2 := 1 + if len(*securityRule2.DestinationAddressPrefixes) != expectedDestinationIPCount2 { + t.Errorf("Shared rule %s should have had %d destination IP addresses but had %d", expectedRuleName2, expectedDestinationIPCount2, len(*securityRule2.DestinationAddressPrefixes)) + } + + err = securityRuleMatches("Internet", v1.ServicePort{Port: 4444}, "192.168.77.88", securityRule2) + if err != nil { + t.Errorf("Shared rule %s did not match service IP: %v", expectedRuleName2, err) + } + + if securityRule2.Protocol != network.SecurityRuleProtocolUDP { + t.Errorf("Shared rule %s should have been %s but was %s", expectedRuleName2, network.SecurityRuleProtocolUDP, securityRule2.Protocol) + } +} + +func TestIfServicesSpecifySharedRuleButDifferentSourceAddressesThenSeparateRulesAreCreated(t *testing.T) { + az := getTestCloud() + + svc1 := getTestService("servicesr1", v1.ProtocolTCP, 80) + svc1.Spec.LoadBalancerIP = "192.168.77.88" + svc1.Spec.LoadBalancerSourceRanges = []string{"192.168.12.0/24"} + svc1.Annotations[ServiceAnnotationSharedSecurityRule] = "true" + + svc2 := getTestService("servicesr2", v1.ProtocolTCP, 80) + svc2.Spec.LoadBalancerIP = "192.168.33.44" + svc2.Spec.LoadBalancerSourceRanges = []string{"192.168.34.0/24"} + svc2.Annotations[ServiceAnnotationSharedSecurityRule] = "true" + + expectedRuleName1 := "shared-TCP-80-192.168.12.0_24" + expectedRuleName2 := "shared-TCP-80-192.168.34.0_24" + + sg := getTestSecurityGroup(az) + + sg, err := az.reconcileSecurityGroup(testClusterName, &svc1, to.StringPtr(svc1.Spec.LoadBalancerIP), true) + if err != nil { + t.Errorf("Unexpected error adding svc1: %q", err) + } + + sg, err = az.reconcileSecurityGroup(testClusterName, &svc2, to.StringPtr(svc2.Spec.LoadBalancerIP), true) + if err != nil { + t.Errorf("Unexpected error adding svc2: %q", err) + } + + validateSecurityGroup(t, sg, svc1, svc2) + + _, securityRule1, rule1Found := findSecurityRuleByName(*sg.SecurityRules, expectedRuleName1) + if !rule1Found { + t.Fatalf("Expected security rule %q but it was not present", 
expectedRuleName1) + } + + _, securityRule2, rule2Found := findSecurityRuleByName(*sg.SecurityRules, expectedRuleName2) + if !rule2Found { + t.Fatalf("Expected security rule %q but it was not present", expectedRuleName2) + } + + expectedDestinationIPCount1 := 1 + if len(*securityRule1.DestinationAddressPrefixes) != expectedDestinationIPCount1 { + t.Errorf("Shared rule %s should have had %d destination IP addresses but had %d", expectedRuleName1, expectedDestinationIPCount1, len(*securityRule1.DestinationAddressPrefixes)) + } + + err = securityRuleMatches(svc1.Spec.LoadBalancerSourceRanges[0], v1.ServicePort{Port: 80}, "192.168.77.88", securityRule1) + if err != nil { + t.Errorf("Shared rule %s did not match service IP: %v", expectedRuleName1, err) + } + + err = securityRuleMatches(svc2.Spec.LoadBalancerSourceRanges[0], v1.ServicePort{Port: 80}, "192.168.33.44", securityRule1) + if err == nil { + t.Errorf("Shared rule %s matched wrong service's port and IP", expectedRuleName1) + } + + expectedDestinationIPCount2 := 1 + if len(*securityRule2.DestinationAddressPrefixes) != expectedDestinationIPCount2 { + t.Errorf("Shared rule %s should have had %d destination IP addresses but had %d", expectedRuleName2, expectedDestinationIPCount2, len(*securityRule2.DestinationAddressPrefixes)) + } + + err = securityRuleMatches(svc2.Spec.LoadBalancerSourceRanges[0], v1.ServicePort{Port: 80}, "192.168.33.44", securityRule2) + if err != nil { + t.Errorf("Shared rule %s did not match service IP: %v", expectedRuleName2, err) + } + + err = securityRuleMatches(svc1.Spec.LoadBalancerSourceRanges[0], v1.ServicePort{Port: 80}, "192.168.77.88", securityRule2) + if err == nil { + t.Errorf("Shared rule %s matched wrong service's port and IP", expectedRuleName2) + } +} + +func TestIfServicesSpecifySharedRuleButSomeAreOnDifferentPortsThenRulesAreSeparatedOrConsoliatedByPort(t *testing.T) { + az := getTestCloud() + + svc1 := getTestService("servicesr1", v1.ProtocolTCP, 4444) + svc1.Spec.LoadBalancerIP = "192.168.77.88" + svc1.Annotations[ServiceAnnotationSharedSecurityRule] = "true" + + svc2 := getTestService("servicesr2", v1.ProtocolTCP, 8888) + svc2.Spec.LoadBalancerIP = "192.168.33.44" + svc2.Annotations[ServiceAnnotationSharedSecurityRule] = "true" + + svc3 := getTestService("servicesr3", v1.ProtocolTCP, 4444) + svc3.Spec.LoadBalancerIP = "192.168.99.11" + svc3.Annotations[ServiceAnnotationSharedSecurityRule] = "true" + + expectedRuleName13 := "shared-TCP-4444-Internet" + expectedRuleName2 := "shared-TCP-8888-Internet" + + sg := getTestSecurityGroup(az) + + sg, err := az.reconcileSecurityGroup(testClusterName, &svc1, to.StringPtr(svc1.Spec.LoadBalancerIP), true) + if err != nil { + t.Errorf("Unexpected error adding svc1: %q", err) + } + + sg, err = az.reconcileSecurityGroup(testClusterName, &svc2, to.StringPtr(svc2.Spec.LoadBalancerIP), true) + if err != nil { + t.Errorf("Unexpected error adding svc2: %q", err) + } + + sg, err = az.reconcileSecurityGroup(testClusterName, &svc3, to.StringPtr(svc3.Spec.LoadBalancerIP), true) + if err != nil { + t.Errorf("Unexpected error adding svc3: %q", err) + } + + validateSecurityGroup(t, sg, svc1, svc2, svc3) + + _, securityRule13, rule13Found := findSecurityRuleByName(*sg.SecurityRules, expectedRuleName13) + if !rule13Found { + t.Fatalf("Expected security rule %q but it was not present", expectedRuleName13) + } + + _, securityRule2, rule2Found := findSecurityRuleByName(*sg.SecurityRules, expectedRuleName2) + if !rule2Found { + t.Fatalf("Expected security rule %q but it was not 
present", expectedRuleName2) + } + + expectedDestinationIPCount13 := 2 + if len(*securityRule13.DestinationAddressPrefixes) != expectedDestinationIPCount13 { + t.Errorf("Shared rule %s should have had %d destination IP addresses but had %d", expectedRuleName13, expectedDestinationIPCount13, len(*securityRule13.DestinationAddressPrefixes)) + } + + err = securityRuleMatches("Internet", v1.ServicePort{Port: 4444}, "192.168.77.88", securityRule13) + if err != nil { + t.Errorf("Shared rule %s did not match service IP: %v", expectedRuleName13, err) + } + + err = securityRuleMatches("Internet", v1.ServicePort{Port: 4444}, "192.168.99.11", securityRule13) + if err != nil { + t.Errorf("Shared rule %s did not match service IP: %v", expectedRuleName13, err) + } + + err = securityRuleMatches("Internet", v1.ServicePort{Port: 8888}, "192.168.33.44", securityRule13) + if err == nil { + t.Errorf("Shared rule %s matched wrong service's port and IP", expectedRuleName13) + } + + if securityRule13.Priority == nil { + t.Errorf("Shared rule %s had no priority", expectedRuleName13) + } + + if securityRule13.Access != network.SecurityRuleAccessAllow { + t.Errorf("Shared rule %s did not have Allow access", expectedRuleName13) + } + + if securityRule13.Direction != network.SecurityRuleDirectionInbound { + t.Errorf("Shared rule %s did not have Inbound direction", expectedRuleName13) + } + + expectedDestinationIPCount2 := 1 + if len(*securityRule2.DestinationAddressPrefixes) != expectedDestinationIPCount2 { + t.Errorf("Shared rule %s should have had %d destination IP addresses but had %d", expectedRuleName2, expectedDestinationIPCount2, len(*securityRule2.DestinationAddressPrefixes)) + } + + err = securityRuleMatches("Internet", v1.ServicePort{Port: 8888}, "192.168.33.44", securityRule2) + if err != nil { + t.Errorf("Shared rule %s did not match service IP: %v", expectedRuleName2, err) + } + + err = securityRuleMatches("Internet", v1.ServicePort{Port: 4444}, "192.168.77.88", securityRule2) + if err == nil { + t.Errorf("Shared rule %s matched wrong service's port and IP", expectedRuleName2) + } + + err = securityRuleMatches("Internet", v1.ServicePort{Port: 4444}, "192.168.99.11", securityRule2) + if err == nil { + t.Errorf("Shared rule %s matched wrong service's port and IP", expectedRuleName2) + } +} + +func TestIfServiceSpecifiesSharedRuleAndServiceIsDeletedThenTheServicesPortAndAddressAreRemoved(t *testing.T) { + az := getTestCloud() + + svc1 := getTestService("servicesr1", v1.ProtocolTCP, 80) + svc1.Spec.LoadBalancerIP = "192.168.77.88" + svc1.Annotations[ServiceAnnotationSharedSecurityRule] = "true" + + svc2 := getTestService("servicesr2", v1.ProtocolTCP, 80) + svc2.Spec.LoadBalancerIP = "192.168.33.44" + svc2.Annotations[ServiceAnnotationSharedSecurityRule] = "true" + + expectedRuleName := "shared-TCP-80-Internet" + + sg := getTestSecurityGroup(az) + + sg, err := az.reconcileSecurityGroup(testClusterName, &svc1, to.StringPtr(svc1.Spec.LoadBalancerIP), true) + if err != nil { + t.Errorf("Unexpected error adding svc1: %q", err) + } + + sg, err = az.reconcileSecurityGroup(testClusterName, &svc2, to.StringPtr(svc2.Spec.LoadBalancerIP), true) + if err != nil { + t.Errorf("Unexpected error adding svc2: %q", err) + } + + validateSecurityGroup(t, sg, svc1, svc2) + + sg, err = az.reconcileSecurityGroup(testClusterName, &svc1, to.StringPtr(svc1.Spec.LoadBalancerIP), false) + if err != nil { + t.Errorf("Unexpected error removing svc1: %q", err) + } + + validateSecurityGroup(t, sg, svc2) + + _, securityRule, ruleFound := 
findSecurityRuleByName(*sg.SecurityRules, expectedRuleName) + if !ruleFound { + t.Fatalf("Expected security rule %q but it was not present", expectedRuleName) + } + + expectedDestinationIPCount := 1 + if len(*securityRule.DestinationAddressPrefixes) != expectedDestinationIPCount { + t.Errorf("Shared rule should have had %d destination IP addresses but had %d", expectedDestinationIPCount, len(*securityRule.DestinationAddressPrefixes)) + } + + err = securityRuleMatches("Internet", v1.ServicePort{Port: 80}, "192.168.33.44", securityRule) + if err != nil { + t.Errorf("Shared rule no longer matched other service IP: %v", err) + } + + err = securityRuleMatches("Internet", v1.ServicePort{Port: 80}, "192.168.77.88", securityRule) + if err == nil { + t.Error("Shared rule was not updated to remove deleted service IP") + } +} + +func TestIfSomeServicesShareARuleAndOneIsDeletedItIsRemovedFromTheRightRule(t *testing.T) { + az := getTestCloud() + + svc1 := getTestService("servicesr1", v1.ProtocolTCP, 4444) + svc1.Spec.LoadBalancerIP = "192.168.77.88" + svc1.Annotations[ServiceAnnotationSharedSecurityRule] = "true" + + svc2 := getTestService("servicesr2", v1.ProtocolTCP, 8888) + svc2.Spec.LoadBalancerIP = "192.168.33.44" + svc2.Annotations[ServiceAnnotationSharedSecurityRule] = "true" + + svc3 := getTestService("servicesr3", v1.ProtocolTCP, 4444) + svc3.Spec.LoadBalancerIP = "192.168.99.11" + svc3.Annotations[ServiceAnnotationSharedSecurityRule] = "true" + + expectedRuleName13 := "shared-TCP-4444-Internet" + expectedRuleName2 := "shared-TCP-8888-Internet" + + sg := getTestSecurityGroup(az) + + sg, err := az.reconcileSecurityGroup(testClusterName, &svc1, to.StringPtr(svc1.Spec.LoadBalancerIP), true) + if err != nil { + t.Errorf("Unexpected error adding svc1: %q", err) + } + + sg, err = az.reconcileSecurityGroup(testClusterName, &svc2, to.StringPtr(svc2.Spec.LoadBalancerIP), true) + if err != nil { + t.Errorf("Unexpected error adding svc2: %q", err) + } + + sg, err = az.reconcileSecurityGroup(testClusterName, &svc3, to.StringPtr(svc3.Spec.LoadBalancerIP), true) + if err != nil { + t.Errorf("Unexpected error adding svc3: %q", err) + } + + validateSecurityGroup(t, sg, svc1, svc2, svc3) + + sg, err = az.reconcileSecurityGroup(testClusterName, &svc1, to.StringPtr(svc1.Spec.LoadBalancerIP), false) + if err != nil { + t.Errorf("Unexpected error removing svc1: %q", err) + } + + validateSecurityGroup(t, sg, svc2, svc3) + + _, securityRule13, rule13Found := findSecurityRuleByName(*sg.SecurityRules, expectedRuleName13) + if !rule13Found { + t.Fatalf("Expected security rule %q but it was not present", expectedRuleName13) + } + + _, securityRule2, rule2Found := findSecurityRuleByName(*sg.SecurityRules, expectedRuleName2) + if !rule2Found { + t.Fatalf("Expected security rule %q but it was not present", expectedRuleName2) + } + + expectedDestinationIPCount13 := 1 + if len(*securityRule13.DestinationAddressPrefixes) != expectedDestinationIPCount13 { + t.Errorf("Shared rule %s should have had %d destination IP addresses but had %d", expectedRuleName13, expectedDestinationIPCount13, len(*securityRule13.DestinationAddressPrefixes)) + } + + err = securityRuleMatches("Internet", v1.ServicePort{Port: 4444}, "192.168.77.88", securityRule13) + if err == nil { + t.Errorf("Shared rule %s should have had svc1 removed but did not", expectedRuleName13) + } + + err = securityRuleMatches("Internet", v1.ServicePort{Port: 4444}, "192.168.99.11", securityRule13) + if err != nil { + t.Errorf("Shared rule %s did not match service IP: %v", 
expectedRuleName13, err) + } + + err = securityRuleMatches("Internet", v1.ServicePort{Port: 8888}, "192.168.33.44", securityRule13) + if err == nil { + t.Errorf("Shared rule %s matched wrong service's port and IP", expectedRuleName13) + } + + if securityRule13.Priority == nil { + t.Errorf("Shared rule %s had no priority", expectedRuleName13) + } + + if securityRule13.Access != network.SecurityRuleAccessAllow { + t.Errorf("Shared rule %s did not have Allow access", expectedRuleName13) + } + + if securityRule13.Direction != network.SecurityRuleDirectionInbound { + t.Errorf("Shared rule %s did not have Inbound direction", expectedRuleName13) + } + + expectedDestinationIPCount2 := 1 + if len(*securityRule2.DestinationAddressPrefixes) != expectedDestinationIPCount2 { + t.Errorf("Shared rule %s should have had %d destination IP addresses but had %d", expectedRuleName2, expectedDestinationIPCount2, len(*securityRule2.DestinationAddressPrefixes)) + } + + err = securityRuleMatches("Internet", v1.ServicePort{Port: 8888}, "192.168.33.44", securityRule2) + if err != nil { + t.Errorf("Shared rule %s did not match service IP: %v", expectedRuleName2, err) + } + + err = securityRuleMatches("Internet", v1.ServicePort{Port: 4444}, "192.168.77.88", securityRule2) + if err == nil { + t.Errorf("Shared rule %s matched wrong service's port and IP", expectedRuleName2) + } + + err = securityRuleMatches("Internet", v1.ServicePort{Port: 4444}, "192.168.99.11", securityRule2) + if err == nil { + t.Errorf("Shared rule %s matched wrong service's port and IP", expectedRuleName2) + } +} + +func TestIfServiceSpecifiesSharedRuleAndLastServiceIsDeletedThenRuleIsDeleted(t *testing.T) { + az := getTestCloud() + + svc1 := getTestService("servicesr1", v1.ProtocolTCP, 4444) + svc1.Spec.LoadBalancerIP = "192.168.77.88" + svc1.Annotations[ServiceAnnotationSharedSecurityRule] = "true" + + svc2 := getTestService("servicesr2", v1.ProtocolTCP, 8888) + svc2.Spec.LoadBalancerIP = "192.168.33.44" + svc2.Annotations[ServiceAnnotationSharedSecurityRule] = "true" + + svc3 := getTestService("servicesr3", v1.ProtocolTCP, 4444) + svc3.Spec.LoadBalancerIP = "192.168.99.11" + svc3.Annotations[ServiceAnnotationSharedSecurityRule] = "true" + + expectedRuleName13 := "shared-TCP-4444-Internet" + expectedRuleName2 := "shared-TCP-8888-Internet" + + sg := getTestSecurityGroup(az) + + sg, err := az.reconcileSecurityGroup(testClusterName, &svc1, to.StringPtr(svc1.Spec.LoadBalancerIP), true) + if err != nil { + t.Errorf("Unexpected error adding svc1: %q", err) + } + + sg, err = az.reconcileSecurityGroup(testClusterName, &svc2, to.StringPtr(svc2.Spec.LoadBalancerIP), true) + if err != nil { + t.Errorf("Unexpected error adding svc2: %q", err) + } + + sg, err = az.reconcileSecurityGroup(testClusterName, &svc3, to.StringPtr(svc3.Spec.LoadBalancerIP), true) + if err != nil { + t.Errorf("Unexpected error adding svc3: %q", err) + } + + validateSecurityGroup(t, sg, svc1, svc2, svc3) + + sg, err = az.reconcileSecurityGroup(testClusterName, &svc1, to.StringPtr(svc1.Spec.LoadBalancerIP), false) + if err != nil { + t.Errorf("Unexpected error removing svc1: %q", err) + } + + sg, err = az.reconcileSecurityGroup(testClusterName, &svc3, to.StringPtr(svc3.Spec.LoadBalancerIP), false) + if err != nil { + t.Errorf("Unexpected error removing svc3: %q", err) + } + + validateSecurityGroup(t, sg, svc2) + + _, _, rule13Found := findSecurityRuleByName(*sg.SecurityRules, expectedRuleName13) + if rule13Found { + t.Fatalf("Expected security rule %q to have been deleted but it was 
still present", expectedRuleName13) + } + + _, securityRule2, rule2Found := findSecurityRuleByName(*sg.SecurityRules, expectedRuleName2) + if !rule2Found { + t.Fatalf("Expected security rule %q but it was not present", expectedRuleName2) + } + + expectedDestinationIPCount2 := 1 + if len(*securityRule2.DestinationAddressPrefixes) != expectedDestinationIPCount2 { + t.Errorf("Shared rule %s should have had %d destination IP addresses but had %d", expectedRuleName2, expectedDestinationIPCount2, len(*securityRule2.DestinationAddressPrefixes)) + } + + err = securityRuleMatches("Internet", v1.ServicePort{Port: 8888}, "192.168.33.44", securityRule2) + if err != nil { + t.Errorf("Shared rule %s did not match service IP: %v", expectedRuleName2, err) + } + + err = securityRuleMatches("Internet", v1.ServicePort{Port: 4444}, "192.168.77.88", securityRule2) + if err == nil { + t.Errorf("Shared rule %s matched wrong service's port and IP", expectedRuleName2) + } + + err = securityRuleMatches("Internet", v1.ServicePort{Port: 4444}, "192.168.99.11", securityRule2) + if err == nil { + t.Errorf("Shared rule %s matched wrong service's port and IP", expectedRuleName2) + } +} + +func TestCanCombineSharedAndPrivateRulesInSameGroup(t *testing.T) { + az := getTestCloud() + + svc1 := getTestService("servicesr1", v1.ProtocolTCP, 4444) + svc1.Spec.LoadBalancerIP = "192.168.77.88" + svc1.Annotations[ServiceAnnotationSharedSecurityRule] = "true" + + svc2 := getTestService("servicesr2", v1.ProtocolTCP, 8888) + svc2.Spec.LoadBalancerIP = "192.168.33.44" + svc2.Annotations[ServiceAnnotationSharedSecurityRule] = "true" + + svc3 := getTestService("servicesr3", v1.ProtocolTCP, 4444) + svc3.Spec.LoadBalancerIP = "192.168.99.11" + svc3.Annotations[ServiceAnnotationSharedSecurityRule] = "true" + + svc4 := getTestService("servicesr4", v1.ProtocolTCP, 4444) + svc4.Spec.LoadBalancerIP = "192.168.22.33" + svc4.Annotations[ServiceAnnotationSharedSecurityRule] = "false" + + svc5 := getTestService("servicesr5", v1.ProtocolTCP, 8888) + svc5.Spec.LoadBalancerIP = "192.168.22.33" + svc5.Annotations[ServiceAnnotationSharedSecurityRule] = "false" + + expectedRuleName13 := "shared-TCP-4444-Internet" + expectedRuleName2 := "shared-TCP-8888-Internet" + expectedRuleName4 := getSecurityRuleName(&svc4, v1.ServicePort{Port: 4444, Protocol: v1.ProtocolTCP}, "Internet") + expectedRuleName5 := getSecurityRuleName(&svc5, v1.ServicePort{Port: 8888, Protocol: v1.ProtocolTCP}, "Internet") + + sg := getTestSecurityGroup(az) + + sg, err := az.reconcileSecurityGroup(testClusterName, &svc1, to.StringPtr(svc1.Spec.LoadBalancerIP), true) + if err != nil { + t.Errorf("Unexpected error adding svc1: %q", err) + } + + sg, err = az.reconcileSecurityGroup(testClusterName, &svc2, to.StringPtr(svc2.Spec.LoadBalancerIP), true) + if err != nil { + t.Errorf("Unexpected error adding svc2: %q", err) + } + + sg, err = az.reconcileSecurityGroup(testClusterName, &svc3, to.StringPtr(svc3.Spec.LoadBalancerIP), true) + if err != nil { + t.Errorf("Unexpected error adding svc3: %q", err) + } + + sg, err = az.reconcileSecurityGroup(testClusterName, &svc4, to.StringPtr(svc4.Spec.LoadBalancerIP), true) + if err != nil { + t.Errorf("Unexpected error adding svc4: %q", err) + } + + sg, err = az.reconcileSecurityGroup(testClusterName, &svc5, to.StringPtr(svc5.Spec.LoadBalancerIP), true) + if err != nil { + t.Errorf("Unexpected error adding svc4: %q", err) + } + + validateSecurityGroup(t, sg, svc1, svc2, svc3, svc4, svc5) + + expectedRuleCount := 4 + if len(*sg.SecurityRules) != 
expectedRuleCount { + t.Errorf("Expected security group to have %d rules but it had %d", expectedRuleCount, len(*sg.SecurityRules)) + } + + _, securityRule13, rule13Found := findSecurityRuleByName(*sg.SecurityRules, expectedRuleName13) + if !rule13Found { + t.Fatalf("Expected security rule %q but it was not present", expectedRuleName13) + } + + _, securityRule2, rule2Found := findSecurityRuleByName(*sg.SecurityRules, expectedRuleName2) + if !rule2Found { + t.Fatalf("Expected security rule %q but it was not present", expectedRuleName2) + } + + _, securityRule4, rule4Found := findSecurityRuleByName(*sg.SecurityRules, expectedRuleName4) + if !rule4Found { + t.Fatalf("Expected security rule %q but it was not present", expectedRuleName4) + } + + _, securityRule5, rule5Found := findSecurityRuleByName(*sg.SecurityRules, expectedRuleName5) + if !rule5Found { + t.Fatalf("Expected security rule %q but it was not present", expectedRuleName5) + } + + expectedDestinationIPCount13 := 2 + if len(*securityRule13.DestinationAddressPrefixes) != expectedDestinationIPCount13 { + t.Errorf("Shared rule %s should have had %d destination IP addresses but had %d", expectedRuleName13, expectedDestinationIPCount13, len(*securityRule13.DestinationAddressPrefixes)) + } + + err = securityRuleMatches("Internet", v1.ServicePort{Port: 4444}, "192.168.77.88", securityRule13) + if err != nil { + t.Errorf("Shared rule %s did not match service IP: %v", expectedRuleName13, err) + } + + err = securityRuleMatches("Internet", v1.ServicePort{Port: 4444}, "192.168.99.11", securityRule13) + if err != nil { + t.Errorf("Shared rule %s did not match service IP: %v", expectedRuleName13, err) + } + + err = securityRuleMatches("Internet", v1.ServicePort{Port: 4444}, "192.168.22.33", securityRule13) + if err == nil { + t.Errorf("Shared rule %s matched wrong (unshared) service's port and IP", expectedRuleName13) + } + + expectedDestinationIPCount2 := 1 + if len(*securityRule2.DestinationAddressPrefixes) != expectedDestinationIPCount2 { + t.Errorf("Shared rule %s should have had %d destination IP addresses but had %d", expectedRuleName2, expectedDestinationIPCount2, len(*securityRule2.DestinationAddressPrefixes)) + } + + err = securityRuleMatches("Internet", v1.ServicePort{Port: 8888}, "192.168.33.44", securityRule2) + if err != nil { + t.Errorf("Shared rule %s did not match service IP: %v", expectedRuleName2, err) + } + + err = securityRuleMatches("Internet", v1.ServicePort{Port: 8888}, "192.168.22.33", securityRule2) + if err == nil { + t.Errorf("Shared rule %s matched wrong (unshared) service's port and IP", expectedRuleName2) + } + + if securityRule4.DestinationAddressPrefixes != nil { + t.Errorf("Expected unshared rule %s to use single destination IP address but used collection", expectedRuleName4) + } + + if securityRule4.DestinationAddressPrefix == nil { + t.Errorf("Expected unshared rule %s to have a destination IP address", expectedRuleName4) + } else { + if !strings.EqualFold(*securityRule4.DestinationAddressPrefix, svc4.Spec.LoadBalancerIP) { + t.Errorf("Expected unshared rule %s to have a destination %s but had %s", expectedRuleName4, svc4.Spec.LoadBalancerIP, *securityRule4.DestinationAddressPrefix) + } + } + + if securityRule5.DestinationAddressPrefixes != nil { + t.Errorf("Expected unshared rule %s to use single destination IP address but used collection", expectedRuleName5) + } + + if securityRule5.DestinationAddressPrefix == nil { + t.Errorf("Expected unshared rule %s to have a destination IP address", expectedRuleName5) + } 
else { + if !strings.EqualFold(*securityRule5.DestinationAddressPrefix, svc5.Spec.LoadBalancerIP) { + t.Errorf("Expected unshared rule %s to have a destination %s but had %s", expectedRuleName5, svc5.Spec.LoadBalancerIP, *securityRule5.DestinationAddressPrefix) + } + } + + sg, err = az.reconcileSecurityGroup(testClusterName, &svc1, to.StringPtr(svc1.Spec.LoadBalancerIP), false) + if err != nil { + t.Errorf("Unexpected error removing svc1: %q", err) + } + + sg, err = az.reconcileSecurityGroup(testClusterName, &svc5, to.StringPtr(svc5.Spec.LoadBalancerIP), false) + if err != nil { + t.Errorf("Unexpected error removing svc5: %q", err) + } + + _, securityRule13, rule13Found = findSecurityRuleByName(*sg.SecurityRules, expectedRuleName13) + if !rule13Found { + t.Fatalf("Expected security rule %q but it was not present", expectedRuleName13) + } + + _, securityRule2, rule2Found = findSecurityRuleByName(*sg.SecurityRules, expectedRuleName2) + if !rule2Found { + t.Fatalf("Expected security rule %q but it was not present", expectedRuleName2) + } + + _, securityRule4, rule4Found = findSecurityRuleByName(*sg.SecurityRules, expectedRuleName4) + if !rule4Found { + t.Fatalf("Expected security rule %q but it was not present", expectedRuleName4) + } + + _, _, rule5Found = findSecurityRuleByName(*sg.SecurityRules, expectedRuleName5) + if rule5Found { + t.Fatalf("Expected security rule %q to have been removed but it was not present", expectedRuleName5) + } + + expectedDestinationIPCount13 = 1 + if len(*securityRule13.DestinationAddressPrefixes) != expectedDestinationIPCount13 { + t.Errorf("Shared rule %s should have had %d destination IP addresses but had %d", expectedRuleName13, expectedDestinationIPCount13, len(*securityRule13.DestinationAddressPrefixes)) + } +} + +// TODO: sanity check if the same IP address incorrectly gets put in twice? +// (shouldn't happen but...) 
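The consolidation behavior exercised by the shared-rule tests above can be seen in isolation with a minimal, self-contained sketch. It reuses the appendElements and deduplicate helpers introduced earlier in this patch, but on plain string slices rather than the Azure SDK's destination-prefix fields; the sample IPs simply echo the ones used in these tests and are otherwise arbitrary.

package main

import (
	"fmt"
	"strings"
)

// appendElements mirrors the helper added in this patch: it merges an existing
// destination collection with an optional single prefix and an optional slice
// of prefixes, returning a new collection.
func appendElements(collection *[]string, appendString *string, appendStrings *[]string) *[]string {
	newCollection := []string{}
	if collection != nil {
		newCollection = append(newCollection, *collection...)
	}
	if appendString != nil {
		newCollection = append(newCollection, *appendString)
	}
	if appendStrings != nil {
		newCollection = append(newCollection, *appendStrings...)
	}
	return &newCollection
}

// deduplicate mirrors the patch's helper: it drops repeated entries while
// preserving first-seen order.
func deduplicate(collection *[]string) *[]string {
	if collection == nil {
		return nil
	}
	seen := map[string]bool{}
	result := make([]string, 0, len(*collection))
	for _, v := range *collection {
		if !seen[v] {
			seen[v] = true
			result = append(result, v)
		}
	}
	return &result
}

func main() {
	// An existing shared rule already covers one service's load balancer IP.
	existing := &[]string{"192.168.33.44"}

	// A second service carrying the shared-rule annotation is reconciled; its
	// LoadBalancerIP is appended, and a repeated reconcile of the same IP is
	// collapsed by deduplicate.
	newIP := "192.168.77.88"
	merged := deduplicate(appendElements(existing, &newIP, &[]string{"192.168.77.88"}))

	fmt.Println(strings.Join(*merged, ",")) // prints: 192.168.33.44,192.168.77.88
}

Because reconcileSecurityGroup can see the same service more than once during controller startup (the transient condition noted in the consolidate helper), deduplicate keeps the shared rule's DestinationAddressPrefixes free of repeated entries while preserving the order in which service IPs were first added.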
+ +// func TestIfServiceIsEditedFromOwnRuleToSharedRuleThenOwnRuleIsDeletedAndSharedRuleIsCreated(t *testing.T) { +// t.Error() +// } + +// func TestIfServiceIsEditedFromSharedRuleToOwnRuleThenItIsRemovedFromSharedRuleAndOwnRuleIsCreated(t *testing.T) { +// t.Error() +// } diff --git a/pkg/cloudprovider/providers/azure/azure_util.go b/pkg/cloudprovider/providers/azure/azure_util.go index 04ff821e76..7d2aa565c4 100644 --- a/pkg/cloudprovider/providers/azure/azure_util.go +++ b/pkg/cloudprovider/providers/azure/azure_util.go @@ -340,6 +340,10 @@ func getLoadBalancerRuleName(service *v1.Service, port v1.ServicePort, subnetNam } func getSecurityRuleName(service *v1.Service, port v1.ServicePort, sourceAddrPrefix string) string { + if useSharedSecurityRule(service) { + safePrefix := strings.Replace(sourceAddrPrefix, "/", "_", -1) + return fmt.Sprintf("shared-%s-%d-%s", port.Protocol, port.Port, safePrefix) + } safePrefix := strings.Replace(sourceAddrPrefix, "/", "_", -1) return fmt.Sprintf("%s-%s-%d-%s", getRulePrefix(service), port.Protocol, port.Port, safePrefix) } From 1e3ec2b639d8dfffe6ecdfa8775d4558f7f4c174 Mon Sep 17 00:00:00 2001 From: Jingtao Ren Date: Fri, 17 Nov 2017 09:33:05 -0800 Subject: [PATCH 27/33] correct doc for reconcileSecurityGroup --- pkg/cloudprovider/providers/azure/azure_loadbalancer.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pkg/cloudprovider/providers/azure/azure_loadbalancer.md b/pkg/cloudprovider/providers/azure/azure_loadbalancer.md index 05a560b75b..141d066cf1 100644 --- a/pkg/cloudprovider/providers/azure/azure_loadbalancer.md +++ b/pkg/cloudprovider/providers/azure/azure_loadbalancer.md @@ -22,8 +22,10 @@ Service Annotation for Auto and specific load balancer mode - Call az cloud to CreateOrUpdate on this lb, or Delete if nothing left - return lb, err -- reconcileSecurityGroup(clusterName string, service *v1.Service, lbStatus *v1.LoadBalancerStatus, wantLb bool) (*network.SecurityGroup, error) +- reconcileSecurityGroup(clusterName string, service *v1.Service, lbIP *string, wantLb bool) (*network.SecurityGroup, error) - Go though NSG' properties, update based on wantLb + - Use destinationIPAddress as target address if possible + - Consolidate NSG rules if possible - If any change on the NSG, (the NSG should always exists) - Call az cloud to CreateOrUpdate on this NSG - return sg, err From 35964d4a80e7f931e51860109fd0741b4c93be53 Mon Sep 17 00:00:00 2001 From: Jingtao Ren Date: Mon, 20 Nov 2017 09:53:34 -0800 Subject: [PATCH 28/33] fix rebase test error --- .../providers/azure/azure_test.go | 20 ++++++------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/pkg/cloudprovider/providers/azure/azure_test.go b/pkg/cloudprovider/providers/azure/azure_test.go index db6fac8d3f..f22182ab1b 100644 --- a/pkg/cloudprovider/providers/azure/azure_test.go +++ b/pkg/cloudprovider/providers/azure/azure_test.go @@ -581,17 +581,13 @@ func findLBRuleForPort(lbRules []network.LoadBalancingRule, port int32) (network func TestServiceDefaultsToNoSessionPersistence(t *testing.T) { az := getTestCloud() svc := getTestService("service-sa-omitted", v1.ProtocolTCP, 7170) - configProperties := getTestPublicFipConfigurationProperties() - lb := getTestLoadBalancer() - nodes := []*v1.Node{} + clusterResources := getClusterResources(az, 1, 1) - lb, _, err := az.reconcileLoadBalancer(lb, &configProperties, testClusterName, &svc, nodes) + lb, err := az.reconcileLoadBalancer(testClusterName, &svc, clusterResources.nodes, true /* wantLb */) if err != nil 
{ t.Errorf("Unexpected error reconciling svc1: %q", err) } - validateLoadBalancer(t, lb, svc) - lbRule, err := findLBRuleForPort(*lb.LoadBalancingRules, 7170) if err != nil { t.Error(err) @@ -606,11 +602,9 @@ func TestServiceRespectsNoSessionAffinity(t *testing.T) { az := getTestCloud() svc := getTestService("service-sa-none", v1.ProtocolTCP, 7170) svc.Spec.SessionAffinity = v1.ServiceAffinityNone - configProperties := getTestPublicFipConfigurationProperties() - lb := getTestLoadBalancer() - nodes := []*v1.Node{} + clusterResources := getClusterResources(az, 1, 1) - lb, _, err := az.reconcileLoadBalancer(lb, &configProperties, testClusterName, &svc, nodes) + lb, err := az.reconcileLoadBalancer(testClusterName, &svc, clusterResources.nodes, true /* wantLb */) if err != nil { t.Errorf("Unexpected error reconciling svc1: %q", err) } @@ -631,11 +625,9 @@ func TestServiceRespectsClientIPSessionAffinity(t *testing.T) { az := getTestCloud() svc := getTestService("service-sa-clientip", v1.ProtocolTCP, 7170) svc.Spec.SessionAffinity = v1.ServiceAffinityClientIP - configProperties := getTestPublicFipConfigurationProperties() - lb := getTestLoadBalancer() - nodes := []*v1.Node{} + clusterResources := getClusterResources(az, 1, 1) - lb, _, err := az.reconcileLoadBalancer(lb, &configProperties, testClusterName, &svc, nodes) + lb, err := az.reconcileLoadBalancer(testClusterName, &svc, clusterResources.nodes, true /* wantLb */) if err != nil { t.Errorf("Unexpected error reconciling svc1: %q", err) } From b16bfc768d83095b3012e11c5f937540847ca3a7 Mon Sep 17 00:00:00 2001 From: "Niklas Q. Nielsen" Date: Tue, 14 Nov 2017 21:06:07 +0000 Subject: [PATCH 29/33] Merging handler into manager API --- pkg/kubelet/cm/container_manager_linux.go | 14 +- pkg/kubelet/cm/deviceplugin/BUILD | 4 +- .../cm/deviceplugin/device_plugin_handler.go | 369 ---------------- .../device_plugin_handler_test.go | 414 ----------------- pkg/kubelet/cm/deviceplugin/endpoint.go | 36 +- pkg/kubelet/cm/deviceplugin/endpoint_test.go | 8 +- pkg/kubelet/cm/deviceplugin/manager.go | 356 +++++++++++++-- ...plugin_handler_stub.go => manager_stub.go} | 23 +- pkg/kubelet/cm/deviceplugin/manager_test.go | 416 +++++++++++++++++- pkg/kubelet/cm/deviceplugin/pod_devices.go | 5 + pkg/kubelet/cm/deviceplugin/types.go | 30 +- 11 files changed, 804 insertions(+), 871 deletions(-) delete mode 100644 pkg/kubelet/cm/deviceplugin/device_plugin_handler.go delete mode 100644 pkg/kubelet/cm/deviceplugin/device_plugin_handler_test.go rename pkg/kubelet/cm/deviceplugin/{device_plugin_handler_stub.go => manager_stub.go} (67%) diff --git a/pkg/kubelet/cm/container_manager_linux.go b/pkg/kubelet/cm/container_manager_linux.go index 2d9f0d4089..870e4900e1 100644 --- a/pkg/kubelet/cm/container_manager_linux.go +++ b/pkg/kubelet/cm/container_manager_linux.go @@ -128,7 +128,7 @@ type containerManagerImpl struct { // Interface for QoS cgroup management qosContainerManager QOSContainerManager // Interface for exporting and allocating devices reported by device plugins. - devicePluginHandler deviceplugin.Handler + devicePluginManager deviceplugin.Manager // Interface for CPU affinity management. 
cpuManager cpumanager.Manager } @@ -274,11 +274,11 @@ func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.I } } - glog.Infof("Creating device plugin handler: %t", devicePluginEnabled) + glog.Infof("Creating device plugin manager: %t", devicePluginEnabled) if devicePluginEnabled { - cm.devicePluginHandler, err = deviceplugin.NewHandlerImpl(updateDeviceCapacityFunc) + cm.devicePluginManager, err = deviceplugin.NewManagerImpl(updateDeviceCapacityFunc) } else { - cm.devicePluginHandler, err = deviceplugin.NewHandlerStub() + cm.devicePluginManager, err = deviceplugin.NewManagerStub() } if err != nil { return nil, err @@ -597,7 +597,7 @@ func (cm *containerManagerImpl) Start(node *v1.Node, }, time.Second, stopChan) // Starts device plugin manager. - if err := cm.devicePluginHandler.Start(deviceplugin.ActivePodsFunc(activePods)); err != nil { + if err := cm.devicePluginManager.Start(deviceplugin.ActivePodsFunc(activePods)); err != nil { return err } return nil @@ -622,7 +622,7 @@ func (cm *containerManagerImpl) GetResources(pod *v1.Pod, container *v1.Containe opts := &kubecontainer.RunContainerOptions{} // Allocate should already be called during predicateAdmitHandler.Admit(), // just try to fetch device runtime information from cached state here - devOpts := cm.devicePluginHandler.GetDeviceRunContainerOptions(pod, container) + devOpts := cm.devicePluginManager.GetDeviceRunContainerOptions(pod, container) if devOpts == nil { return opts, nil } @@ -633,7 +633,7 @@ func (cm *containerManagerImpl) GetResources(pod *v1.Pod, container *v1.Containe } func (cm *containerManagerImpl) UpdatePluginResources(node *schedulercache.NodeInfo, attrs *lifecycle.PodAdmitAttributes) error { - return cm.devicePluginHandler.Allocate(node, attrs) + return cm.devicePluginManager.Allocate(node, attrs) } func (cm *containerManagerImpl) SystemCgroupsLimit() v1.ResourceList { diff --git a/pkg/kubelet/cm/deviceplugin/BUILD b/pkg/kubelet/cm/deviceplugin/BUILD index 1d650b6d89..341b59691a 100644 --- a/pkg/kubelet/cm/deviceplugin/BUILD +++ b/pkg/kubelet/cm/deviceplugin/BUILD @@ -9,11 +9,10 @@ load( go_library( name = "go_default_library", srcs = [ - "device_plugin_handler.go", - "device_plugin_handler_stub.go", "device_plugin_stub.go", "endpoint.go", "manager.go", + "manager_stub.go", "pod_devices.go", "types.go", ], @@ -49,7 +48,6 @@ filegroup( go_test( name = "go_default_test", srcs = [ - "device_plugin_handler_test.go", "endpoint_test.go", "manager_test.go", ], diff --git a/pkg/kubelet/cm/deviceplugin/device_plugin_handler.go b/pkg/kubelet/cm/deviceplugin/device_plugin_handler.go deleted file mode 100644 index 7303e507d5..0000000000 --- a/pkg/kubelet/cm/deviceplugin/device_plugin_handler.go +++ /dev/null @@ -1,369 +0,0 @@ -/* -Copyright 2017 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package deviceplugin - -import ( - "encoding/json" - "fmt" - "io/ioutil" - "os" - "sync" - - "github.com/golang/glog" - - "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/resource" - "k8s.io/apimachinery/pkg/util/sets" - pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1alpha" - "k8s.io/kubernetes/pkg/kubelet/lifecycle" - "k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache" -) - -// ActivePodsFunc is a function that returns a list of pods to reconcile. -type ActivePodsFunc func() []*v1.Pod - -// Handler defines the functions used to manage and access device plugin resources. -type Handler interface { - // Start starts device plugin registration service. - Start(activePods ActivePodsFunc) error - // Devices returns all of registered devices keyed by resourceName. - Devices() map[string][]pluginapi.Device - // Allocate scans through containers in the pod spec - // If it finds the container requires device plugin resource, it: - // 1. Checks whether it already has this information in its cached state. - // 2. If not, it calls Allocate and populate its cached state afterwards. - // 3. If there is no cached state and Allocate fails, it returns an error. - // 4. Otherwise, it updates allocatableResource in nodeInfo if necessary, - // to make sure it is at least equal to the pod's requested capacity for - // any registered device plugin resource - Allocate(node *schedulercache.NodeInfo, attrs *lifecycle.PodAdmitAttributes) error - // GetDeviceRunContainerOptions checks whether we have cached containerDevices - // for the passed-in and returns its DeviceRunContainerOptions - // for the found one. An empty struct is returned in case no cached state is found. - GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Container) *DeviceRunContainerOptions -} - -// HandlerImpl implements the actual functionality to manage device plugin resources. -type HandlerImpl struct { - // TODO: consider to change this to RWMutex. - sync.Mutex - // devicePluginManager is an implementation of deviceplugin.Manager interface. - devicePluginManager Manager - // activePods is a method for listing active pods on the node - // so the amount of pluginResources requested by existing pods - // could be counted when updating allocated devices - activePods ActivePodsFunc - // devicePluginManagerMonitorCallback is used for updating devices' states in one time call. - // e.g. a new device is advertised, two old devices are deleted and a running device fails. - devicePluginManagerMonitorCallback MonitorCallback - // allDevices contains all of registered resourceNames and their exported device IDs. - allDevices map[string]sets.String - // allocatedDevices contains allocated deviceIds, keyed by resourceName. - allocatedDevices map[string]sets.String - // podDevices contains pod to allocated device mapping. - podDevices podDevices -} - -// NewHandlerImpl creates a HandlerImpl to manage device plugin resources. -// updateCapacityFunc is called to update ContainerManager capacity when -// device capacity changes. -func NewHandlerImpl(updateCapacityFunc func(v1.ResourceList)) (*HandlerImpl, error) { - glog.V(2).Infof("Creating Device Plugin Handler") - handler := &HandlerImpl{ - allDevices: make(map[string]sets.String), - allocatedDevices: make(map[string]sets.String), - podDevices: make(podDevices), - } - - deviceManagerMonitorCallback := func(resourceName string, added, updated, deleted []pluginapi.Device) { - var capacity = v1.ResourceList{} - kept := append(updated, added...) 
- - handler.Lock() - defer handler.Unlock() - - if _, ok := handler.allDevices[resourceName]; !ok { - handler.allDevices[resourceName] = sets.NewString() - } - // For now, Handler only keeps track of healthy devices. - // We can revisit this later when the need comes to track unhealthy devices here. - for _, dev := range kept { - if dev.Health == pluginapi.Healthy { - handler.allDevices[resourceName].Insert(dev.ID) - } else { - handler.allDevices[resourceName].Delete(dev.ID) - } - } - for _, dev := range deleted { - handler.allDevices[resourceName].Delete(dev.ID) - } - capacity[v1.ResourceName(resourceName)] = *resource.NewQuantity(int64(handler.allDevices[resourceName].Len()), resource.DecimalSI) - updateCapacityFunc(capacity) - } - - mgr, err := NewManagerImpl(pluginapi.KubeletSocket, deviceManagerMonitorCallback) - if err != nil { - return nil, fmt.Errorf("Failed to initialize device plugin manager: %+v", err) - } - - handler.devicePluginManager = mgr - handler.devicePluginManagerMonitorCallback = deviceManagerMonitorCallback - - return handler, nil -} - -// Start initializes podDevices and allocatedDevices information from checkpoint-ed state -// and starts device plugin registration service. -func (h *HandlerImpl) Start(activePods ActivePodsFunc) error { - h.activePods = activePods - - // Loads in allocatedDevices information from disk. - err := h.readCheckpoint() - if err != nil { - glog.Warningf("Continue after failing to read checkpoint file. Device allocation info may NOT be up-to-date. Err: %v", err) - } - - return h.devicePluginManager.Start() -} - -// Devices returns all of registered devices keyed by resourceName. -func (h *HandlerImpl) Devices() map[string][]pluginapi.Device { - return h.devicePluginManager.Devices() -} - -// Returns list of device Ids we need to allocate with Allocate rpc call. -// Returns empty list in case we don't need to issue the Allocate rpc call. -func (h *HandlerImpl) devicesToAllocate(podUID, contName, resource string, required int) (sets.String, error) { - h.Lock() - defer h.Unlock() - needed := required - // Gets list of devices that have already been allocated. - // This can happen if a container restarts for example. - devices := h.podDevices.containerDevices(podUID, contName, resource) - if devices != nil { - glog.V(3).Infof("Found pre-allocated devices for resource %s container %q in Pod %q: %v", resource, contName, podUID, devices.List()) - needed = needed - devices.Len() - // A pod's resource is not expected to change once admitted by the API server, - // so just fail loudly here. We can revisit this part if this no longer holds. - if needed != 0 { - return nil, fmt.Errorf("pod %v container %v changed request for resource %v from %v to %v", podUID, contName, resource, devices.Len(), required) - } - } - if needed == 0 { - // No change, no work. - return nil, nil - } - devices = sets.NewString() - // Needs to allocate additional devices. - if h.allocatedDevices[resource] == nil { - h.allocatedDevices[resource] = sets.NewString() - } - // Gets Devices in use. - devicesInUse := h.allocatedDevices[resource] - // Gets a list of available devices. - available := h.allDevices[resource].Difference(devicesInUse) - if int(available.Len()) < needed { - return nil, fmt.Errorf("requested number of devices unavailable for %s. 
Requested: %d, Available: %d", resource, needed, available.Len()) - } - allocated := available.UnsortedList()[:needed] - // Updates h.allocatedDevices with allocated devices to prevent them - // from being allocated to other pods/containers, given that we are - // not holding lock during the rpc call. - for _, device := range allocated { - h.allocatedDevices[resource].Insert(device) - devices.Insert(device) - } - return devices, nil -} - -// allocateContainerResources attempts to allocate all of required device -// plugin resources for the input container, issues an Allocate rpc request -// for each new device resource requirement, processes their AllocateResponses, -// and updates the cached containerDevices on success. -func (h *HandlerImpl) allocateContainerResources(pod *v1.Pod, container *v1.Container) error { - podUID := string(pod.UID) - contName := container.Name - allocatedDevicesUpdated := false - for k, v := range container.Resources.Limits { - resource := string(k) - needed := int(v.Value()) - glog.V(3).Infof("needs %d %s", needed, resource) - if _, registeredResource := h.allDevices[resource]; !registeredResource { - continue - } - // Updates allocatedDevices to garbage collect any stranded resources - // before doing the device plugin allocation. - if !allocatedDevicesUpdated { - h.updateAllocatedDevices(h.activePods()) - allocatedDevicesUpdated = true - } - allocDevices, err := h.devicesToAllocate(podUID, contName, resource, needed) - if err != nil { - return err - } - if allocDevices == nil || len(allocDevices) <= 0 { - continue - } - // devicePluginManager.Allocate involves RPC calls to device plugin, which - // could be heavy-weight. Therefore we want to perform this operation outside - // mutex lock. Note if Allcate call fails, we may leave container resources - // partially allocated for the failed container. We rely on updateAllocatedDevices() - // to garbage collect these resources later. Another side effect is that if - // we have X resource A and Y resource B in total, and two containers, container1 - // and container2 both require X resource A and Y resource B. Both allocation - // requests may fail if we serve them in mixed order. - // TODO: may revisit this part later if we see inefficient resource allocation - // in real use as the result of this. Should also consider to parallize device - // plugin Allocate grpc calls if it becomes common that a container may require - // resources from multiple device plugins. - resp, err := h.devicePluginManager.Allocate(resource, allocDevices.UnsortedList()) - if err != nil { - // In case of allocation failure, we want to restore h.allocatedDevices - // to the actual allocated state from h.podDevices. - h.Lock() - h.allocatedDevices = h.podDevices.devices() - h.Unlock() - return err - } - - // Update internal cached podDevices state. - h.Lock() - h.podDevices.insert(podUID, contName, resource, allocDevices, resp) - h.Unlock() - } - - // Checkpoints device to container allocation information. - return h.writeCheckpoint() -} - -// Allocate attempts to allocate all of required device plugin resources, -// and update Allocatable resources in nodeInfo if necessary -func (h *HandlerImpl) Allocate(node *schedulercache.NodeInfo, attrs *lifecycle.PodAdmitAttributes) error { - pod := attrs.Pod - // TODO: Reuse devices between init containers and regular containers. 
- for _, container := range pod.Spec.InitContainers { - if err := h.allocateContainerResources(pod, &container); err != nil { - return err - } - } - for _, container := range pod.Spec.Containers { - if err := h.allocateContainerResources(pod, &container); err != nil { - return err - } - } - - // quick return if no pluginResources requested - if _, podRequireDevicePluginResource := h.podDevices[string(pod.UID)]; !podRequireDevicePluginResource { - return nil - } - - h.sanitizeNodeAllocatable(node) - - return nil -} - -// sanitizeNodeAllocatable scans through allocatedDevices in DevicePluginHandler -// and if necessary, updates allocatableResource in nodeInfo to at least equal to -// the allocated capacity. This allows pods that have already been scheduled on -// the node to pass GeneralPredicates admission checking even upon device plugin failure. -func (h *HandlerImpl) sanitizeNodeAllocatable(node *schedulercache.NodeInfo) { - var newAllocatableResource *schedulercache.Resource - allocatableResource := node.AllocatableResource() - if allocatableResource.ScalarResources == nil { - allocatableResource.ScalarResources = make(map[v1.ResourceName]int64) - } - for resource, devices := range h.allocatedDevices { - needed := devices.Len() - quant, ok := allocatableResource.ScalarResources[v1.ResourceName(resource)] - if ok && int(quant) >= needed { - continue - } - // Needs to update nodeInfo.AllocatableResource to make sure - // NodeInfo.allocatableResource at least equal to the capacity already allocated. - if newAllocatableResource == nil { - newAllocatableResource = allocatableResource.Clone() - } - newAllocatableResource.ScalarResources[v1.ResourceName(resource)] = int64(needed) - } - if newAllocatableResource != nil { - node.SetAllocatableResource(newAllocatableResource) - } -} - -// GetDeviceRunContainerOptions checks whether we have cached containerDevices -// for the passed-in and returns its DeviceRunContainerOptions -// for the found one. An empty struct is returned in case no cached state is found. -func (h *HandlerImpl) GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Container) *DeviceRunContainerOptions { - h.Lock() - defer h.Unlock() - return h.podDevices.deviceRunContainerOptions(string(pod.UID), container.Name) -} - -// updateAllocatedDevices gets a list of active pods and then frees any Devices that are bound to -// terminated pods. Returns error on failure. -func (h *HandlerImpl) updateAllocatedDevices(activePods []*v1.Pod) { - h.Lock() - defer h.Unlock() - activePodUids := sets.NewString() - for _, pod := range activePods { - activePodUids.Insert(string(pod.UID)) - } - allocatedPodUids := h.podDevices.pods() - podsToBeRemoved := allocatedPodUids.Difference(activePodUids) - if len(podsToBeRemoved) <= 0 { - return - } - glog.V(5).Infof("pods to be removed: %v", podsToBeRemoved.List()) - h.podDevices.delete(podsToBeRemoved.List()) - // Regenerated allocatedDevices after we update pod allocation information. - h.allocatedDevices = h.podDevices.devices() -} - -// Checkpoints device to container allocation information to disk. -func (h *HandlerImpl) writeCheckpoint() error { - h.Lock() - data := h.podDevices.toCheckpointData() - h.Unlock() - - dataJSON, err := json.Marshal(data) - if err != nil { - return err - } - filepath := h.devicePluginManager.CheckpointFile() - return ioutil.WriteFile(filepath, dataJSON, 0644) -} - -// Reads device to container allocation information from disk, and populates -// h.allocatedDevices accordingly. 
-func (h *HandlerImpl) readCheckpoint() error { - filepath := h.devicePluginManager.CheckpointFile() - content, err := ioutil.ReadFile(filepath) - if err != nil && !os.IsNotExist(err) { - return fmt.Errorf("failed to read checkpoint file %q: %v", filepath, err) - } - glog.V(2).Infof("Read checkpoint file %s\n", filepath) - var data checkpointData - if err := json.Unmarshal(content, &data); err != nil { - return fmt.Errorf("failed to unmarshal checkpoint data: %v", err) - } - - h.Lock() - defer h.Unlock() - h.podDevices.fromCheckpointData(data) - h.allocatedDevices = h.podDevices.devices() - return nil -} diff --git a/pkg/kubelet/cm/deviceplugin/device_plugin_handler_test.go b/pkg/kubelet/cm/deviceplugin/device_plugin_handler_test.go deleted file mode 100644 index 252968c380..0000000000 --- a/pkg/kubelet/cm/deviceplugin/device_plugin_handler_test.go +++ /dev/null @@ -1,414 +0,0 @@ -/* -Copyright 2017 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package deviceplugin - -import ( - "flag" - "fmt" - "reflect" - "testing" - - "github.com/stretchr/testify/assert" - - "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/resource" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/util/sets" - "k8s.io/apimachinery/pkg/util/uuid" - pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1alpha" - "k8s.io/kubernetes/pkg/kubelet/lifecycle" - "k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache" -) - -func TestUpdateCapacity(t *testing.T) { - var expected = v1.ResourceList{} - as := assert.New(t) - verifyCapacityFunc := func(updates v1.ResourceList) { - as.Equal(expected, updates) - } - testHandler, err := NewHandlerImpl(verifyCapacityFunc) - as.NotNil(testHandler) - as.Nil(err) - - devs := []pluginapi.Device{ - {ID: "Device1", Health: pluginapi.Healthy}, - {ID: "Device2", Health: pluginapi.Healthy}, - {ID: "Device3", Health: pluginapi.Unhealthy}, - } - - resourceName := "resource1" - // Adds three devices for resource1, two healthy and one unhealthy. - // Expects capacity for resource1 to be 2. - expected[v1.ResourceName(resourceName)] = *resource.NewQuantity(int64(2), resource.DecimalSI) - testHandler.devicePluginManagerMonitorCallback(resourceName, devs, []pluginapi.Device{}, []pluginapi.Device{}) - // Deletes an unhealthy device should NOT change capacity. - testHandler.devicePluginManagerMonitorCallback(resourceName, []pluginapi.Device{}, []pluginapi.Device{}, []pluginapi.Device{devs[2]}) - // Updates a healthy device to unhealthy should reduce capacity by 1. - expected[v1.ResourceName(resourceName)] = *resource.NewQuantity(int64(1), resource.DecimalSI) - // Deletes a healthy device should reduce capacity by 1. - expected[v1.ResourceName(resourceName)] = *resource.NewQuantity(int64(0), resource.DecimalSI) - // Tests adding another resource. 
- delete(expected, v1.ResourceName(resourceName)) - resourceName2 := "resource2" - expected[v1.ResourceName(resourceName2)] = *resource.NewQuantity(int64(2), resource.DecimalSI) - testHandler.devicePluginManagerMonitorCallback(resourceName2, devs, []pluginapi.Device{}, []pluginapi.Device{}) -} - -type stringPairType struct { - value1 string - value2 string -} - -// DevicePluginManager stub to test device Allocation behavior. -type DevicePluginManagerTestStub struct { - // All data structs are keyed by resourceName+DevId - devRuntimeDevices map[string][]stringPairType - devRuntimeMounts map[string][]stringPairType - devRuntimeEnvs map[string][]stringPairType -} - -func NewDevicePluginManagerTestStub() (*DevicePluginManagerTestStub, error) { - return &DevicePluginManagerTestStub{ - devRuntimeDevices: make(map[string][]stringPairType), - devRuntimeMounts: make(map[string][]stringPairType), - devRuntimeEnvs: make(map[string][]stringPairType), - }, nil -} - -func (m *DevicePluginManagerTestStub) Start() error { - return nil -} - -func (m *DevicePluginManagerTestStub) Devices() map[string][]pluginapi.Device { - return make(map[string][]pluginapi.Device) -} - -func (m *DevicePluginManagerTestStub) Allocate(resourceName string, devIds []string) (*pluginapi.AllocateResponse, error) { - resp := new(pluginapi.AllocateResponse) - resp.Envs = make(map[string]string) - for _, id := range devIds { - key := resourceName + id - fmt.Printf("Alloc device %v for resource %v\n", id, resourceName) - for _, dev := range m.devRuntimeDevices[key] { - fmt.Printf("Add dev %v %v\n", dev.value1, dev.value2) - resp.Devices = append(resp.Devices, &pluginapi.DeviceSpec{ - ContainerPath: dev.value1, - HostPath: dev.value2, - Permissions: "mrw", - }) - } - for _, mount := range m.devRuntimeMounts[key] { - fmt.Printf("Add mount %v %v\n", mount.value1, mount.value2) - resp.Mounts = append(resp.Mounts, &pluginapi.Mount{ - ContainerPath: mount.value1, - HostPath: mount.value2, - ReadOnly: true, - }) - } - for _, env := range m.devRuntimeEnvs[key] { - fmt.Printf("Add env %v %v\n", env.value1, env.value2) - resp.Envs[env.value1] = env.value2 - } - } - return resp, nil -} - -func (m *DevicePluginManagerTestStub) Stop() error { - return nil -} - -func (m *DevicePluginManagerTestStub) CheckpointFile() string { - return "/tmp/device-plugin-checkpoint" -} - -func constructDevices(devices []string) sets.String { - ret := sets.NewString() - for _, dev := range devices { - ret.Insert(dev) - } - return ret -} - -func constructAllocResp(devices, mounts, envs map[string]string) *pluginapi.AllocateResponse { - resp := &pluginapi.AllocateResponse{} - for k, v := range devices { - resp.Devices = append(resp.Devices, &pluginapi.DeviceSpec{ - HostPath: k, - ContainerPath: v, - Permissions: "mrw", - }) - } - for k, v := range mounts { - resp.Mounts = append(resp.Mounts, &pluginapi.Mount{ - ContainerPath: k, - HostPath: v, - ReadOnly: true, - }) - } - resp.Envs = make(map[string]string) - for k, v := range envs { - resp.Envs[k] = v - } - return resp -} - -func TestCheckpoint(t *testing.T) { - resourceName1 := "domain1.com/resource1" - resourceName2 := "domain2.com/resource2" - - m, err := NewDevicePluginManagerTestStub() - as := assert.New(t) - as.Nil(err) - - testHandler := &HandlerImpl{ - devicePluginManager: m, - allDevices: make(map[string]sets.String), - allocatedDevices: make(map[string]sets.String), - podDevices: make(podDevices), - } - - testHandler.podDevices.insert("pod1", "con1", resourceName1, - constructDevices([]string{"dev1", 
"dev2"}), - constructAllocResp(map[string]string{"/dev/r1dev1": "/dev/r1dev1", "/dev/r1dev2": "/dev/r1dev2"}, - map[string]string{"/home/r1lib1": "/usr/r1lib1"}, map[string]string{})) - testHandler.podDevices.insert("pod1", "con1", resourceName2, - constructDevices([]string{"dev1", "dev2"}), - constructAllocResp(map[string]string{"/dev/r2dev1": "/dev/r2dev1", "/dev/r2dev2": "/dev/r2dev2"}, - map[string]string{"/home/r2lib1": "/usr/r2lib1"}, - map[string]string{"r2devices": "dev1 dev2"})) - testHandler.podDevices.insert("pod1", "con2", resourceName1, - constructDevices([]string{"dev3"}), - constructAllocResp(map[string]string{"/dev/r1dev3": "/dev/r1dev3"}, - map[string]string{"/home/r1lib1": "/usr/r1lib1"}, map[string]string{})) - testHandler.podDevices.insert("pod2", "con1", resourceName1, - constructDevices([]string{"dev4"}), - constructAllocResp(map[string]string{"/dev/r1dev4": "/dev/r1dev4"}, - map[string]string{"/home/r1lib1": "/usr/r1lib1"}, map[string]string{})) - - expectedPodDevices := testHandler.podDevices - expectedAllocatedDevices := testHandler.podDevices.devices() - - err = testHandler.writeCheckpoint() - as.Nil(err) - testHandler.podDevices = make(podDevices) - err = testHandler.readCheckpoint() - as.Nil(err) - - as.Equal(len(expectedPodDevices), len(testHandler.podDevices)) - for podUID, containerDevices := range expectedPodDevices { - for conName, resources := range containerDevices { - for resource := range resources { - as.True(reflect.DeepEqual( - expectedPodDevices.containerDevices(podUID, conName, resource), - testHandler.podDevices.containerDevices(podUID, conName, resource))) - opts1 := expectedPodDevices.deviceRunContainerOptions(podUID, conName) - opts2 := testHandler.podDevices.deviceRunContainerOptions(podUID, conName) - as.Equal(len(opts1.Envs), len(opts2.Envs)) - as.Equal(len(opts1.Mounts), len(opts2.Mounts)) - as.Equal(len(opts1.Devices), len(opts2.Devices)) - } - } - } - as.True(reflect.DeepEqual(expectedAllocatedDevices, testHandler.allocatedDevices)) -} - -type activePodsStub struct { - activePods []*v1.Pod -} - -func (a *activePodsStub) getActivePods() []*v1.Pod { - return a.activePods -} - -func (a *activePodsStub) updateActivePods(newPods []*v1.Pod) { - a.activePods = newPods -} - -func TestPodContainerDeviceAllocation(t *testing.T) { - flag.Set("alsologtostderr", fmt.Sprintf("%t", true)) - var logLevel string - flag.StringVar(&logLevel, "logLevel", "4", "test") - flag.Lookup("v").Value.Set(logLevel) - - resourceName1 := "domain1.com/resource1" - resourceQuantity1 := *resource.NewQuantity(int64(2), resource.DecimalSI) - devID1 := "dev1" - devID2 := "dev2" - resourceName2 := "domain2.com/resource2" - resourceQuantity2 := *resource.NewQuantity(int64(1), resource.DecimalSI) - devID3 := "dev3" - devID4 := "dev4" - - m, err := NewDevicePluginManagerTestStub() - as := assert.New(t) - as.Nil(err) - monitorCallback := func(resourceName string, added, updated, deleted []pluginapi.Device) {} - podsStub := activePodsStub{ - activePods: []*v1.Pod{}, - } - cachedNode := &v1.Node{ - Status: v1.NodeStatus{ - Allocatable: v1.ResourceList{}, - }, - } - nodeInfo := &schedulercache.NodeInfo{} - nodeInfo.SetNode(cachedNode) - - testHandler := &HandlerImpl{ - devicePluginManager: m, - devicePluginManagerMonitorCallback: monitorCallback, - allDevices: make(map[string]sets.String), - allocatedDevices: make(map[string]sets.String), - podDevices: make(podDevices), - activePods: podsStub.getActivePods, - } - testHandler.allDevices[resourceName1] = sets.NewString() - 
testHandler.allDevices[resourceName1].Insert(devID1) - testHandler.allDevices[resourceName1].Insert(devID2) - testHandler.allDevices[resourceName2] = sets.NewString() - testHandler.allDevices[resourceName2].Insert(devID3) - testHandler.allDevices[resourceName2].Insert(devID4) - - m.devRuntimeDevices[resourceName1+devID1] = append(m.devRuntimeDevices[resourceName1+devID1], stringPairType{"/dev/aaa", "/dev/aaa"}) - m.devRuntimeDevices[resourceName1+devID1] = append(m.devRuntimeDevices[resourceName1+devID1], stringPairType{"/dev/bbb", "/dev/bbb"}) - m.devRuntimeDevices[resourceName1+devID2] = append(m.devRuntimeDevices[resourceName1+devID2], stringPairType{"/dev/ccc", "/dev/ccc"}) - m.devRuntimeMounts[resourceName1+devID1] = append(m.devRuntimeMounts[resourceName1+devID1], stringPairType{"/container_dir1/file1", "host_dir1/file1"}) - m.devRuntimeMounts[resourceName1+devID2] = append(m.devRuntimeMounts[resourceName1+devID2], stringPairType{"/container_dir1/file2", "host_dir1/file2"}) - m.devRuntimeEnvs[resourceName1+devID2] = append(m.devRuntimeEnvs[resourceName1+devID2], stringPairType{"key1", "val1"}) - m.devRuntimeEnvs[resourceName2+devID3] = append(m.devRuntimeEnvs[resourceName2+devID3], stringPairType{"key2", "val2"}) - m.devRuntimeEnvs[resourceName2+devID4] = append(m.devRuntimeEnvs[resourceName2+devID4], stringPairType{"key2", "val3"}) - - pod := &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - UID: uuid.NewUUID(), - }, - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: string(uuid.NewUUID()), - Resources: v1.ResourceRequirements{ - Limits: v1.ResourceList{ - v1.ResourceName(resourceName1): resourceQuantity1, - v1.ResourceName("cpu"): resourceQuantity1, - v1.ResourceName(resourceName2): resourceQuantity2, - }, - }, - }, - }, - }, - } - - podsStub.updateActivePods([]*v1.Pod{pod}) - err = testHandler.Allocate(nodeInfo, &lifecycle.PodAdmitAttributes{Pod: pod}) - as.Nil(err) - runContainerOpts := testHandler.GetDeviceRunContainerOptions(pod, &pod.Spec.Containers[0]) - as.Equal(len(runContainerOpts.Devices), 3) - as.Equal(len(runContainerOpts.Mounts), 2) - as.Equal(len(runContainerOpts.Envs), 2) - - // Requesting to create a pod without enough resources should fail. - as.Equal(2, testHandler.allocatedDevices[resourceName1].Len()) - failPod := &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - UID: uuid.NewUUID(), - }, - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: string(uuid.NewUUID()), - Resources: v1.ResourceRequirements{ - Limits: v1.ResourceList{ - v1.ResourceName(resourceName1): resourceQuantity2, - }, - }, - }, - }, - }, - } - err = testHandler.Allocate(nodeInfo, &lifecycle.PodAdmitAttributes{Pod: failPod}) - as.NotNil(err) - runContainerOpts2 := testHandler.GetDeviceRunContainerOptions(failPod, &failPod.Spec.Containers[0]) - as.Nil(runContainerOpts2) - - // Requesting to create a new pod with a single resourceName2 should succeed. 
- newPod := &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - UID: uuid.NewUUID(), - }, - Spec: v1.PodSpec{ - Containers: []v1.Container{ - { - Name: string(uuid.NewUUID()), - Resources: v1.ResourceRequirements{ - Limits: v1.ResourceList{ - v1.ResourceName(resourceName2): resourceQuantity2, - }, - }, - }, - }, - }, - } - err = testHandler.Allocate(nodeInfo, &lifecycle.PodAdmitAttributes{Pod: newPod}) - as.Nil(err) - runContainerOpts3 := testHandler.GetDeviceRunContainerOptions(newPod, &newPod.Spec.Containers[0]) - as.Equal(1, len(runContainerOpts3.Envs)) -} - -func TestSanitizeNodeAllocatable(t *testing.T) { - resourceName1 := "domain1.com/resource1" - devID1 := "dev1" - - resourceName2 := "domain2.com/resource2" - devID2 := "dev2" - - m, err := NewDevicePluginManagerTestStub() - as := assert.New(t) - as.Nil(err) - monitorCallback := func(resourceName string, added, updated, deleted []pluginapi.Device) {} - - testHandler := &HandlerImpl{ - devicePluginManager: m, - devicePluginManagerMonitorCallback: monitorCallback, - allDevices: make(map[string]sets.String), - allocatedDevices: make(map[string]sets.String), - podDevices: make(podDevices), - } - // require one of resource1 and one of resource2 - testHandler.allocatedDevices[resourceName1] = sets.NewString() - testHandler.allocatedDevices[resourceName1].Insert(devID1) - testHandler.allocatedDevices[resourceName2] = sets.NewString() - testHandler.allocatedDevices[resourceName2].Insert(devID2) - - cachedNode := &v1.Node{ - Status: v1.NodeStatus{ - Allocatable: v1.ResourceList{ - // has no resource1 and two of resource2 - v1.ResourceName(resourceName2): *resource.NewQuantity(int64(2), resource.DecimalSI), - }, - }, - } - nodeInfo := &schedulercache.NodeInfo{} - nodeInfo.SetNode(cachedNode) - - testHandler.sanitizeNodeAllocatable(nodeInfo) - - allocatableScalarResources := nodeInfo.AllocatableResource().ScalarResources - // allocatable in nodeInfo is less than needed, should update - as.Equal(1, int(allocatableScalarResources[v1.ResourceName(resourceName1)])) - // allocatable in nodeInfo is more than needed, should skip updating - as.Equal(2, int(allocatableScalarResources[v1.ResourceName(resourceName2)])) -} diff --git a/pkg/kubelet/cm/deviceplugin/endpoint.go b/pkg/kubelet/cm/deviceplugin/endpoint.go index 44898dc023..29feaf5285 100644 --- a/pkg/kubelet/cm/deviceplugin/endpoint.go +++ b/pkg/kubelet/cm/deviceplugin/endpoint.go @@ -32,7 +32,15 @@ import ( // endpoint maps to a single registered device plugin. It is responsible // for managing gRPC communications with the device plugin and caching // device states reported by the device plugin. -type endpoint struct { +type endpoint interface { + run() + stop() + allocate(devs []string) (*pluginapi.AllocateResponse, error) + getDevices() []pluginapi.Device + callback(resourceName string, added, updated, deleted []pluginapi.Device) +} + +type endpointImpl struct { client pluginapi.DevicePluginClient clientConn *grpc.ClientConn @@ -42,30 +50,34 @@ type endpoint struct { devices map[string]pluginapi.Device mutex sync.Mutex - callback MonitorCallback + cb monitorCallback } // newEndpoint creates a new endpoint for the given resourceName. 
-func newEndpoint(socketPath, resourceName string, devices map[string]pluginapi.Device, callback MonitorCallback) (*endpoint, error) { +func newEndpointImpl(socketPath, resourceName string, devices map[string]pluginapi.Device, callback monitorCallback) (*endpointImpl, error) { client, c, err := dial(socketPath) if err != nil { glog.Errorf("Can't create new endpoint with path %s err %v", socketPath, err) return nil, err } - return &endpoint{ + return &endpointImpl{ client: client, clientConn: c, socketPath: socketPath, resourceName: resourceName, - devices: devices, - callback: callback, + devices: devices, + cb: callback, }, nil } -func (e *endpoint) getDevices() []pluginapi.Device { +func (e *endpointImpl) callback(resourceName string, added, updated, deleted []pluginapi.Device) { + e.cb(resourceName, added, updated, deleted) +} + +func (e *endpointImpl) getDevices() []pluginapi.Device { e.mutex.Lock() defer e.mutex.Unlock() var devs []pluginapi.Device @@ -81,11 +93,9 @@ func (e *endpoint) getDevices() []pluginapi.Device { // blocks on receiving ListAndWatch gRPC stream updates. Each ListAndWatch // stream update contains a new list of device states. listAndWatch compares the new // device states with its cached states to get list of new, updated, and deleted devices. -// It then issues a callback to pass this information to the device_plugin_handler which +// It then issues a callback to pass this information to the device manager which // will adjust the resource available information accordingly. -func (e *endpoint) run() { - glog.V(3).Infof("Starting ListAndWatch") - +func (e *endpointImpl) run() { stream, err := e.client.ListAndWatch(context.Background(), &pluginapi.Empty{}) if err != nil { glog.Errorf(errListAndWatch, e.resourceName, err) @@ -162,13 +172,13 @@ func (e *endpoint) run() { } // allocate issues Allocate gRPC call to the device plugin. 
-func (e *endpoint) allocate(devs []string) (*pluginapi.AllocateResponse, error) { +func (e *endpointImpl) allocate(devs []string) (*pluginapi.AllocateResponse, error) { return e.client.Allocate(context.Background(), &pluginapi.AllocateRequest{ DevicesIDs: devs, }) } -func (e *endpoint) stop() { +func (e *endpointImpl) stop() { e.clientConn.Close() } diff --git a/pkg/kubelet/cm/deviceplugin/endpoint_test.go b/pkg/kubelet/cm/deviceplugin/endpoint_test.go index cb27c89f31..226148a6b0 100644 --- a/pkg/kubelet/cm/deviceplugin/endpoint_test.go +++ b/pkg/kubelet/cm/deviceplugin/endpoint_test.go @@ -87,7 +87,7 @@ func TestRun(t *testing.T) { } func TestGetDevices(t *testing.T) { - e := endpoint{ + e := endpointImpl{ devices: map[string]pluginapi.Device{ "ADeviceId": {ID: "ADeviceId", Health: pluginapi.Healthy}, }, @@ -96,19 +96,19 @@ func TestGetDevices(t *testing.T) { require.Len(t, devs, 1) } -func esetup(t *testing.T, devs []*pluginapi.Device, socket, resourceName string, callback MonitorCallback) (*Stub, *endpoint) { +func esetup(t *testing.T, devs []*pluginapi.Device, socket, resourceName string, callback monitorCallback) (*Stub, *endpointImpl) { p := NewDevicePluginStub(devs, socket) err := p.Start() require.NoError(t, err) - e, err := newEndpoint(socket, "mock", make(map[string]pluginapi.Device), func(n string, a, u, r []pluginapi.Device) {}) + e, err := newEndpointImpl(socket, "mock", make(map[string]pluginapi.Device), func(n string, a, u, r []pluginapi.Device) {}) require.NoError(t, err) return p, e } -func ecleanup(t *testing.T, p *Stub, e *endpoint) { +func ecleanup(t *testing.T, p *Stub, e *endpointImpl) { p.Stop() e.stop() } diff --git a/pkg/kubelet/cm/deviceplugin/manager.go b/pkg/kubelet/cm/deviceplugin/manager.go index 2b2c0a333f..be46973ef7 100644 --- a/pkg/kubelet/cm/deviceplugin/manager.go +++ b/pkg/kubelet/cm/deviceplugin/manager.go @@ -17,7 +17,9 @@ limitations under the License. package deviceplugin import ( + "encoding/json" "fmt" + "io/ioutil" "net" "os" "path/filepath" @@ -28,27 +30,58 @@ import ( "google.golang.org/grpc" "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + "k8s.io/apimachinery/pkg/util/sets" v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper" pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1alpha" + "k8s.io/kubernetes/pkg/kubelet/lifecycle" + "k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache" ) +// ActivePodsFunc is a function that returns a list of pods to reconcile. +type ActivePodsFunc func() []*v1.Pod + +// monitorCallback is the function called when a device's health state changes, +// or new devices are reported, or old devices are deleted. +// Updated contains the most recent state of the Device. +type monitorCallback func(resourceName string, added, updated, deleted []pluginapi.Device) + // ManagerImpl is the structure in charge of managing Device Plugins. type ManagerImpl struct { socketname string socketdir string - endpoints map[string]*endpoint // Key is ResourceName + endpoints map[string]endpoint // Key is ResourceName mutex sync.Mutex - callback MonitorCallback - server *grpc.Server + + // activePods is a method for listing active pods on the node + // so the amount of pluginResources requested by existing pods + // could be counted when updating allocated devices + activePods ActivePodsFunc + + // callback is used for updating devices' states in one time call. + // e.g. a new device is advertised, two old devices are deleted and a running device fails. 
+ callback monitorCallback + + // allDevices contains all of registered resourceNames and their exported device IDs. + allDevices map[string]sets.String + + // allocatedDevices contains allocated deviceIds, keyed by resourceName. + allocatedDevices map[string]sets.String + + // podDevices contains pod to allocated device mapping. + podDevices podDevices } -// NewManagerImpl creates a new manager on the socket `socketPath`. -// f is the callback that is called when a device becomes unhealthy. -// socketPath is present for testing purposes in production this is pluginapi.KubeletSocket -func NewManagerImpl(socketPath string, f MonitorCallback) (*ManagerImpl, error) { +// NewManagerImpl creates a new manager. updateCapacityFunc is called to +// update ContainerManager capacity when device capacity changes. +func NewManagerImpl(updateCapacityFunc func(v1.ResourceList)) (*ManagerImpl, error) { + return newManagerImpl(updateCapacityFunc, pluginapi.KubeletSocket) +} + +func newManagerImpl(updateCapacityFunc func(v1.ResourceList), socketPath string) (*ManagerImpl, error) { glog.V(2).Infof("Creating Device Plugin manager at %s", socketPath) if socketPath == "" || !filepath.IsAbs(socketPath) { @@ -56,13 +89,42 @@ func NewManagerImpl(socketPath string, f MonitorCallback) (*ManagerImpl, error) } dir, file := filepath.Split(socketPath) - return &ManagerImpl{ - endpoints: make(map[string]*endpoint), + manager := &ManagerImpl{ + endpoints: make(map[string]endpoint), + socketname: file, + socketdir: dir, + allDevices: make(map[string]sets.String), + allocatedDevices: make(map[string]sets.String), + podDevices: make(podDevices), + } - socketname: file, - socketdir: dir, - callback: f, - }, nil + manager.callback = func(resourceName string, added, updated, deleted []pluginapi.Device) { + var capacity = v1.ResourceList{} + kept := append(updated, added...) + + manager.mutex.Lock() + defer manager.mutex.Unlock() + + if _, ok := manager.allDevices[resourceName]; !ok { + manager.allDevices[resourceName] = sets.NewString() + } + // For now, Manager only keeps track of healthy devices. + // We can revisit this later when the need comes to track unhealthy devices here. + for _, dev := range kept { + if dev.Health == pluginapi.Healthy { + manager.allDevices[resourceName].Insert(dev.ID) + } else { + manager.allDevices[resourceName].Delete(dev.ID) + } + } + for _, dev := range deleted { + manager.allDevices[resourceName].Delete(dev.ID) + } + capacity[v1.ResourceName(resourceName)] = *resource.NewQuantity(int64(manager.allDevices[resourceName].Len()), resource.DecimalSI) + updateCapacityFunc(capacity) + } + + return manager, nil } func (m *ManagerImpl) removeContents(dir string) error { @@ -77,7 +139,7 @@ func (m *ManagerImpl) removeContents(dir string) error { } for _, name := range names { filePath := filepath.Join(dir, name) - if filePath == m.CheckpointFile() { + if filePath == m.checkpointFile() { continue } stat, err := os.Stat(filePath) @@ -101,15 +163,25 @@ const ( kubeletDevicePluginCheckpoint = "kubelet_internal_checkpoint" ) -// CheckpointFile returns device plugin checkpoint file path. -func (m *ManagerImpl) CheckpointFile() string { +// checkpointFile returns device plugin checkpoint file path. 
+func (m *ManagerImpl) checkpointFile() string { return filepath.Join(m.socketdir, kubeletDevicePluginCheckpoint) } -// Start starts the Device Plugin Manager -func (m *ManagerImpl) Start() error { +// Start starts the Device Plugin Manager amd start initialization of +// podDevices and allocatedDevices information from checkpoint-ed state and +// starts device plugin registration service. +func (m *ManagerImpl) Start(activePods ActivePodsFunc) error { glog.V(2).Infof("Starting Device Plugin manager") + m.activePods = activePods + + // Loads in allocatedDevices information from disk. + err := m.readCheckpoint() + if err != nil { + glog.Warningf("Continue after failing to read checkpoint file. Device allocation info may NOT be up-to-date. Err: %v", err) + } + socketPath := filepath.Join(m.socketdir, m.socketname) os.MkdirAll(m.socketdir, 0755) @@ -130,6 +202,8 @@ func (m *ManagerImpl) Start() error { pluginapi.RegisterRegistrationServer(m.server, m) go m.server.Serve(s) + glog.V(2).Infof("Serving device plugin registration server on %q", socketPath) + return nil } @@ -150,22 +224,27 @@ func (m *ManagerImpl) Devices() map[string][]pluginapi.Device { // Allocate is the call that you can use to allocate a set of devices // from the registered device plugins. -func (m *ManagerImpl) Allocate(resourceName string, devs []string) (*pluginapi.AllocateResponse, error) { - - if len(devs) == 0 { - return nil, nil +func (m *ManagerImpl) Allocate(node *schedulercache.NodeInfo, attrs *lifecycle.PodAdmitAttributes) error { + pod := attrs.Pod + // TODO: Reuse devices between init containers and regular containers. + for _, container := range pod.Spec.InitContainers { + if err := m.allocateContainerResources(pod, &container); err != nil { + return err + } + } + for _, container := range pod.Spec.Containers { + if err := m.allocateContainerResources(pod, &container); err != nil { + return err + } } - glog.V(3).Infof("Recieved allocation request for devices %v for device plugin %s", - devs, resourceName) - m.mutex.Lock() - e, ok := m.endpoints[resourceName] - m.mutex.Unlock() - if !ok { - return nil, fmt.Errorf("Unknown Device Plugin %s", resourceName) + // quick return if no pluginResources requested + if _, podRequireDevicePluginResource := m.podDevices[string(pod.UID)]; !podRequireDevicePluginResource { + return nil } - return e.allocate(devs) + m.sanitizeNodeAllocatable(node) + return nil } // Register registers a device plugin. @@ -211,12 +290,16 @@ func (m *ManagerImpl) addEndpoint(r *pluginapi.RegisterRequest) { if ok && old != nil { // Pass devices of previous endpoint into re-registered one, // to avoid potential orphaned devices upon re-registration - existingDevs = old.devices + devices := make(map[string]pluginapi.Device) + for _, device := range old.getDevices() { + devices[device.ID] = device + } + existingDevs = devices } m.mutex.Unlock() socketPath := filepath.Join(m.socketdir, r.Endpoint) - e, err := newEndpoint(socketPath, r.ResourceName, existingDevs, m.callback) + e, err := newEndpointImpl(socketPath, r.ResourceName, existingDevs, m.callback) if err != nil { glog.Errorf("Failed to dial device plugin with request %v: %v", r, err) return @@ -259,3 +342,212 @@ func (m *ManagerImpl) addEndpoint(r *pluginapi.RegisterRequest) { m.mutex.Unlock() }() } + +// Checkpoints device to container allocation information to disk. 
+func (m *ManagerImpl) writeCheckpoint() error { + m.mutex.Lock() + data := m.podDevices.toCheckpointData() + m.mutex.Unlock() + + dataJSON, err := json.Marshal(data) + if err != nil { + return err + } + filepath := m.checkpointFile() + return ioutil.WriteFile(filepath, dataJSON, 0644) +} + +// Reads device to container allocation information from disk, and populates +// m.allocatedDevices accordingly. +func (m *ManagerImpl) readCheckpoint() error { + filepath := m.checkpointFile() + content, err := ioutil.ReadFile(filepath) + if err != nil && !os.IsNotExist(err) { + return fmt.Errorf("failed to read checkpoint file %q: %v", filepath, err) + } + glog.V(2).Infof("Read checkpoint file %s\n", filepath) + var data checkpointData + if err := json.Unmarshal(content, &data); err != nil { + return fmt.Errorf("failed to unmarshal checkpoint data: %v", err) + } + + m.mutex.Lock() + defer m.mutex.Unlock() + m.podDevices.fromCheckpointData(data) + m.allocatedDevices = m.podDevices.devices() + return nil +} + +// updateAllocatedDevices gets a list of active pods and then frees any Devices that are bound to +// terminated pods. Returns error on failure. +func (m *ManagerImpl) updateAllocatedDevices(activePods []*v1.Pod) { + m.mutex.Lock() + defer m.mutex.Unlock() + activePodUids := sets.NewString() + for _, pod := range activePods { + activePodUids.Insert(string(pod.UID)) + } + allocatedPodUids := m.podDevices.pods() + podsToBeRemoved := allocatedPodUids.Difference(activePodUids) + if len(podsToBeRemoved) <= 0 { + return + } + glog.V(5).Infof("pods to be removed: %v", podsToBeRemoved.List()) + m.podDevices.delete(podsToBeRemoved.List()) + // Regenerated allocatedDevices after we update pod allocation information. + m.allocatedDevices = m.podDevices.devices() +} + +// Returns list of device Ids we need to allocate with Allocate rpc call. +// Returns empty list in case we don't need to issue the Allocate rpc call. +func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, required int) (sets.String, error) { + m.mutex.Lock() + defer m.mutex.Unlock() + needed := required + // Gets list of devices that have already been allocated. + // This can happen if a container restarts for example. + devices := m.podDevices.containerDevices(podUID, contName, resource) + if devices != nil { + glog.V(3).Infof("Found pre-allocated devices for resource %s container %q in Pod %q: %v", resource, contName, podUID, devices.List()) + needed = needed - devices.Len() + // A pod's resource is not expected to change once admitted by the API server, + // so just fail loudly here. We can revisit this part if this no longer holds. + if needed != 0 { + return nil, fmt.Errorf("pod %v container %v changed request for resource %v from %v to %v", podUID, contName, resource, devices.Len(), required) + } + } + if needed == 0 { + // No change, no work. + return nil, nil + } + devices = sets.NewString() + // Needs to allocate additional devices. + if m.allocatedDevices[resource] == nil { + m.allocatedDevices[resource] = sets.NewString() + } + // Gets Devices in use. + devicesInUse := m.allocatedDevices[resource] + // Gets a list of available devices. + available := m.allDevices[resource].Difference(devicesInUse) + if int(available.Len()) < needed { + return nil, fmt.Errorf("requested number of devices unavailable for %s. 
Requested: %d, Available: %d", resource, needed, available.Len()) + } + allocated := available.UnsortedList()[:needed] + // Updates m.allocatedDevices with allocated devices to prevent them + // from being allocated to other pods/containers, given that we are + // not holding lock during the rpc call. + for _, device := range allocated { + m.allocatedDevices[resource].Insert(device) + devices.Insert(device) + } + return devices, nil +} + +// allocateContainerResources attempts to allocate all of required device +// plugin resources for the input container, issues an Allocate rpc request +// for each new device resource requirement, processes their AllocateResponses, +// and updates the cached containerDevices on success. +func (m *ManagerImpl) allocateContainerResources(pod *v1.Pod, container *v1.Container) error { + podUID := string(pod.UID) + contName := container.Name + allocatedDevicesUpdated := false + for k, v := range container.Resources.Limits { + resource := string(k) + needed := int(v.Value()) + glog.V(3).Infof("needs %d %s", needed, resource) + if _, registeredResource := m.allDevices[resource]; !registeredResource { + continue + } + // Updates allocatedDevices to garbage collect any stranded resources + // before doing the device plugin allocation. + if !allocatedDevicesUpdated { + m.updateAllocatedDevices(m.activePods()) + allocatedDevicesUpdated = true + } + allocDevices, err := m.devicesToAllocate(podUID, contName, resource, needed) + if err != nil { + return err + } + if allocDevices == nil || len(allocDevices) <= 0 { + continue + } + // devicePluginManager.Allocate involves RPC calls to device plugin, which + // could be heavy-weight. Therefore we want to perform this operation outside + // mutex lock. Note if Allocate call fails, we may leave container resources + // partially allocated for the failed container. We rely on updateAllocatedDevices() + // to garbage collect these resources later. Another side effect is that if + // we have X resource A and Y resource B in total, and two containers, container1 + // and container2 both require X resource A and Y resource B. Both allocation + // requests may fail if we serve them in mixed order. + // TODO: may revisit this part later if we see inefficient resource allocation + // in real use as the result of this. Should also consider to parallize device + // plugin Allocate grpc calls if it becomes common that a container may require + // resources from multiple device plugins. + m.mutex.Lock() + e, ok := m.endpoints[resource] + m.mutex.Unlock() + if !ok { + m.mutex.Lock() + m.allocatedDevices = m.podDevices.devices() + m.mutex.Unlock() + return fmt.Errorf("Unknown Device Plugin %s", resource) + } + + devs := allocDevices.UnsortedList() + glog.V(3).Infof("Making allocation request for devices %v for device plugin %s", devs, resource) + resp, err := e.allocate(devs) + if err != nil { + // In case of allocation failure, we want to restore m.allocatedDevices + // to the actual allocated state from m.podDevices. + m.mutex.Lock() + m.allocatedDevices = m.podDevices.devices() + m.mutex.Unlock() + return err + } + + // Update internal cached podDevices state. + m.mutex.Lock() + m.podDevices.insert(podUID, contName, resource, allocDevices, resp) + m.mutex.Unlock() + } + + // Checkpoints device to container allocation information. + return m.writeCheckpoint() +} + +// GetDeviceRunContainerOptions checks whether we have cached containerDevices +// for the passed-in and returns its DeviceRunContainerOptions +// for the found one. 
An empty struct is returned in case no cached state is found. +func (m *ManagerImpl) GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Container) *DeviceRunContainerOptions { + m.mutex.Lock() + defer m.mutex.Unlock() + return m.podDevices.deviceRunContainerOptions(string(pod.UID), container.Name) +} + +// sanitizeNodeAllocatable scans through allocatedDevices in the device manager +// and if necessary, updates allocatableResource in nodeInfo to at least equal to +// the allocated capacity. This allows pods that have already been scheduled on +// the node to pass GeneralPredicates admission checking even upon device plugin failure. +func (m *ManagerImpl) sanitizeNodeAllocatable(node *schedulercache.NodeInfo) { + var newAllocatableResource *schedulercache.Resource + allocatableResource := node.AllocatableResource() + if allocatableResource.ScalarResources == nil { + allocatableResource.ScalarResources = make(map[v1.ResourceName]int64) + } + for resource, devices := range m.allocatedDevices { + needed := devices.Len() + quant, ok := allocatableResource.ScalarResources[v1.ResourceName(resource)] + if ok && int(quant) >= needed { + continue + } + // Needs to update nodeInfo.AllocatableResource to make sure + // NodeInfo.allocatableResource at least equal to the capacity already allocated. + if newAllocatableResource == nil { + newAllocatableResource = allocatableResource.Clone() + } + newAllocatableResource.ScalarResources[v1.ResourceName(resource)] = int64(needed) + } + if newAllocatableResource != nil { + node.SetAllocatableResource(newAllocatableResource) + } +} diff --git a/pkg/kubelet/cm/deviceplugin/device_plugin_handler_stub.go b/pkg/kubelet/cm/deviceplugin/manager_stub.go similarity index 67% rename from pkg/kubelet/cm/deviceplugin/device_plugin_handler_stub.go rename to pkg/kubelet/cm/deviceplugin/manager_stub.go index eb72309046..450164b2ee 100644 --- a/pkg/kubelet/cm/deviceplugin/device_plugin_handler_stub.go +++ b/pkg/kubelet/cm/deviceplugin/manager_stub.go @@ -23,30 +23,35 @@ import ( "k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache" ) -// HandlerStub provides a simple stub implementation for Handler. -type HandlerStub struct{} +// ManagerStub provides a simple stub implementation for the Device Manager. +type ManagerStub struct{} -// NewHandlerStub creates a HandlerStub. -func NewHandlerStub() (*HandlerStub, error) { - return &HandlerStub{}, nil +// NewManagerStub creates a ManagerStub. +func NewManagerStub() (*ManagerStub, error) { + return &ManagerStub{}, nil } // Start simply returns nil. -func (h *HandlerStub) Start(activePods ActivePodsFunc) error { +func (h *ManagerStub) Start(activePods ActivePodsFunc) error { + return nil +} + +// Stop simply returns nil. +func (h *ManagerStub) Stop() error { return nil } // Devices returns an empty map. -func (h *HandlerStub) Devices() map[string][]pluginapi.Device { +func (h *ManagerStub) Devices() map[string][]pluginapi.Device { return make(map[string][]pluginapi.Device) } // Allocate simply returns nil. -func (h *HandlerStub) Allocate(node *schedulercache.NodeInfo, attrs *lifecycle.PodAdmitAttributes) error { +func (h *ManagerStub) Allocate(node *schedulercache.NodeInfo, attrs *lifecycle.PodAdmitAttributes) error { return nil } // GetDeviceRunContainerOptions simply returns nil. 
-func (h *HandlerStub) GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Container) *DeviceRunContainerOptions { +func (h *ManagerStub) GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Container) *DeviceRunContainerOptions { return nil } diff --git a/pkg/kubelet/cm/deviceplugin/manager_test.go b/pkg/kubelet/cm/deviceplugin/manager_test.go index 0d2178f92c..20fb427573 100644 --- a/pkg/kubelet/cm/deviceplugin/manager_test.go +++ b/pkg/kubelet/cm/deviceplugin/manager_test.go @@ -17,13 +17,23 @@ limitations under the License. package deviceplugin import ( + "flag" + "fmt" + "reflect" "sync/atomic" "testing" "time" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - + "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/apimachinery/pkg/util/uuid" pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1alpha" + "k8s.io/kubernetes/pkg/kubelet/lifecycle" + "k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache" ) const ( @@ -33,10 +43,8 @@ const ( ) func TestNewManagerImpl(t *testing.T) { - _, err := NewManagerImpl("", func(n string, a, u, r []pluginapi.Device) {}) - require.Error(t, err) - - _, err = NewManagerImpl(socketName, func(n string, a, u, r []pluginapi.Device) {}) + verifyCapacityFunc := func(updates v1.ResourceList) {} + _, err := newManagerImpl(verifyCapacityFunc, socketName) require.NoError(t, err) } @@ -72,6 +80,7 @@ func TestDevicePluginReRegistration(t *testing.T) { m, p1 := setup(t, devs, callback) p1.Register(socketName, testResourceName) // Wait for the first callback to be issued. + <-callbackChan // Wait till the endpoint is added to the manager. for i := 0; i < 20; i++ { @@ -113,10 +122,17 @@ func TestDevicePluginReRegistration(t *testing.T) { } -func setup(t *testing.T, devs []*pluginapi.Device, callback MonitorCallback) (Manager, *Stub) { - m, err := NewManagerImpl(socketName, callback) +func setup(t *testing.T, devs []*pluginapi.Device, callback monitorCallback) (Manager, *Stub) { + updateCapacity := func(v1.ResourceList) {} + m, err := newManagerImpl(updateCapacity, socketName) require.NoError(t, err) - err = m.Start() + + m.callback = callback + + activePods := func() []*v1.Pod { + return []*v1.Pod{} + } + err = m.Start(activePods) require.NoError(t, err) p := NewDevicePluginStub(devs, pluginSocketName) @@ -130,3 +146,387 @@ func cleanup(t *testing.T, m Manager, p *Stub) { p.Stop() m.Stop() } + +func TestUpdateCapacity(t *testing.T) { + var expected = v1.ResourceList{} + as := assert.New(t) + verifyCapacityFunc := func(updates v1.ResourceList) { + as.Equal(expected, updates) + } + testManager, err := newManagerImpl(verifyCapacityFunc, socketName) + as.NotNil(testManager) + as.Nil(err) + + devs := []pluginapi.Device{ + {ID: "Device1", Health: pluginapi.Healthy}, + {ID: "Device2", Health: pluginapi.Healthy}, + {ID: "Device3", Health: pluginapi.Unhealthy}, + } + + resourceName := "resource1" + // Adds three devices for resource1, two healthy and one unhealthy. + // Expects capacity for resource1 to be 2. + expected[v1.ResourceName(resourceName)] = *resource.NewQuantity(int64(2), resource.DecimalSI) + testManager.callback(resourceName, devs, []pluginapi.Device{}, []pluginapi.Device{}) + // Deletes an unhealthy device should NOT change capacity. + testManager.callback(resourceName, []pluginapi.Device{}, []pluginapi.Device{}, []pluginapi.Device{devs[2]}) + // Updates a healthy device to unhealthy should reduce capacity by 1. 
+ expected[v1.ResourceName(resourceName)] = *resource.NewQuantity(int64(1), resource.DecimalSI) + // Deletes a healthy device should reduce capacity by 1. + expected[v1.ResourceName(resourceName)] = *resource.NewQuantity(int64(0), resource.DecimalSI) + // Tests adding another resource. + delete(expected, v1.ResourceName(resourceName)) + resourceName2 := "resource2" + expected[v1.ResourceName(resourceName2)] = *resource.NewQuantity(int64(2), resource.DecimalSI) + testManager.callback(resourceName2, devs, []pluginapi.Device{}, []pluginapi.Device{}) +} + +type stringPairType struct { + value1 string + value2 string +} + +func constructDevices(devices []string) sets.String { + ret := sets.NewString() + for _, dev := range devices { + ret.Insert(dev) + } + return ret +} + +func constructAllocResp(devices, mounts, envs map[string]string) *pluginapi.AllocateResponse { + resp := &pluginapi.AllocateResponse{} + for k, v := range devices { + resp.Devices = append(resp.Devices, &pluginapi.DeviceSpec{ + HostPath: k, + ContainerPath: v, + Permissions: "mrw", + }) + } + for k, v := range mounts { + resp.Mounts = append(resp.Mounts, &pluginapi.Mount{ + ContainerPath: k, + HostPath: v, + ReadOnly: true, + }) + } + resp.Envs = make(map[string]string) + for k, v := range envs { + resp.Envs[k] = v + } + return resp +} + +func TestCheckpoint(t *testing.T) { + resourceName1 := "domain1.com/resource1" + resourceName2 := "domain2.com/resource2" + + testManager := &ManagerImpl{ + allDevices: make(map[string]sets.String), + allocatedDevices: make(map[string]sets.String), + podDevices: make(podDevices), + } + + testManager.podDevices.insert("pod1", "con1", resourceName1, + constructDevices([]string{"dev1", "dev2"}), + constructAllocResp(map[string]string{"/dev/r1dev1": "/dev/r1dev1", "/dev/r1dev2": "/dev/r1dev2"}, + map[string]string{"/home/r1lib1": "/usr/r1lib1"}, map[string]string{})) + testManager.podDevices.insert("pod1", "con1", resourceName2, + constructDevices([]string{"dev1", "dev2"}), + constructAllocResp(map[string]string{"/dev/r2dev1": "/dev/r2dev1", "/dev/r2dev2": "/dev/r2dev2"}, + map[string]string{"/home/r2lib1": "/usr/r2lib1"}, + map[string]string{"r2devices": "dev1 dev2"})) + testManager.podDevices.insert("pod1", "con2", resourceName1, + constructDevices([]string{"dev3"}), + constructAllocResp(map[string]string{"/dev/r1dev3": "/dev/r1dev3"}, + map[string]string{"/home/r1lib1": "/usr/r1lib1"}, map[string]string{})) + testManager.podDevices.insert("pod2", "con1", resourceName1, + constructDevices([]string{"dev4"}), + constructAllocResp(map[string]string{"/dev/r1dev4": "/dev/r1dev4"}, + map[string]string{"/home/r1lib1": "/usr/r1lib1"}, map[string]string{})) + + expectedPodDevices := testManager.podDevices + expectedAllocatedDevices := testManager.podDevices.devices() + + err := testManager.writeCheckpoint() + as := assert.New(t) + + as.Nil(err) + testManager.podDevices = make(podDevices) + err = testManager.readCheckpoint() + as.Nil(err) + + as.Equal(len(expectedPodDevices), len(testManager.podDevices)) + for podUID, containerDevices := range expectedPodDevices { + for conName, resources := range containerDevices { + for resource := range resources { + as.True(reflect.DeepEqual( + expectedPodDevices.containerDevices(podUID, conName, resource), + testManager.podDevices.containerDevices(podUID, conName, resource))) + opts1 := expectedPodDevices.deviceRunContainerOptions(podUID, conName) + opts2 := testManager.podDevices.deviceRunContainerOptions(podUID, conName) + as.Equal(len(opts1.Envs), 
len(opts2.Envs)) + as.Equal(len(opts1.Mounts), len(opts2.Mounts)) + as.Equal(len(opts1.Devices), len(opts2.Devices)) + } + } + } + as.True(reflect.DeepEqual(expectedAllocatedDevices, testManager.allocatedDevices)) +} + +type activePodsStub struct { + activePods []*v1.Pod +} + +func (a *activePodsStub) getActivePods() []*v1.Pod { + return a.activePods +} + +func (a *activePodsStub) updateActivePods(newPods []*v1.Pod) { + a.activePods = newPods +} + +type MockEndpoint struct { + allocateFunc func(devs []string) (*pluginapi.AllocateResponse, error) +} + +func (m *MockEndpoint) stop() {} +func (m *MockEndpoint) run() {} + +func (m *MockEndpoint) getDevices() []pluginapi.Device { + return []pluginapi.Device{} +} + +func (m *MockEndpoint) callback(resourceName string, added, updated, deleted []pluginapi.Device) {} + +func (m *MockEndpoint) allocate(devs []string) (*pluginapi.AllocateResponse, error) { + if m.allocateFunc != nil { + return m.allocateFunc(devs) + } + return nil, nil +} + +func TestPodContainerDeviceAllocation(t *testing.T) { + flag.Set("alsologtostderr", fmt.Sprintf("%t", true)) + var logLevel string + flag.StringVar(&logLevel, "logLevel", "4", "test") + flag.Lookup("v").Value.Set(logLevel) + + resourceName1 := "domain1.com/resource1" + resourceQuantity1 := *resource.NewQuantity(int64(2), resource.DecimalSI) + devID1 := "dev1" + devID2 := "dev2" + resourceName2 := "domain2.com/resource2" + resourceQuantity2 := *resource.NewQuantity(int64(1), resource.DecimalSI) + devID3 := "dev3" + devID4 := "dev4" + + as := require.New(t) + monitorCallback := func(resourceName string, added, updated, deleted []pluginapi.Device) {} + podsStub := activePodsStub{ + activePods: []*v1.Pod{}, + } + cachedNode := &v1.Node{ + Status: v1.NodeStatus{ + Allocatable: v1.ResourceList{}, + }, + } + nodeInfo := &schedulercache.NodeInfo{} + nodeInfo.SetNode(cachedNode) + + testManager := &ManagerImpl{ + callback: monitorCallback, + allDevices: make(map[string]sets.String), + allocatedDevices: make(map[string]sets.String), + endpoints: make(map[string]endpoint), + podDevices: make(podDevices), + activePods: podsStub.getActivePods, + } + + testManager.allDevices[resourceName1] = sets.NewString() + testManager.allDevices[resourceName1].Insert(devID1) + testManager.allDevices[resourceName1].Insert(devID2) + testManager.allDevices[resourceName2] = sets.NewString() + testManager.allDevices[resourceName2].Insert(devID3) + testManager.allDevices[resourceName2].Insert(devID4) + + testManager.endpoints[resourceName1] = &MockEndpoint{ + allocateFunc: func(devs []string) (*pluginapi.AllocateResponse, error) { + resp := new(pluginapi.AllocateResponse) + resp.Envs = make(map[string]string) + for _, dev := range devs { + switch dev { + case "dev1": + resp.Devices = append(resp.Devices, &pluginapi.DeviceSpec{ + ContainerPath: "/dev/aaa", + HostPath: "/dev/aaa", + Permissions: "mrw", + }) + + resp.Devices = append(resp.Devices, &pluginapi.DeviceSpec{ + ContainerPath: "/dev/bbb", + HostPath: "/dev/bbb", + Permissions: "mrw", + }) + + resp.Mounts = append(resp.Mounts, &pluginapi.Mount{ + ContainerPath: "/container_dir1/file1", + HostPath: "host_dir1/file1", + ReadOnly: true, + }) + + case "dev2": + resp.Devices = append(resp.Devices, &pluginapi.DeviceSpec{ + ContainerPath: "/dev/ccc", + HostPath: "/dev/ccc", + Permissions: "mrw", + }) + + resp.Mounts = append(resp.Mounts, &pluginapi.Mount{ + ContainerPath: "/container_dir1/file2", + HostPath: "host_dir1/file2", + ReadOnly: true, + }) + + resp.Envs["key1"] = "val1" + } + } + return 
resp, nil + }, + } + + testManager.endpoints[resourceName2] = &MockEndpoint{ + allocateFunc: func(devs []string) (*pluginapi.AllocateResponse, error) { + resp := new(pluginapi.AllocateResponse) + resp.Envs = make(map[string]string) + for _, dev := range devs { + switch dev { + case "dev3": + resp.Envs["key2"] = "val2" + + case "dev4": + resp.Envs["key2"] = "val3" + } + } + return resp, nil + }, + } + + pod := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + UID: uuid.NewUUID(), + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: string(uuid.NewUUID()), + Resources: v1.ResourceRequirements{ + Limits: v1.ResourceList{ + v1.ResourceName(resourceName1): resourceQuantity1, + v1.ResourceName("cpu"): resourceQuantity1, + v1.ResourceName(resourceName2): resourceQuantity2, + }, + }, + }, + }, + }, + } + + podsStub.updateActivePods([]*v1.Pod{pod}) + err := testManager.Allocate(nodeInfo, &lifecycle.PodAdmitAttributes{Pod: pod}) + as.Nil(err) + runContainerOpts := testManager.GetDeviceRunContainerOptions(pod, &pod.Spec.Containers[0]) + as.NotNil(runContainerOpts) + as.Equal(len(runContainerOpts.Devices), 3) + as.Equal(len(runContainerOpts.Mounts), 2) + as.Equal(len(runContainerOpts.Envs), 2) + + // Requesting to create a pod without enough resources should fail. + as.Equal(2, testManager.allocatedDevices[resourceName1].Len()) + failPod := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + UID: uuid.NewUUID(), + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: string(uuid.NewUUID()), + Resources: v1.ResourceRequirements{ + Limits: v1.ResourceList{ + v1.ResourceName(resourceName1): resourceQuantity2, + }, + }, + }, + }, + }, + } + err = testManager.Allocate(nodeInfo, &lifecycle.PodAdmitAttributes{Pod: failPod}) + as.NotNil(err) + runContainerOpts2 := testManager.GetDeviceRunContainerOptions(failPod, &failPod.Spec.Containers[0]) + as.Nil(runContainerOpts2) + + // Requesting to create a new pod with a single resourceName2 should succeed. 
+ newPod := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + UID: uuid.NewUUID(), + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Name: string(uuid.NewUUID()), + Resources: v1.ResourceRequirements{ + Limits: v1.ResourceList{ + v1.ResourceName(resourceName2): resourceQuantity2, + }, + }, + }, + }, + }, + } + err = testManager.Allocate(nodeInfo, &lifecycle.PodAdmitAttributes{Pod: newPod}) + as.Nil(err) + runContainerOpts3 := testManager.GetDeviceRunContainerOptions(newPod, &newPod.Spec.Containers[0]) + as.Equal(1, len(runContainerOpts3.Envs)) +} + +func TestSanitizeNodeAllocatable(t *testing.T) { + resourceName1 := "domain1.com/resource1" + devID1 := "dev1" + + resourceName2 := "domain2.com/resource2" + devID2 := "dev2" + + as := assert.New(t) + monitorCallback := func(resourceName string, added, updated, deleted []pluginapi.Device) {} + + testManager := &ManagerImpl{ + callback: monitorCallback, + allDevices: make(map[string]sets.String), + allocatedDevices: make(map[string]sets.String), + podDevices: make(podDevices), + } + // require one of resource1 and one of resource2 + testManager.allocatedDevices[resourceName1] = sets.NewString() + testManager.allocatedDevices[resourceName1].Insert(devID1) + testManager.allocatedDevices[resourceName2] = sets.NewString() + testManager.allocatedDevices[resourceName2].Insert(devID2) + + cachedNode := &v1.Node{ + Status: v1.NodeStatus{ + Allocatable: v1.ResourceList{ + // has no resource1 and two of resource2 + v1.ResourceName(resourceName2): *resource.NewQuantity(int64(2), resource.DecimalSI), + }, + }, + } + nodeInfo := &schedulercache.NodeInfo{} + nodeInfo.SetNode(cachedNode) + + testManager.sanitizeNodeAllocatable(nodeInfo) + + allocatableScalarResources := nodeInfo.AllocatableResource().ScalarResources + // allocatable in nodeInfo is less than needed, should update + as.Equal(1, int(allocatableScalarResources[v1.ResourceName(resourceName1)])) + // allocatable in nodeInfo is more than needed, should skip updating + as.Equal(2, int(allocatableScalarResources[v1.ResourceName(resourceName2)])) +} diff --git a/pkg/kubelet/cm/deviceplugin/pod_devices.go b/pkg/kubelet/cm/deviceplugin/pod_devices.go index e343321e22..e903dbf103 100644 --- a/pkg/kubelet/cm/deviceplugin/pod_devices.go +++ b/pkg/kubelet/cm/deviceplugin/pod_devices.go @@ -116,6 +116,11 @@ func (pdev podDevices) toCheckpointData() checkpointData { for conName, resources := range containerDevices { for resource, devices := range resources { devIds := devices.deviceIds.UnsortedList() + if devices.allocResp == nil { + glog.Errorf("Can't marshal allocResp for %v %v %v: allocation response is missing", podUID, conName, resource) + continue + } + allocResp, err := devices.allocResp.Marshal() if err != nil { glog.Errorf("Can't marshal allocResp for %v %v %v: %v", podUID, conName, resource, err) diff --git a/pkg/kubelet/cm/deviceplugin/types.go b/pkg/kubelet/cm/deviceplugin/types.go index 9745393d49..4b1607a71d 100644 --- a/pkg/kubelet/cm/deviceplugin/types.go +++ b/pkg/kubelet/cm/deviceplugin/types.go @@ -17,34 +17,40 @@ limitations under the License. package deviceplugin import ( + "k8s.io/api/core/v1" pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1alpha" kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" + "k8s.io/kubernetes/pkg/kubelet/lifecycle" + "k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache" ) -// MonitorCallback is the function called when a device's health state changes, -// or new devices are reported, or old devices are deleted. 
-// Updated contains the most recent state of the Device.
-type MonitorCallback func(resourceName string, added, updated, deleted []pluginapi.Device)
-
 // Manager manages all the Device Plugins running on a node.
 type Manager interface {
- // Start starts the gRPC Registration service.
- Start() error
+ // Start starts the device plugin registration service.
+ Start(activePods ActivePodsFunc) error

 // Devices is the map of devices that have registered themselves
 // against the manager.
 // The map key is the ResourceName of the device plugins.
 Devices() map[string][]pluginapi.Device

- // Allocate takes resourceName and list of device Ids, and calls the
- // gRPC Allocate on the device plugin matching the resourceName.
- Allocate(string, []string) (*pluginapi.AllocateResponse, error)
+ // Allocate configures and assigns devices to pods. The pods are provided
+ // through the pod admission attributes in the attrs argument. From the
+ // requested device resources, Allocate will communicate with the owning
+ // device plugin to allow setup procedures to take place, and for the
+ // device plugin to provide runtime settings to use the device (environment
+ // variables, mount points and device files). The node object is provided
+ // for the device manager to update the node capacity to reflect the
+ // currently available devices.
+ Allocate(node *schedulercache.NodeInfo, attrs *lifecycle.PodAdmitAttributes) error

 // Stop stops the manager.
 Stop() error

- // Returns checkpoint file path.
- CheckpointFile() string
+ // GetDeviceRunContainerOptions checks whether we have cached containerDevices
+ // for the passed-in pod and container, and returns its DeviceRunContainerOptions
+ // for the found one. An empty struct is returned in case no cached state is found.
+ GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Container) *DeviceRunContainerOptions
 }

 // DeviceRunContainerOptions contains the combined container runtime settings to consume its allocated devices.
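The reworked Manager interface above splits device handling into two phases: Allocate runs once at pod admission time, against the scheduler's NodeInfo and the pod admission attributes, while GetDeviceRunContainerOptions is called later, per container, to fetch the cached runtime settings (devices, mounts, environment variables). The snippet below is a minimal caller-side sketch of that flow, assuming the deviceplugin, lifecycle and schedulercache packages referenced in the hunks above; the helper names admitPod and setupContainer are invented for illustration and are not part of this patch series.

package devicepluginexample

import (
	"fmt"

	"k8s.io/api/core/v1"
	"k8s.io/kubernetes/pkg/kubelet/cm/deviceplugin"
	"k8s.io/kubernetes/pkg/kubelet/lifecycle"
	"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
)

// admitPod (hypothetical helper) shows the admission-time half of the flow:
// device resources are granted or rejected once, before any container starts.
func admitPod(mgr deviceplugin.Manager, node *schedulercache.NodeInfo, pod *v1.Pod) error {
	attrs := &lifecycle.PodAdmitAttributes{Pod: pod}
	if err := mgr.Allocate(node, attrs); err != nil {
		return fmt.Errorf("pod %q rejected at admission: %v", pod.Name, err)
	}
	return nil
}

// setupContainer (hypothetical helper) shows the run-time half: the cached
// allocation is turned into per-container runtime options.
func setupContainer(mgr deviceplugin.Manager, pod *v1.Pod, container *v1.Container) {
	opts := mgr.GetDeviceRunContainerOptions(pod, container)
	if opts == nil {
		// The container requested no device plugin resources.
		return
	}
	fmt.Printf("container %q: %d devices, %d mounts, %d env vars\n",
		container.Name, len(opts.Devices), len(opts.Mounts), len(opts.Envs))
}

The nil check on the returned options mirrors the manager tests above, where a pod whose allocation failed yields no run-container options.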
From 21ab4d0c9ba3416792a1a171ad2618eb990e6d9b Mon Sep 17 00:00:00 2001 From: Eric Chiang Date: Mon, 20 Nov 2017 14:03:04 -0800 Subject: [PATCH 30/33] rbac bootstrap policy: add selfsubjectrulesreviews to basic-user --- plugin/pkg/auth/authorizer/rbac/bootstrappolicy/policy.go | 2 +- .../authorizer/rbac/bootstrappolicy/testdata/cluster-roles.yaml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/plugin/pkg/auth/authorizer/rbac/bootstrappolicy/policy.go b/plugin/pkg/auth/authorizer/rbac/bootstrappolicy/policy.go index 16a5d9a44e..29bd87f2a7 100644 --- a/plugin/pkg/auth/authorizer/rbac/bootstrappolicy/policy.go +++ b/plugin/pkg/auth/authorizer/rbac/bootstrappolicy/policy.go @@ -169,7 +169,7 @@ func ClusterRoles() []rbac.ClusterRole { ObjectMeta: metav1.ObjectMeta{Name: "system:basic-user"}, Rules: []rbac.PolicyRule{ // TODO add future selfsubjectrulesreview, project request APIs, project listing APIs - rbac.NewRule("create").Groups(authorizationGroup).Resources("selfsubjectaccessreviews").RuleOrDie(), + rbac.NewRule("create").Groups(authorizationGroup).Resources("selfsubjectaccessreviews", "selfsubjectrulesreviews").RuleOrDie(), }, }, diff --git a/plugin/pkg/auth/authorizer/rbac/bootstrappolicy/testdata/cluster-roles.yaml b/plugin/pkg/auth/authorizer/rbac/bootstrappolicy/testdata/cluster-roles.yaml index 7fe801931a..4db6a8a113 100644 --- a/plugin/pkg/auth/authorizer/rbac/bootstrappolicy/testdata/cluster-roles.yaml +++ b/plugin/pkg/auth/authorizer/rbac/bootstrappolicy/testdata/cluster-roles.yaml @@ -522,6 +522,7 @@ items: - authorization.k8s.io resources: - selfsubjectaccessreviews + - selfsubjectrulesreviews verbs: - create - apiVersion: rbac.authorization.k8s.io/v1 From 25469e9b4467a2ce7b7c6d3284905fe5a6e86c3c Mon Sep 17 00:00:00 2001 From: Jun Xiang Tee Date: Mon, 20 Nov 2017 15:36:27 -0800 Subject: [PATCH 31/33] convert testScaledRolloutDeployment e2e test to integration test --- test/e2e/apps/BUILD | 1 - test/e2e/apps/deployment.go | 179 ---------------- test/integration/deployment/BUILD | 1 + .../integration/deployment/deployment_test.go | 192 ++++++++++++++++++ test/integration/deployment/util.go | 13 +- test/integration/replicaset/BUILD | 1 + .../integration/replicaset/replicaset_test.go | 12 +- test/utils/replicaset.go | 16 ++ 8 files changed, 225 insertions(+), 190 deletions(-) diff --git a/test/e2e/apps/BUILD b/test/e2e/apps/BUILD index 6b0cca1d2d..b6bb7d4fbe 100644 --- a/test/e2e/apps/BUILD +++ b/test/e2e/apps/BUILD @@ -69,7 +69,6 @@ go_library( "//vendor/k8s.io/apimachinery/pkg/util/wait:go_default_library", "//vendor/k8s.io/apimachinery/pkg/watch:go_default_library", "//vendor/k8s.io/client-go/kubernetes:go_default_library", - "//vendor/k8s.io/client-go/kubernetes/typed/extensions/v1beta1:go_default_library", "//vendor/k8s.io/client-go/tools/cache:go_default_library", ], ) diff --git a/test/e2e/apps/deployment.go b/test/e2e/apps/deployment.go index 77845b695d..b8e845a5a9 100644 --- a/test/e2e/apps/deployment.go +++ b/test/e2e/apps/deployment.go @@ -35,7 +35,6 @@ import ( "k8s.io/apimachinery/pkg/util/wait" "k8s.io/apimachinery/pkg/watch" clientset "k8s.io/client-go/kubernetes" - extensionsclient "k8s.io/client-go/kubernetes/typed/extensions/v1beta1" extensionsinternal "k8s.io/kubernetes/pkg/apis/extensions" "k8s.io/kubernetes/pkg/client/clientset_generated/internalclientset" deploymentutil "k8s.io/kubernetes/pkg/controller/deployment/util" @@ -87,10 +86,6 @@ var _ = SIGDescribe("Deployment", func() { It("deployment should support rollback", func() { 
testRollbackDeployment(f) }) - It("scaled rollout deployment should not block on annotation check", func() { - testScaledRolloutDeployment(f) - }) - It("iterative rollouts should eventually progress", func() { testIterativeDeployments(f) }) @@ -621,159 +616,6 @@ func testRollbackDeployment(f *framework.Framework) { Expect(err).NotTo(HaveOccurred()) } -func testScaledRolloutDeployment(f *framework.Framework) { - ns := f.Namespace.Name - c := f.ClientSet - - podLabels := map[string]string{"name": NginxImageName} - replicas := int32(10) - - // Create a nginx deployment. - deploymentName := "nginx" - d := framework.NewDeployment(deploymentName, replicas, podLabels, NginxImageName, NginxImage, extensions.RollingUpdateDeploymentStrategyType) - d.Spec.Strategy.RollingUpdate = new(extensions.RollingUpdateDeployment) - d.Spec.Strategy.RollingUpdate.MaxSurge = intOrStrP(3) - d.Spec.Strategy.RollingUpdate.MaxUnavailable = intOrStrP(2) - - framework.Logf("Creating deployment %q", deploymentName) - deployment, err := c.ExtensionsV1beta1().Deployments(ns).Create(d) - Expect(err).NotTo(HaveOccurred()) - - framework.Logf("Waiting for observed generation %d", deployment.Generation) - Expect(framework.WaitForObservedDeployment(c, ns, deploymentName, deployment.Generation)).NotTo(HaveOccurred()) - - // Verify that the required pods have come up. - framework.Logf("Waiting for all required pods to come up") - err = framework.VerifyPodsRunning(f.ClientSet, ns, NginxImageName, false, *(deployment.Spec.Replicas)) - Expect(err).NotTo(HaveOccurred(), "error in waiting for pods to come up: %v", err) - - framework.Logf("Waiting for deployment %q to complete", deployment.Name) - Expect(framework.WaitForDeploymentComplete(c, deployment)).NotTo(HaveOccurred()) - - first, err := deploymentutil.GetNewReplicaSet(deployment, c.ExtensionsV1beta1()) - Expect(err).NotTo(HaveOccurred()) - - // Update the deployment with a non-existent image so that the new replica set will be blocked. 
- framework.Logf("Updating deployment %q with a non-existent image", deploymentName) - deployment, err = framework.UpdateDeploymentWithRetries(c, ns, d.Name, func(update *extensions.Deployment) { - update.Spec.Template.Spec.Containers[0].Image = "nginx:404" - }) - Expect(err).NotTo(HaveOccurred()) - - framework.Logf("Waiting for observed generation %d", deployment.Generation) - err = framework.WaitForObservedDeployment(c, ns, deploymentName, deployment.Generation) - Expect(err).NotTo(HaveOccurred()) - - deployment, err = c.ExtensionsV1beta1().Deployments(ns).Get(deploymentName, metav1.GetOptions{}) - Expect(err).NotTo(HaveOccurred()) - - if deployment.Status.AvailableReplicas < deploymentutil.MinAvailable(deployment) { - Expect(fmt.Errorf("Observed %d available replicas, less than min required %d", deployment.Status.AvailableReplicas, deploymentutil.MinAvailable(deployment))).NotTo(HaveOccurred()) - } - - framework.Logf("Checking that the replica sets for %q are synced", deploymentName) - second, err := deploymentutil.GetNewReplicaSet(deployment, c.ExtensionsV1beta1()) - Expect(err).NotTo(HaveOccurred()) - - first, err = c.ExtensionsV1beta1().ReplicaSets(first.Namespace).Get(first.Name, metav1.GetOptions{}) - Expect(err).NotTo(HaveOccurred()) - - firstCond := replicaSetHasDesiredReplicas(c.ExtensionsV1beta1(), first) - err = wait.PollImmediate(10*time.Millisecond, 1*time.Minute, firstCond) - Expect(err).NotTo(HaveOccurred()) - - secondCond := replicaSetHasDesiredReplicas(c.ExtensionsV1beta1(), second) - err = wait.PollImmediate(10*time.Millisecond, 1*time.Minute, secondCond) - Expect(err).NotTo(HaveOccurred()) - - framework.Logf("Updating the size (up) and template at the same time for deployment %q", deploymentName) - newReplicas := int32(20) - deployment, err = framework.UpdateDeploymentWithRetries(c, ns, deployment.Name, func(update *extensions.Deployment) { - update.Spec.Replicas = &newReplicas - update.Spec.Template.Spec.Containers[0].Image = NautilusImage - }) - Expect(err).NotTo(HaveOccurred()) - - err = framework.WaitForObservedDeployment(c, ns, deploymentName, deployment.Generation) - Expect(err).NotTo(HaveOccurred()) - - framework.Logf("Waiting for deployment status to sync (current available: %d, minimum available: %d)", deployment.Status.AvailableReplicas, deploymentutil.MinAvailable(deployment)) - Expect(framework.WaitForDeploymentComplete(c, deployment)).NotTo(HaveOccurred()) - - oldRSs, _, rs, err := deploymentutil.GetAllReplicaSets(deployment, c.ExtensionsV1beta1()) - Expect(err).NotTo(HaveOccurred()) - - for _, rs := range append(oldRSs, rs) { - framework.Logf("Ensuring replica set %q has the correct desiredReplicas annotation", rs.Name) - desired, ok := deploymentutil.GetDesiredReplicasAnnotation(rs) - if !ok || desired == *(deployment.Spec.Replicas) { - continue - } - err = fmt.Errorf("unexpected desiredReplicas annotation %d for replica set %q", desired, rs.Name) - Expect(err).NotTo(HaveOccurred()) - } - - // Update the deployment with a non-existent image so that the new replica set will be blocked. 
- framework.Logf("Updating deployment %q with a non-existent image", deploymentName) - deployment, err = framework.UpdateDeploymentWithRetries(c, ns, d.Name, func(update *extensions.Deployment) { - update.Spec.Template.Spec.Containers[0].Image = "nginx:404" - }) - Expect(err).NotTo(HaveOccurred()) - - framework.Logf("Waiting for observed generation %d", deployment.Generation) - err = framework.WaitForObservedDeployment(c, ns, deploymentName, deployment.Generation) - Expect(err).NotTo(HaveOccurred()) - - deployment, err = c.ExtensionsV1beta1().Deployments(ns).Get(deploymentName, metav1.GetOptions{}) - Expect(err).NotTo(HaveOccurred()) - - if deployment.Status.AvailableReplicas < deploymentutil.MinAvailable(deployment) { - Expect(fmt.Errorf("Observed %d available replicas, less than min required %d", deployment.Status.AvailableReplicas, deploymentutil.MinAvailable(deployment))).NotTo(HaveOccurred()) - } - - framework.Logf("Checking that the replica sets for %q are synced", deploymentName) - oldRs, err := c.ExtensionsV1beta1().ReplicaSets(rs.Namespace).Get(rs.Name, metav1.GetOptions{}) - Expect(err).NotTo(HaveOccurred()) - - newRs, err := deploymentutil.GetNewReplicaSet(deployment, c.ExtensionsV1beta1()) - Expect(err).NotTo(HaveOccurred()) - - oldCond := replicaSetHasDesiredReplicas(c.ExtensionsV1beta1(), oldRs) - err = wait.PollImmediate(10*time.Millisecond, 1*time.Minute, oldCond) - Expect(err).NotTo(HaveOccurred()) - - newCond := replicaSetHasDesiredReplicas(c.ExtensionsV1beta1(), newRs) - err = wait.PollImmediate(10*time.Millisecond, 1*time.Minute, newCond) - Expect(err).NotTo(HaveOccurred()) - - framework.Logf("Updating the size (down) and template at the same time for deployment %q", deploymentName) - newReplicas = int32(5) - deployment, err = framework.UpdateDeploymentWithRetries(c, ns, deployment.Name, func(update *extensions.Deployment) { - update.Spec.Replicas = &newReplicas - update.Spec.Template.Spec.Containers[0].Image = KittenImage - }) - Expect(err).NotTo(HaveOccurred()) - - err = framework.WaitForObservedDeployment(c, ns, deploymentName, deployment.Generation) - Expect(err).NotTo(HaveOccurred()) - - framework.Logf("Waiting for deployment status to sync (current available: %d, minimum available: %d)", deployment.Status.AvailableReplicas, deploymentutil.MinAvailable(deployment)) - Expect(framework.WaitForDeploymentComplete(c, deployment)).NotTo(HaveOccurred()) - - oldRSs, _, rs, err = deploymentutil.GetAllReplicaSets(deployment, c.ExtensionsV1beta1()) - Expect(err).NotTo(HaveOccurred()) - - for _, rs := range append(oldRSs, rs) { - framework.Logf("Ensuring replica set %q has the correct desiredReplicas annotation", rs.Name) - desired, ok := deploymentutil.GetDesiredReplicasAnnotation(rs) - if !ok || desired == *(deployment.Spec.Replicas) { - continue - } - err = fmt.Errorf("unexpected desiredReplicas annotation %d for replica set %q", desired, rs.Name) - Expect(err).NotTo(HaveOccurred()) - } -} - func randomScale(d *extensions.Deployment, i int) { switch r := rand.Float32(); { case r < 0.3: @@ -904,17 +746,6 @@ func testIterativeDeployments(f *framework.Framework) { Expect(framework.WaitForDeploymentWithCondition(c, ns, deploymentName, deploymentutil.NewRSAvailableReason, extensions.DeploymentProgressing)).NotTo(HaveOccurred()) } -func replicaSetHasDesiredReplicas(rsClient extensionsclient.ReplicaSetsGetter, replicaSet *extensions.ReplicaSet) wait.ConditionFunc { - desiredGeneration := replicaSet.Generation - return func() (bool, error) { - rs, err := 
rsClient.ReplicaSets(replicaSet.Namespace).Get(replicaSet.Name, metav1.GetOptions{}) - if err != nil { - return false, err - } - return rs.Status.ObservedGeneration >= desiredGeneration && rs.Status.Replicas == *(rs.Spec.Replicas), nil - } -} - func testDeploymentsControllerRef(f *framework.Framework) { ns := f.Namespace.Name c := f.ClientSet @@ -954,16 +785,6 @@ func testDeploymentsControllerRef(f *framework.Framework) { Expect(err).NotTo(HaveOccurred()) } -func waitDeploymentReplicaSetsControllerRef(c clientset.Interface, ns string, uid types.UID, label map[string]string) func() (bool, error) { - return func() (bool, error) { - err := checkDeploymentReplicaSetsControllerRef(c, ns, uid, label) - if err != nil { - return false, nil - } - return true, nil - } -} - func checkDeploymentReplicaSetsControllerRef(c clientset.Interface, ns string, uid types.UID, label map[string]string) error { rsList := listDeploymentReplicaSets(c, ns, label) for _, rs := range rsList.Items { diff --git a/test/integration/deployment/BUILD b/test/integration/deployment/BUILD index ee29e8243d..4e88a855b8 100644 --- a/test/integration/deployment/BUILD +++ b/test/integration/deployment/BUILD @@ -42,6 +42,7 @@ go_library( "//vendor/k8s.io/api/core/v1:go_default_library", "//vendor/k8s.io/api/extensions/v1beta1:go_default_library", "//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library", + "//vendor/k8s.io/apimachinery/pkg/util/intstr:go_default_library", "//vendor/k8s.io/apimachinery/pkg/util/wait:go_default_library", "//vendor/k8s.io/client-go/informers:go_default_library", "//vendor/k8s.io/client-go/kubernetes:go_default_library", diff --git a/test/integration/deployment/deployment_test.go b/test/integration/deployment/deployment_test.go index deb4807a05..2745b3f0d6 100644 --- a/test/integration/deployment/deployment_test.go +++ b/test/integration/deployment/deployment_test.go @@ -876,3 +876,195 @@ func TestOverlappingDeployments(t *testing.T) { } } } + +// Deployment should not block rollout when updating spec replica number and template at the same time. 
+func TestScaledRolloutDeployment(t *testing.T) { + s, closeFn, rm, dc, informers, c := dcSetup(t) + defer closeFn() + name := "test-scaled-rollout-deployment" + ns := framework.CreateTestingNamespace(name, s, t) + defer framework.DeleteTestingNamespace(ns, s, t) + + stopCh := make(chan struct{}) + defer close(stopCh) + informers.Start(stopCh) + go rm.Run(5, stopCh) + go dc.Run(5, stopCh) + + // Create a deployment with rolling update strategy, max surge = 3, and max unavailable = 2 + var err error + replicas := int32(10) + tester := &deploymentTester{t: t, c: c, deployment: newDeployment(name, ns.Name, replicas)} + tester.deployment.Spec.Strategy.RollingUpdate.MaxSurge = intOrStrP(3) + tester.deployment.Spec.Strategy.RollingUpdate.MaxUnavailable = intOrStrP(2) + tester.deployment, err = c.ExtensionsV1beta1().Deployments(ns.Name).Create(tester.deployment) + if err != nil { + t.Fatalf("failed to create deployment %q: %v", name, err) + } + if err = tester.waitForDeploymentRevisionAndImage("1", fakeImage); err != nil { + t.Fatal(err) + } + if err = tester.waitForDeploymentCompleteAndMarkPodsReady(); err != nil { + t.Fatalf("deployment %q failed to complete: %v", name, err) + } + + // Record current replicaset before starting new rollout + firstRS, err := tester.expectNewReplicaSet() + if err != nil { + t.Fatal(err) + } + + // Update the deployment with another new image but do not mark the pods as ready to block new replicaset + fakeImage2 := "fakeimage2" + tester.deployment, err = tester.updateDeployment(func(update *v1beta1.Deployment) { + update.Spec.Template.Spec.Containers[0].Image = fakeImage2 + }) + if err != nil { + t.Fatalf("failed updating deployment %q: %v", name, err) + } + if err = tester.waitForDeploymentRevisionAndImage("2", fakeImage2); err != nil { + t.Fatal(err) + } + + // Verify the deployment has minimum available replicas after 2nd rollout + tester.deployment, err = c.ExtensionsV1beta1().Deployments(ns.Name).Get(name, metav1.GetOptions{}) + if err != nil { + t.Fatalf("failed to get deployment %q: %v", name, err) + } + minAvailableReplicas := deploymentutil.MinAvailable(tester.deployment) + if tester.deployment.Status.AvailableReplicas < minAvailableReplicas { + t.Fatalf("deployment %q does not have minimum number of available replicas after 2nd rollout", name) + } + + // Wait for old replicaset of 1st rollout to have desired replicas + firstRS, err = c.ExtensionsV1beta1().ReplicaSets(ns.Name).Get(firstRS.Name, metav1.GetOptions{}) + if err != nil { + t.Fatalf("failed to get replicaset %q: %v", firstRS.Name, err) + } + if err = tester.waitRSStable(firstRS); err != nil { + t.Fatal(err) + } + + // Wait for new replicaset of 2nd rollout to have desired replicas + secondRS, err := tester.expectNewReplicaSet() + if err != nil { + t.Fatal(err) + } + if err = tester.waitRSStable(secondRS); err != nil { + t.Fatal(err) + } + + // Scale up the deployment and update its image to another new image simultaneously (this time marks all pods as ready) + newReplicas := int32(20) + fakeImage3 := "fakeimage3" + tester.deployment, err = tester.updateDeployment(func(update *v1beta1.Deployment) { + update.Spec.Replicas = &newReplicas + update.Spec.Template.Spec.Containers[0].Image = fakeImage3 + }) + if err != nil { + t.Fatalf("failed updating deployment %q: %v", name, err) + } + if err = tester.waitForDeploymentRevisionAndImage("3", fakeImage3); err != nil { + t.Fatal(err) + } + if err = tester.waitForDeploymentCompleteAndMarkPodsReady(); err != nil { + t.Fatalf("deployment %q failed to 
complete: %v", name, err) + } + + // Verify every replicaset has correct desiredReplicas annotation after 3rd rollout + thirdRS, err := deploymentutil.GetNewReplicaSet(tester.deployment, c.ExtensionsV1beta1()) + if err != nil { + t.Fatalf("failed getting new revision 3 replicaset for deployment %q: %v", name, err) + } + rss := []*v1beta1.ReplicaSet{firstRS, secondRS, thirdRS} + for _, curRS := range rss { + curRS, err = c.ExtensionsV1beta1().ReplicaSets(ns.Name).Get(curRS.Name, metav1.GetOptions{}) + if err != nil { + t.Fatalf("failed to get replicaset when checking desired replicas annotation: %v", err) + } + desired, ok := deploymentutil.GetDesiredReplicasAnnotation(curRS) + if !ok { + t.Fatalf("failed to retrieve desiredReplicas annotation for replicaset %q", curRS.Name) + } + if desired != *(tester.deployment.Spec.Replicas) { + t.Fatalf("unexpected desiredReplicas annotation for replicaset %q: expected %d, got %d", curRS.Name, *(tester.deployment.Spec.Replicas), desired) + } + } + + // Update the deployment with another new image but do not mark the pods as ready to block new replicaset + fakeImage4 := "fakeimage4" + tester.deployment, err = tester.updateDeployment(func(update *v1beta1.Deployment) { + update.Spec.Template.Spec.Containers[0].Image = fakeImage4 + }) + if err != nil { + t.Fatalf("failed updating deployment %q: %v", name, err) + } + if err = tester.waitForDeploymentRevisionAndImage("4", fakeImage4); err != nil { + t.Fatal(err) + } + + // Verify the deployment has minimum available replicas after 4th rollout + tester.deployment, err = c.ExtensionsV1beta1().Deployments(ns.Name).Get(name, metav1.GetOptions{}) + if err != nil { + t.Fatalf("failed to get deployment %q: %v", name, err) + } + minAvailableReplicas = deploymentutil.MinAvailable(tester.deployment) + if tester.deployment.Status.AvailableReplicas < minAvailableReplicas { + t.Fatalf("deployment %q does not have minimum number of available replicas after 4th rollout", name) + } + + // Wait for old replicaset of 3rd rollout to have desired replicas + thirdRS, err = c.ExtensionsV1beta1().ReplicaSets(ns.Name).Get(thirdRS.Name, metav1.GetOptions{}) + if err != nil { + t.Fatalf("failed to get replicaset %q: %v", thirdRS.Name, err) + } + if err = tester.waitRSStable(thirdRS); err != nil { + t.Fatal(err) + } + + // Wait for new replicaset of 4th rollout to have desired replicas + fourthRS, err := tester.expectNewReplicaSet() + if err != nil { + t.Fatal(err) + } + if err = tester.waitRSStable(fourthRS); err != nil { + t.Fatal(err) + } + + // Scale down the deployment and update its image to another new image simultaneously (this time marks all pods as ready) + newReplicas = int32(5) + fakeImage5 := "fakeimage5" + tester.deployment, err = tester.updateDeployment(func(update *v1beta1.Deployment) { + update.Spec.Replicas = &newReplicas + update.Spec.Template.Spec.Containers[0].Image = fakeImage5 + }) + if err != nil { + t.Fatalf("failed updating deployment %q: %v", name, err) + } + if err = tester.waitForDeploymentRevisionAndImage("5", fakeImage5); err != nil { + t.Fatal(err) + } + if err = tester.waitForDeploymentCompleteAndMarkPodsReady(); err != nil { + t.Fatalf("deployment %q failed to complete: %v", name, err) + } + + // Verify every replicaset has correct desiredReplicas annotation after 5th rollout + fifthRS, err := deploymentutil.GetNewReplicaSet(tester.deployment, c.ExtensionsV1beta1()) + if err != nil { + t.Fatalf("failed getting new revision 5 replicaset for deployment %q: %v", name, err) + } + rss = 
[]*v1beta1.ReplicaSet{thirdRS, fourthRS, fifthRS} + for _, curRS := range rss { + curRS, err = c.ExtensionsV1beta1().ReplicaSets(ns.Name).Get(curRS.Name, metav1.GetOptions{}) + if err != nil { + t.Fatalf("failed to get replicaset when checking desired replicas annotation: %v", err) + } + desired, ok := deploymentutil.GetDesiredReplicasAnnotation(curRS) + if !ok { + t.Fatalf("failed to retrieve desiredReplicas annotation for replicaset %q", curRS.Name) + } + if desired != *(tester.deployment.Spec.Replicas) { + t.Fatalf("unexpected desiredReplicas annotation for replicaset %q: expected %d, got %d", curRS.Name, *(tester.deployment.Spec.Replicas), desired) + } + } +} diff --git a/test/integration/deployment/util.go b/test/integration/deployment/util.go index cfc33fa6ab..dc196c92a3 100644 --- a/test/integration/deployment/util.go +++ b/test/integration/deployment/util.go @@ -26,6 +26,7 @@ import ( "k8s.io/api/core/v1" "k8s.io/api/extensions/v1beta1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/informers" clientset "k8s.io/client-go/kubernetes" @@ -80,7 +81,8 @@ func newDeployment(name, ns string, replicas int32) *v1beta1.Deployment { Replicas: &replicas, Selector: &metav1.LabelSelector{MatchLabels: testLabels()}, Strategy: v1beta1.DeploymentStrategy{ - Type: v1beta1.RollingUpdateDeploymentStrategyType, + Type: v1beta1.RollingUpdateDeploymentStrategyType, + RollingUpdate: new(v1beta1.RollingUpdateDeployment), }, Template: v1.PodTemplateSpec{ ObjectMeta: metav1.ObjectMeta{ @@ -212,6 +214,11 @@ func markPodReady(c clientset.Interface, ns string, pod *v1.Pod) error { return err } +func intOrStrP(num int) *intstr.IntOrString { + intstr := intstr.FromInt(num) + return &intstr +} + // markUpdatedPodsReady manually marks updated Deployment pods status to ready, // until the deployment is complete func (d *deploymentTester) markUpdatedPodsReady(wg *sync.WaitGroup) { @@ -405,3 +412,7 @@ func (d *deploymentTester) listUpdatedPods() ([]v1.Pod, error) { } return ownedPods, nil } + +func (d *deploymentTester) waitRSStable(replicaset *v1beta1.ReplicaSet) error { + return testutil.WaitRSStable(d.t, d.c, replicaset, pollInterval, pollTimeout) +} diff --git a/test/integration/replicaset/BUILD b/test/integration/replicaset/BUILD index ea194022e2..58575e50ea 100644 --- a/test/integration/replicaset/BUILD +++ b/test/integration/replicaset/BUILD @@ -18,6 +18,7 @@ go_test( "//pkg/api/v1/pod:go_default_library", "//pkg/controller/replicaset:go_default_library", "//test/integration/framework:go_default_library", + "//test/utils:go_default_library", "//vendor/k8s.io/api/core/v1:go_default_library", "//vendor/k8s.io/api/extensions/v1beta1:go_default_library", "//vendor/k8s.io/apimachinery/pkg/api/errors:go_default_library", diff --git a/test/integration/replicaset/replicaset_test.go b/test/integration/replicaset/replicaset_test.go index 7ad3489c7c..91ae5ce7ff 100644 --- a/test/integration/replicaset/replicaset_test.go +++ b/test/integration/replicaset/replicaset_test.go @@ -41,6 +41,7 @@ import ( podutil "k8s.io/kubernetes/pkg/api/v1/pod" "k8s.io/kubernetes/pkg/controller/replicaset" "k8s.io/kubernetes/test/integration/framework" + testutil "k8s.io/kubernetes/test/utils" ) const ( @@ -217,15 +218,8 @@ func createRSsPods(t *testing.T, clientSet clientset.Interface, rss []*v1beta1.R // Verify .Status.Replicas is equal to .Spec.Replicas func waitRSStable(t *testing.T, clientSet clientset.Interface, rs *v1beta1.ReplicaSet) { - 
rsClient := clientSet.Extensions().ReplicaSets(rs.Namespace) - if err := wait.PollImmediate(interval, timeout, func() (bool, error) { - newRS, err := rsClient.Get(rs.Name, metav1.GetOptions{}) - if err != nil { - return false, err - } - return newRS.Status.Replicas == *rs.Spec.Replicas, nil - }); err != nil { - t.Fatalf("Failed to verify .Status.Replicas is equal to .Spec.Replicas for rs %s: %v", rs.Name, err) + if err := testutil.WaitRSStable(t, clientSet, rs, interval, timeout); err != nil { + t.Fatal(err) } } diff --git a/test/utils/replicaset.go b/test/utils/replicaset.go index 9a4b6d05b9..adf30c35d0 100644 --- a/test/utils/replicaset.go +++ b/test/utils/replicaset.go @@ -18,6 +18,7 @@ package utils import ( "fmt" + "testing" "time" extensions "k8s.io/api/extensions/v1beta1" @@ -50,3 +51,18 @@ func UpdateReplicaSetWithRetries(c clientset.Interface, namespace, name string, } return rs, pollErr } + +// Verify .Status.Replicas is equal to .Spec.Replicas +func WaitRSStable(t *testing.T, clientSet clientset.Interface, rs *extensions.ReplicaSet, pollInterval, pollTimeout time.Duration) error { + desiredGeneration := rs.Generation + if err := wait.PollImmediate(pollInterval, pollTimeout, func() (bool, error) { + newRS, err := clientSet.ExtensionsV1beta1().ReplicaSets(rs.Namespace).Get(rs.Name, metav1.GetOptions{}) + if err != nil { + return false, err + } + return newRS.Status.ObservedGeneration >= desiredGeneration && newRS.Status.Replicas == *rs.Spec.Replicas, nil + }); err != nil { + return fmt.Errorf("failed to verify .Status.Replicas is equal to .Spec.Replicas for replicaset %q: %v", rs.Name, err) + } + return nil +} From ee5b040e31217eec982341138ebcd92c8248fb17 Mon Sep 17 00:00:00 2001 From: Michael Taufen Date: Fri, 17 Nov 2017 10:49:24 -0800 Subject: [PATCH 32/33] Add kubeletconfig round trip test --- pkg/kubelet/apis/kubeletconfig/BUILD | 1 + pkg/kubelet/apis/kubeletconfig/fuzzer/BUILD | 32 ++++++ .../apis/kubeletconfig/fuzzer/fuzzer.go | 100 ++++++++++++++++++ pkg/kubelet/apis/kubeletconfig/scheme/BUILD | 13 ++- .../apis/kubeletconfig/scheme/scheme_test.go | 32 ++++++ .../apis/kubeletconfig/v1alpha1/defaults.go | 12 +-- 6 files changed, 183 insertions(+), 7 deletions(-) create mode 100644 pkg/kubelet/apis/kubeletconfig/fuzzer/BUILD create mode 100644 pkg/kubelet/apis/kubeletconfig/fuzzer/fuzzer.go create mode 100644 pkg/kubelet/apis/kubeletconfig/scheme/scheme_test.go diff --git a/pkg/kubelet/apis/kubeletconfig/BUILD b/pkg/kubelet/apis/kubeletconfig/BUILD index 4046a1a9e7..c62439d626 100644 --- a/pkg/kubelet/apis/kubeletconfig/BUILD +++ b/pkg/kubelet/apis/kubeletconfig/BUILD @@ -34,6 +34,7 @@ filegroup( name = "all-srcs", srcs = [ ":package-srcs", + "//pkg/kubelet/apis/kubeletconfig/fuzzer:all-srcs", "//pkg/kubelet/apis/kubeletconfig/scheme:all-srcs", "//pkg/kubelet/apis/kubeletconfig/v1alpha1:all-srcs", "//pkg/kubelet/apis/kubeletconfig/validation:all-srcs", diff --git a/pkg/kubelet/apis/kubeletconfig/fuzzer/BUILD b/pkg/kubelet/apis/kubeletconfig/fuzzer/BUILD new file mode 100644 index 0000000000..2790330cff --- /dev/null +++ b/pkg/kubelet/apis/kubeletconfig/fuzzer/BUILD @@ -0,0 +1,32 @@ +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "go_default_library", + srcs = ["fuzzer.go"], + importpath = "k8s.io/kubernetes/pkg/kubelet/apis/kubeletconfig/fuzzer", + visibility = ["//visibility:public"], + deps = [ + "//pkg/kubelet/apis/kubeletconfig:go_default_library", + "//pkg/kubelet/apis/kubeletconfig/v1alpha1:go_default_library", + 
"//pkg/kubelet/qos:go_default_library", + "//pkg/kubelet/types:go_default_library", + "//pkg/master/ports:go_default_library", + "//vendor/github.com/google/gofuzz:go_default_library", + "//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library", + "//vendor/k8s.io/apimachinery/pkg/runtime/serializer:go_default_library", + ], +) + +filegroup( + name = "package-srcs", + srcs = glob(["**"]), + tags = ["automanaged"], + visibility = ["//visibility:private"], +) + +filegroup( + name = "all-srcs", + srcs = [":package-srcs"], + tags = ["automanaged"], + visibility = ["//visibility:public"], +) diff --git a/pkg/kubelet/apis/kubeletconfig/fuzzer/fuzzer.go b/pkg/kubelet/apis/kubeletconfig/fuzzer/fuzzer.go new file mode 100644 index 0000000000..8fb0ca7ca5 --- /dev/null +++ b/pkg/kubelet/apis/kubeletconfig/fuzzer/fuzzer.go @@ -0,0 +1,100 @@ +/* +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package fuzzer + +import ( + "time" + + "github.com/google/gofuzz" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + runtimeserializer "k8s.io/apimachinery/pkg/runtime/serializer" + "k8s.io/kubernetes/pkg/kubelet/apis/kubeletconfig" + "k8s.io/kubernetes/pkg/kubelet/apis/kubeletconfig/v1alpha1" + "k8s.io/kubernetes/pkg/kubelet/qos" + kubetypes "k8s.io/kubernetes/pkg/kubelet/types" + "k8s.io/kubernetes/pkg/master/ports" +) + +// Funcs returns the fuzzer functions for the kubeletconfig apis. 
+func Funcs(codecs runtimeserializer.CodecFactory) []interface{} { + return []interface{}{ + // provide non-empty values for fields with defaults, so the defaulter doesn't change values during round-trip + func(obj *kubeletconfig.KubeletConfiguration, c fuzz.Continue) { + c.FuzzNoCustom(obj) + obj.ConfigTrialDuration = &metav1.Duration{Duration: 10 * time.Minute} + obj.Authentication.Anonymous.Enabled = true + obj.Authentication.Webhook.Enabled = false + obj.Authentication.Webhook.CacheTTL = metav1.Duration{Duration: 2 * time.Minute} + obj.Authorization.Mode = kubeletconfig.KubeletAuthorizationModeAlwaysAllow + obj.Authorization.Webhook.CacheAuthorizedTTL = metav1.Duration{Duration: 5 * time.Minute} + obj.Authorization.Webhook.CacheUnauthorizedTTL = metav1.Duration{Duration: 30 * time.Second} + obj.Address = "0.0.0.0" + obj.CAdvisorPort = 4194 + obj.VolumeStatsAggPeriod = metav1.Duration{Duration: time.Minute} + obj.RuntimeRequestTimeout = metav1.Duration{Duration: 2 * time.Minute} + obj.CPUCFSQuota = true + obj.EventBurst = 10 + obj.EventRecordQPS = 5 + obj.EnableControllerAttachDetach = true + obj.EnableDebuggingHandlers = true + obj.EnableServer = true + obj.FileCheckFrequency = metav1.Duration{Duration: 20 * time.Second} + obj.HealthzBindAddress = "127.0.0.1" + obj.HealthzPort = 10248 + obj.HostNetworkSources = []string{kubetypes.AllSource} + obj.HostPIDSources = []string{kubetypes.AllSource} + obj.HostIPCSources = []string{kubetypes.AllSource} + obj.HTTPCheckFrequency = metav1.Duration{Duration: 20 * time.Second} + obj.ImageMinimumGCAge = metav1.Duration{Duration: 2 * time.Minute} + obj.ImageGCHighThresholdPercent = 85 + obj.ImageGCLowThresholdPercent = 80 + obj.MaxOpenFiles = 1000000 + obj.MaxPods = 110 + obj.NodeStatusUpdateFrequency = metav1.Duration{Duration: 10 * time.Second} + obj.CPUManagerPolicy = "none" + obj.CPUManagerReconcilePeriod = obj.NodeStatusUpdateFrequency + obj.OOMScoreAdj = int32(qos.KubeletOOMScoreAdj) + obj.Port = ports.KubeletPort + obj.ReadOnlyPort = ports.KubeletReadOnlyPort + obj.RegistryBurst = 10 + obj.RegistryPullQPS = 5 + obj.ResolverConfig = kubetypes.ResolvConfDefault + obj.SerializeImagePulls = true + obj.StreamingConnectionIdleTimeout = metav1.Duration{Duration: 4 * time.Hour} + obj.SyncFrequency = metav1.Duration{Duration: 1 * time.Minute} + obj.ContentType = "application/vnd.kubernetes.protobuf" + obj.KubeAPIQPS = 5 + obj.KubeAPIBurst = 10 + obj.HairpinMode = v1alpha1.PromiscuousBridge + obj.EvictionHard = map[string]string{ + "memory.available": "100Mi", + "nodefs.available": "10%", + "nodefs.inodesFree": "5%", + "imagefs.available": "15%", + } + obj.EvictionPressureTransitionPeriod = metav1.Duration{Duration: 5 * time.Minute} + obj.MakeIPTablesUtilChains = true + obj.IPTablesMasqueradeBit = v1alpha1.DefaultIPTablesMasqueradeBit + obj.IPTablesDropBit = v1alpha1.DefaultIPTablesDropBit + obj.CgroupsPerQOS = true + obj.CgroupDriver = "cgroupfs" + obj.EnforceNodeAllocatable = v1alpha1.DefaultNodeAllocatableEnforcement + obj.ManifestURLHeader = make(map[string][]string) + }, + } +} diff --git a/pkg/kubelet/apis/kubeletconfig/scheme/BUILD b/pkg/kubelet/apis/kubeletconfig/scheme/BUILD index ae301abded..9bc425ac6b 100644 --- a/pkg/kubelet/apis/kubeletconfig/scheme/BUILD +++ b/pkg/kubelet/apis/kubeletconfig/scheme/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") go_library( name = "go_default_library", @@ -26,3 +26,14 @@ filegroup( tags = ["automanaged"], visibility 
= ["//visibility:public"], ) + +go_test( + name = "go_default_test", + srcs = ["scheme_test.go"], + importpath = "k8s.io/kubernetes/pkg/kubelet/apis/kubeletconfig/scheme", + library = ":go_default_library", + deps = [ + "//pkg/kubelet/apis/kubeletconfig/fuzzer:go_default_library", + "//vendor/k8s.io/apimachinery/pkg/api/testing/roundtrip:go_default_library", + ], +) diff --git a/pkg/kubelet/apis/kubeletconfig/scheme/scheme_test.go b/pkg/kubelet/apis/kubeletconfig/scheme/scheme_test.go new file mode 100644 index 0000000000..0f59310709 --- /dev/null +++ b/pkg/kubelet/apis/kubeletconfig/scheme/scheme_test.go @@ -0,0 +1,32 @@ +/* +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package scheme + +import ( + "testing" + + "k8s.io/apimachinery/pkg/api/testing/roundtrip" + "k8s.io/kubernetes/pkg/kubelet/apis/kubeletconfig/fuzzer" +) + +func TestRoundTripTypes(t *testing.T) { + scheme, _, err := NewSchemeAndCodecs() + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + roundtrip.RoundTripTestForScheme(t, scheme, fuzzer.Funcs) +} diff --git a/pkg/kubelet/apis/kubeletconfig/v1alpha1/defaults.go b/pkg/kubelet/apis/kubeletconfig/v1alpha1/defaults.go index e14cdf3fff..90ac66813b 100644 --- a/pkg/kubelet/apis/kubeletconfig/v1alpha1/defaults.go +++ b/pkg/kubelet/apis/kubeletconfig/v1alpha1/defaults.go @@ -36,14 +36,14 @@ const ( // More details here: https://github.com/kubernetes/kubernetes/issues/50986 AutoDetectCloudProvider = "auto-detect" - defaultIPTablesMasqueradeBit = 14 - defaultIPTablesDropBit = 15 + DefaultIPTablesMasqueradeBit = 14 + DefaultIPTablesDropBit = 15 ) var ( zeroDuration = metav1.Duration{} // Refer to [Node Allocatable](https://git.k8s.io/community/contributors/design-proposals/node/node-allocatable.md) doc for more information. 
- defaultNodeAllocatableEnforcement = []string{"pods"} + DefaultNodeAllocatableEnforcement = []string{"pods"} ) func addDefaultingFuncs(scheme *kruntime.Scheme) error { @@ -210,11 +210,11 @@ func SetDefaults_KubeletConfiguration(obj *KubeletConfiguration) { obj.MakeIPTablesUtilChains = boolVar(true) } if obj.IPTablesMasqueradeBit == nil { - temp := int32(defaultIPTablesMasqueradeBit) + temp := int32(DefaultIPTablesMasqueradeBit) obj.IPTablesMasqueradeBit = &temp } if obj.IPTablesDropBit == nil { - temp := int32(defaultIPTablesDropBit) + temp := int32(DefaultIPTablesDropBit) obj.IPTablesDropBit = &temp } if obj.CgroupsPerQOS == nil { @@ -225,7 +225,7 @@ func SetDefaults_KubeletConfiguration(obj *KubeletConfiguration) { obj.CgroupDriver = "cgroupfs" } if obj.EnforceNodeAllocatable == nil { - obj.EnforceNodeAllocatable = defaultNodeAllocatableEnforcement + obj.EnforceNodeAllocatable = DefaultNodeAllocatableEnforcement } } From 334a0f0620949ce443e68f90a82610b9e59a6d0f Mon Sep 17 00:00:00 2001 From: Mitsuhiro Tanino Date: Sat, 4 Nov 2017 22:31:29 -0400 Subject: [PATCH 33/33] Block Volume: cmdline printer update --- pkg/printers/internalversion/describe.go | 29 ++ pkg/printers/internalversion/describe_test.go | 294 ++++++++++++++---- 2 files changed, 267 insertions(+), 56 deletions(-) diff --git a/pkg/printers/internalversion/describe.go b/pkg/printers/internalversion/describe.go index b8f5a6d82a..60d4860300 100644 --- a/pkg/printers/internalversion/describe.go +++ b/pkg/printers/internalversion/describe.go @@ -1136,6 +1136,9 @@ func describePersistentVolume(pv *api.PersistentVolume, events *api.EventList) ( } w.Write(LEVEL_0, "Reclaim Policy:\t%v\n", pv.Spec.PersistentVolumeReclaimPolicy) w.Write(LEVEL_0, "Access Modes:\t%s\n", helper.GetAccessModesAsString(pv.Spec.AccessModes)) + if pv.Spec.VolumeMode != nil { + w.Write(LEVEL_0, "VolumeMode:\t%v\n", *pv.Spec.VolumeMode) + } storage := pv.Spec.Capacity[api.ResourceStorage] w.Write(LEVEL_0, "Capacity:\t%s\n", storage.String()) w.Write(LEVEL_0, "Message:\t%s\n", pv.Status.Message) @@ -1235,6 +1238,9 @@ func describePersistentVolumeClaim(pvc *api.PersistentVolumeClaim, events *api.E } w.Write(LEVEL_0, "Capacity:\t%s\n", capacity) w.Write(LEVEL_0, "Access Modes:\t%s\n", accessModes) + if pvc.Spec.VolumeMode != nil { + w.Write(LEVEL_0, "VolumeMode:\t%v\n", *pvc.Spec.VolumeMode) + } if events != nil { DescribeEvents(events, w) } @@ -1365,6 +1371,7 @@ func describeContainerProbe(container api.Container, w PrefixWriter) { } func describeContainerVolumes(container api.Container, w PrefixWriter) { + // Show volumeMounts none := "" if len(container.VolumeMounts) == 0 { none = "\t" @@ -1383,6 +1390,14 @@ func describeContainerVolumes(container api.Container, w PrefixWriter) { } w.Write(LEVEL_3, "%s from %s (%s)\n", mount.MountPath, mount.Name, strings.Join(flags, ",")) } + // Show volumeDevices if exists + if len(container.VolumeDevices) > 0 { + w.Write(LEVEL_2, "Devices:%s\n", none) + sort.Sort(SortableVolumeDevices(container.VolumeDevices)) + for _, device := range container.VolumeDevices { + w.Write(LEVEL_3, "%s from %s\n", device.DevicePath, device.Name) + } + } } func describeContainerEnvVars(container api.Container, resolverFn EnvVarResolverFunc, w PrefixWriter) { @@ -3803,6 +3818,20 @@ func (list SortableVolumeMounts) Less(i, j int) bool { return list[i].MountPath < list[j].MountPath } +type SortableVolumeDevices []api.VolumeDevice + +func (list SortableVolumeDevices) Len() int { + return len(list) +} + +func (list SortableVolumeDevices) Swap(i, j 
int) { + list[i], list[j] = list[j], list[i] +} + +func (list SortableVolumeDevices) Less(i, j int) bool { + return list[i].DevicePath < list[j].DevicePath +} + // TODO: get rid of this and plumb the caller correctly func versionedExtensionsClientV1beta1(internalClient clientset.Interface) clientextensionsv1beta1.ExtensionsV1beta1Interface { if internalClient == nil { diff --git a/pkg/printers/internalversion/describe_test.go b/pkg/printers/internalversion/describe_test.go index 20891b2aae..0896dc0bc3 100644 --- a/pkg/printers/internalversion/describe_test.go +++ b/pkg/printers/internalversion/describe_test.go @@ -634,6 +634,50 @@ func TestDescribeContainers(t *testing.T) { }, expectedElements: []string{"cpu", "1k", "memory", "4G", "storage", "20G"}, }, + // volumeMounts read/write + { + container: api.Container{ + Name: "test", + Image: "image", + VolumeMounts: []api.VolumeMount{ + { + Name: "mounted-volume", + MountPath: "/opt/", + }, + }, + }, + expectedElements: []string{"mounted-volume", "/opt/", "(rw)"}, + }, + // volumeMounts readonly + { + container: api.Container{ + Name: "test", + Image: "image", + VolumeMounts: []api.VolumeMount{ + { + Name: "mounted-volume", + MountPath: "/opt/", + ReadOnly: true, + }, + }, + }, + expectedElements: []string{"Mounts", "mounted-volume", "/opt/", "(ro)"}, + }, + + // volumeDevices + { + container: api.Container{ + Name: "test", + Image: "image", + VolumeDevices: []api.VolumeDevice{ + { + Name: "volume-device", + DevicePath: "/dev/xvda", + }, + }, + }, + expectedElements: []string{"Devices", "volume-device", "/dev/xvda"}, + }, } for i, testCase := range testCases { @@ -815,99 +859,237 @@ func TestGetPodsTotalRequests(t *testing.T) { } func TestPersistentVolumeDescriber(t *testing.T) { - tests := map[string]*api.PersistentVolume{ - - "hostpath": { - ObjectMeta: metav1.ObjectMeta{Name: "bar"}, - Spec: api.PersistentVolumeSpec{ - PersistentVolumeSource: api.PersistentVolumeSource{ - HostPath: &api.HostPathVolumeSource{Type: new(api.HostPathType)}, + block := api.PersistentVolumeBlock + file := api.PersistentVolumeFilesystem + testCases := []struct { + plugin string + pv *api.PersistentVolume + expectedElements []string + unexpectedElements []string + }{ + { + plugin: "hostpath", + pv: &api.PersistentVolume{ + ObjectMeta: metav1.ObjectMeta{Name: "bar"}, + Spec: api.PersistentVolumeSpec{ + PersistentVolumeSource: api.PersistentVolumeSource{ + HostPath: &api.HostPathVolumeSource{Type: new(api.HostPathType)}, + }, }, }, + unexpectedElements: []string{"VolumeMode", "Filesystem"}, }, - "gce": { - ObjectMeta: metav1.ObjectMeta{Name: "bar"}, - Spec: api.PersistentVolumeSpec{ - PersistentVolumeSource: api.PersistentVolumeSource{ - GCEPersistentDisk: &api.GCEPersistentDiskVolumeSource{}, + { + plugin: "gce", + pv: &api.PersistentVolume{ + ObjectMeta: metav1.ObjectMeta{Name: "bar"}, + Spec: api.PersistentVolumeSpec{ + PersistentVolumeSource: api.PersistentVolumeSource{ + GCEPersistentDisk: &api.GCEPersistentDiskVolumeSource{}, + }, + VolumeMode: &file, }, }, + expectedElements: []string{"VolumeMode", "Filesystem"}, }, - "ebs": { - ObjectMeta: metav1.ObjectMeta{Name: "bar"}, - Spec: api.PersistentVolumeSpec{ - PersistentVolumeSource: api.PersistentVolumeSource{ - AWSElasticBlockStore: &api.AWSElasticBlockStoreVolumeSource{}, + { + plugin: "ebs", + pv: &api.PersistentVolume{ + ObjectMeta: metav1.ObjectMeta{Name: "bar"}, + Spec: api.PersistentVolumeSpec{ + PersistentVolumeSource: api.PersistentVolumeSource{ + AWSElasticBlockStore: 
&api.AWSElasticBlockStoreVolumeSource{}, + }, }, }, + unexpectedElements: []string{"VolumeMode", "Filesystem"}, }, - "nfs": { - ObjectMeta: metav1.ObjectMeta{Name: "bar"}, - Spec: api.PersistentVolumeSpec{ - PersistentVolumeSource: api.PersistentVolumeSource{ - NFS: &api.NFSVolumeSource{}, + { + plugin: "nfs", + pv: &api.PersistentVolume{ + ObjectMeta: metav1.ObjectMeta{Name: "bar"}, + Spec: api.PersistentVolumeSpec{ + PersistentVolumeSource: api.PersistentVolumeSource{ + NFS: &api.NFSVolumeSource{}, + }, }, }, + unexpectedElements: []string{"VolumeMode", "Filesystem"}, }, - "iscsi": { - ObjectMeta: metav1.ObjectMeta{Name: "bar"}, - Spec: api.PersistentVolumeSpec{ - PersistentVolumeSource: api.PersistentVolumeSource{ - ISCSI: &api.ISCSIPersistentVolumeSource{}, + { + plugin: "iscsi", + pv: &api.PersistentVolume{ + ObjectMeta: metav1.ObjectMeta{Name: "bar"}, + Spec: api.PersistentVolumeSpec{ + PersistentVolumeSource: api.PersistentVolumeSource{ + ISCSI: &api.ISCSIPersistentVolumeSource{}, + }, + VolumeMode: &block, }, }, + expectedElements: []string{"VolumeMode", "Block"}, }, - "gluster": { - ObjectMeta: metav1.ObjectMeta{Name: "bar"}, - Spec: api.PersistentVolumeSpec{ - PersistentVolumeSource: api.PersistentVolumeSource{ - Glusterfs: &api.GlusterfsVolumeSource{}, + { + plugin: "gluster", + pv: &api.PersistentVolume{ + ObjectMeta: metav1.ObjectMeta{Name: "bar"}, + Spec: api.PersistentVolumeSpec{ + PersistentVolumeSource: api.PersistentVolumeSource{ + Glusterfs: &api.GlusterfsVolumeSource{}, + }, }, }, + unexpectedElements: []string{"VolumeMode", "Filesystem"}, }, - "rbd": { - ObjectMeta: metav1.ObjectMeta{Name: "bar"}, - Spec: api.PersistentVolumeSpec{ - PersistentVolumeSource: api.PersistentVolumeSource{ - RBD: &api.RBDPersistentVolumeSource{}, + { + plugin: "rbd", + pv: &api.PersistentVolume{ + ObjectMeta: metav1.ObjectMeta{Name: "bar"}, + Spec: api.PersistentVolumeSpec{ + PersistentVolumeSource: api.PersistentVolumeSource{ + RBD: &api.RBDPersistentVolumeSource{}, + }, }, }, + unexpectedElements: []string{"VolumeMode", "Filesystem"}, }, - "quobyte": { - ObjectMeta: metav1.ObjectMeta{Name: "bar"}, - Spec: api.PersistentVolumeSpec{ - PersistentVolumeSource: api.PersistentVolumeSource{ - Quobyte: &api.QuobyteVolumeSource{}, + { + plugin: "quobyte", + pv: &api.PersistentVolume{ + ObjectMeta: metav1.ObjectMeta{Name: "bar"}, + Spec: api.PersistentVolumeSpec{ + PersistentVolumeSource: api.PersistentVolumeSource{ + Quobyte: &api.QuobyteVolumeSource{}, + }, }, }, + unexpectedElements: []string{"VolumeMode", "Filesystem"}, }, - "cinder": { - ObjectMeta: metav1.ObjectMeta{Name: "bar"}, - Spec: api.PersistentVolumeSpec{ - PersistentVolumeSource: api.PersistentVolumeSource{ - Cinder: &api.CinderVolumeSource{}, + { + plugin: "cinder", + pv: &api.PersistentVolume{ + ObjectMeta: metav1.ObjectMeta{Name: "bar"}, + Spec: api.PersistentVolumeSpec{ + PersistentVolumeSource: api.PersistentVolumeSource{ + Cinder: &api.CinderVolumeSource{}, + }, }, }, + unexpectedElements: []string{"VolumeMode", "Filesystem"}, }, - "fc": { - ObjectMeta: metav1.ObjectMeta{Name: "bar"}, - Spec: api.PersistentVolumeSpec{ - PersistentVolumeSource: api.PersistentVolumeSource{ - FC: &api.FCVolumeSource{}, + { + plugin: "fc", + pv: &api.PersistentVolume{ + ObjectMeta: metav1.ObjectMeta{Name: "bar"}, + Spec: api.PersistentVolumeSpec{ + PersistentVolumeSource: api.PersistentVolumeSource{ + FC: &api.FCVolumeSource{}, + }, + VolumeMode: &block, }, }, + expectedElements: []string{"VolumeMode", "Block"}, }, } - for name, pv := range tests { 
- fake := fake.NewSimpleClientset(pv) + for _, test := range testCases { + fake := fake.NewSimpleClientset(test.pv) c := PersistentVolumeDescriber{fake} str, err := c.Describe("foo", "bar", printers.DescriberSettings{ShowEvents: true}) if err != nil { - t.Errorf("Unexpected error for test %s: %v", name, err) + t.Errorf("Unexpected error for test %s: %v", test.plugin, err) } if str == "" { - t.Errorf("Unexpected empty string for test %s. Expected PV Describer output", name) + t.Errorf("Unexpected empty string for test %s. Expected PV Describer output", test.plugin) + } + for _, expected := range test.expectedElements { + if !strings.Contains(str, expected) { + t.Errorf("expected to find %q in output: %q", expected, str) + } + } + for _, unexpected := range test.unexpectedElements { + if strings.Contains(str, unexpected) { + t.Errorf("unexpected to find %q in output: %q", unexpected, str) + } + } + } +} + +func TestPersistentVolumeClaimDescriber(t *testing.T) { + block := api.PersistentVolumeBlock + file := api.PersistentVolumeFilesystem + goldClassName := "gold" + testCases := []struct { + name string + pvc *api.PersistentVolumeClaim + expectedElements []string + unexpectedElements []string + }{ + { + name: "default", + pvc: &api.PersistentVolumeClaim{ + ObjectMeta: metav1.ObjectMeta{Namespace: "foo", Name: "bar"}, + Spec: api.PersistentVolumeClaimSpec{ + VolumeName: "volume1", + StorageClassName: &goldClassName, + }, + Status: api.PersistentVolumeClaimStatus{ + Phase: api.ClaimBound, + }, + }, + unexpectedElements: []string{"VolumeMode", "Filesystem"}, + }, + { + name: "filesystem", + pvc: &api.PersistentVolumeClaim{ + ObjectMeta: metav1.ObjectMeta{Namespace: "foo", Name: "bar"}, + Spec: api.PersistentVolumeClaimSpec{ + VolumeName: "volume2", + StorageClassName: &goldClassName, + VolumeMode: &file, + }, + Status: api.PersistentVolumeClaimStatus{ + Phase: api.ClaimBound, + }, + }, + expectedElements: []string{"VolumeMode", "Filesystem"}, + }, + { + name: "block", + pvc: &api.PersistentVolumeClaim{ + ObjectMeta: metav1.ObjectMeta{Namespace: "foo", Name: "bar"}, + Spec: api.PersistentVolumeClaimSpec{ + VolumeName: "volume3", + StorageClassName: &goldClassName, + VolumeMode: &block, + }, + Status: api.PersistentVolumeClaimStatus{ + Phase: api.ClaimBound, + }, + }, + expectedElements: []string{"VolumeMode", "Block"}, + }, + } + + for _, test := range testCases { + fake := fake.NewSimpleClientset(test.pvc) + c := PersistentVolumeClaimDescriber{fake} + str, err := c.Describe("foo", "bar", printers.DescriberSettings{ShowEvents: true}) + if err != nil { + t.Errorf("Unexpected error for test %s: %v", test.name, err) + } + if str == "" { + t.Errorf("Unexpected empty string for test %s. Expected PVC Describer output", test.name) + } + for _, expected := range test.expectedElements { + if !strings.Contains(str, expected) { + t.Errorf("expected to find %q in output: %q", expected, str) + } + } + for _, unexpected := range test.unexpectedElements { + if strings.Contains(str, unexpected) { + t.Errorf("unexpected to find %q in output: %q", unexpected, str) + } } } }
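The Devices section added to the container describer above is driven by the standard sort.Interface pattern through the new SortableVolumeDevices type, so devices print in a stable order keyed on DevicePath. Below is a stand-alone sketch of that pattern; it uses a local volumeDevice struct rather than api.VolumeDevice so it compiles outside the kubernetes tree, and the printed line mirrors the "%s from %s" layout the describer writes under Devices.

package main

import (
	"fmt"
	"sort"
)

// volumeDevice stands in for api.VolumeDevice from the patch above.
type volumeDevice struct {
	Name       string
	DevicePath string
}

// sortableVolumeDevices mirrors SortableVolumeDevices in describe.go:
// devices are ordered by DevicePath so describer output is deterministic.
type sortableVolumeDevices []volumeDevice

func (list sortableVolumeDevices) Len() int      { return len(list) }
func (list sortableVolumeDevices) Swap(i, j int) { list[i], list[j] = list[j], list[i] }
func (list sortableVolumeDevices) Less(i, j int) bool {
	return list[i].DevicePath < list[j].DevicePath
}

func main() {
	devices := sortableVolumeDevices{
		{Name: "data", DevicePath: "/dev/xvdb"},
		{Name: "scratch", DevicePath: "/dev/xvda"},
	}
	sort.Sort(devices)
	for _, d := range devices {
		// Same "path from name" shape the describer prints for each device.
		fmt.Printf("%s from %s\n", d.DevicePath, d.Name)
	}
}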