Merge branch 'master' into upgrade_aliases_branch

pull/6/head
Jing Ai 2017-11-20 21:20:45 -08:00
commit 391a9df925
100 changed files with 7468 additions and 2351 deletions

View File

@ -668,6 +668,7 @@ ENABLE_PROMETHEUS_TO_SD: $(yaml-quote ${ENABLE_PROMETHEUS_TO_SD:-false})
ENABLE_POD_PRIORITY: $(yaml-quote ${ENABLE_POD_PRIORITY:-})
CONTAINER_RUNTIME: $(yaml-quote ${CONTAINER_RUNTIME:-})
CONTAINER_RUNTIME_ENDPOINT: $(yaml-quote ${CONTAINER_RUNTIME_ENDPOINT:-})
NODE_LOCAL_SSDS_EXT: $(yaml-quote ${NODE_LOCAL_SSDS_EXT:-})
LOAD_IMAGE_COMMAND: $(yaml-quote ${LOAD_IMAGE_COMMAND:-})
EOF
if [ -n "${KUBELET_PORT:-}" ]; then

View File

@ -36,6 +36,11 @@ MASTER_ROOT_DISK_SIZE=${MASTER_ROOT_DISK_SIZE:-$(get-master-root-disk-size)}
NODE_DISK_TYPE=${NODE_DISK_TYPE:-pd-standard}
NODE_DISK_SIZE=${NODE_DISK_SIZE:-100GB}
NODE_LOCAL_SSDS=${NODE_LOCAL_SSDS:-0}
# An extension to local SSDs allowing users to specify block/fs and SCSI/NVMe devices.
# The format of this variable is "#,scsi/nvme,block/fs". You can specify multiple
# configurations by separating them with a semicolon, e.g. "2,scsi,fs;1,nvme,block"
# requests 2 SCSI SSDs formatted and mounted as filesystems and 1 NVMe SSD exposed as a block device.
NODE_LOCAL_SSDS_EXT=${NODE_LOCAL_SSDS_EXT:-}
# Accelerators to be attached to each node. Format "type=<accelerator-type>,count=<accelerator-count>"
# More information on available GPUs here - https://cloud.google.com/compute/docs/gpus/
NODE_ACCELERATORS=${NODE_ACCELERATORS:-""}
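# Illustrative usage (the invocation below is an assumption, not part of this change):
#   NODE_LOCAL_SSDS_EXT="2,scsi,fs;1,nvme,block" cluster/kube-up.sh
# requests two SCSI SSDs formatted and mounted as filesystems plus one NVMe SSD
# exposed as a raw block device. NODE_LOCAL_SSDS and NODE_LOCAL_SSDS_EXT are
# mutually exclusive (see the check added to cluster/gce/util.sh below).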

View File

@ -25,6 +25,9 @@ set -o errexit
set -o nounset
set -o pipefail
readonly UUID_MNT_PREFIX="/mnt/disks/by-uuid/google-local-ssds"
readonly UUID_BLOCK_PREFIX="/dev/disk/by-uuid/google-local-ssds"
function setup-os-params {
# Reset core_pattern. On GCI, the default core_pattern pipes the core dumps to
# /sbin/crash_reporter which is more restrictive in saving crash dumps. So for
@ -85,11 +88,85 @@ function create-dirs {
fi
}
# Formats the given device ($1) if needed and mounts it at given mount point
# Gets the total number of disks with the given interface (${1}) and format (${2})
# specified by the user in ${NODE_LOCAL_SSDS_EXT}
function get-local-disk-num() {
local interface="${1}"
local format="${2}"
localdisknum=0
if [[ ! -z "${NODE_LOCAL_SSDS_EXT:-}" ]]; then
IFS=";" read -r -a ssdgroups <<< "${NODE_LOCAL_SSDS_EXT:-}"
for ssdgroup in "${ssdgroups[@]}"; do
IFS="," read -r -a ssdopts <<< "${ssdgroup}"
local opnum="${ssdopts[0]}"
local opinterface="${ssdopts[1]}"
local opformat="${ssdopts[2]}"
if [[ "${opformat,,}" == "${format,,}" && "${opinterface,,}" == "${interface,,}" ]]; then
localdisknum=$((localdisknum+opnum))
fi
done
fi
}
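# Illustrative only (assumed values): with NODE_LOCAL_SSDS_EXT="2,scsi,fs;1,nvme,block",
# `get-local-disk-num "scsi" "fs"` leaves localdisknum=2 and
# `get-local-disk-num "nvme" "block"` leaves localdisknum=1; callers read the
# result from the global ${localdisknum} rather than a return value.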
# Creates a symlink for the given device ($1) so that it may be used as block storage
function safe-block-symlink(){
local device="${1}"
local symdir="${2}"
mkdir -p "${symdir}"
get-or-generate-uuid "${device}"
local myuuid="${retuuid}"
local sym="${symdir}/local-ssd-${myuuid}"
# Do not "mkdir -p ${sym}" as that will cause unintended symlink behavior
ln -s "${device}" "${sym}"
echo "Created a symlink for SSD $ssd at ${sym}"
chmod a+w "${sym}"
}
# Gets a pregenerated UUID from ${ssdmap} if it exists, otherwise generates a new
# UUID and places it inside ${ssdmap}
function get-or-generate-uuid(){
local device="${1}"
local ssdmap="/home/kubernetes/localssdmap.txt"
echo "Generating or getting UUID from ${ssdmap}"
if [[ ! -e "${ssdmap}" ]]; then
touch "${ssdmap}"
chmod +w "${ssdmap}"
fi
# each line of the ssdmap looks like "${device} persistent-uuid"
if [[ ! -z $(grep ${device} ${ssdmap}) ]]; then
# reuse the uuid previously saved for this device
local myuuid=$(grep ${device} ${ssdmap} | cut -d ' ' -f 2)
else
# generate new uuid and add it to the map
local myuuid=$(uuidgen)
if [[ ! ${?} -eq 0 ]]; then
echo "Failed to generate valid UUID with uuidgen" >&2
exit 2
fi
echo "${device} ${myuuid}" >> "${ssdmap}"
fi
if [[ -z "${myuuid}" ]]; then
echo "Failed to get a uuid for device ${device} when symlinking." >&2
exit 2
fi
retuuid="${myuuid}"
}
# Formats the given device ($1) if needed and mounts it at given mount point
# ($2).
function safe-format-and-mount() {
device=$1
mountpoint=$2
local device="${1}"
local mountpoint="${2}"
# Format only if the disk is not already formatted.
if ! tune2fs -l "${device}" ; then
@ -102,18 +179,135 @@ function safe-format-and-mount() {
mount -o discard,defaults "${device}" "${mountpoint}"
}
# Local ssds, if present, are mounted at /mnt/disks/ssdN.
# Gets a device's UUID and bind mounts the device to a mount location under
# /mnt/disks/by-uuid/
function unique-uuid-bind-mount(){
local mountpoint="${1}"
local actual_device="${2}"
# Trigger a udev refresh so that newly formatted devices show up under /dev/disk/by-uuid
udevadm control --reload-rules
udevadm trigger
udevadm settle
# Grep for the exact match of the actual device to prevent substring matching
local myuuid=$(ls -l /dev/disk/by-uuid/ | grep "/${actual_device}$" | tr -s ' ' | cut -d ' ' -f 9)
# myuuid should be the uuid of the device as found in /dev/disk/by-uuid/
if [[ -z "${myuuid}" ]]; then
echo "Failed to get a uuid for device ${actual_device} when mounting." >&2
exit 2
fi
# bindpoint should be the full path of the to-be-bound device
local bindpoint="${UUID_MNT_PREFIX}-${interface}-fs/local-ssd-${myuuid}"
safe-bind-mount "${mountpoint}" "${bindpoint}"
}
# Bind mounts device at mountpoint to bindpoint
function safe-bind-mount(){
local mountpoint="${1}"
local bindpoint="${2}"
# Mount device to the mountpoint
mkdir -p "${bindpoint}"
echo "Binding '${mountpoint}' at '${bindpoint}'"
mount --bind "${mountpoint}" "${bindpoint}"
chmod a+w "${bindpoint}"
}
# Mounts, bindmounts, or symlinks depending on the interface and format
# of the incoming device
function mount-ext(){
local ssd="${1}"
local devicenum="${2}"
local interface="${3}"
local format="${4}"
if [[ -z "${devicenum}" ]]; then
echo "Failed to get the local disk number for device ${ssd}" >&2
exit 2
fi
# TODO: Handle partitioned disks. Right now this code just ignores partitions
if [[ "${format}" == "fs" ]]; then
if [[ "${interface}" == "scsi" ]]; then
local actual_device=$(readlink -f "${ssd}" | cut -d '/' -f 3)
# Error checking
if [[ "${actual_device}" != sd* ]]; then
echo "'actual_device' is not of the correct format. It must be the kernel name of the device, got ${actual_device} instead" >&2
exit 1
fi
local mountpoint="/mnt/disks/ssd${devicenum}"
else
# This path is required because the existing Google images do not
# expose NVMe devices in /dev/disk/by-id, so we use the /dev/nvme* names instead
local actual_device=$(echo ${ssd} | cut -d '/' -f 3)
# Error checking
if [[ "${actual_device}" != nvme* ]]; then
echo "'actual_device' is not of the correct format. It must be the kernel name of the device, got ${actual_device} instead" >&2
exit 1
fi
local mountpoint="/mnt/disks/ssd-nvme${devicenum}"
fi
safe-format-and-mount "${ssd}" "${mountpoint}"
# We only do the bindmount if users are using the new local ssd request method
# see https://github.com/kubernetes/kubernetes/pull/53466#discussion_r146431894
if [[ ! -z "${NODE_LOCAL_SSDS_EXT:-}" ]]; then
unique-uuid-bind-mount "${mountpoint}" "${actual_device}"
fi
elif [[ "${format}" == "block" ]]; then
local symdir="${UUID_BLOCK_PREFIX}-${interface}-block"
safe-block-symlink "${ssd}" "${symdir}"
else
echo "Disk format must be either fs or block, got ${format}"
fi
}
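# Illustrative layout produced by mount-ext, following the prefixes defined above:
#   scsi + fs    -> formatted and mounted at /mnt/disks/ssd<N>; additionally
#                   bind-mounted at /mnt/disks/by-uuid/google-local-ssds-scsi-fs/local-ssd-<uuid>
#                   when NODE_LOCAL_SSDS_EXT is set
#   nvme + fs    -> formatted and mounted at /mnt/disks/ssd-nvme<N>, with the same
#                   style of bind mount under the google-local-ssds-nvme-fs directory
#   any + block  -> symlinked at /dev/disk/by-uuid/google-local-ssds-<interface>-block/local-ssd-<uuid>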
# Local ssds, if present, are mounted or symlinked to their appropriate
# locations
function ensure-local-ssds() {
get-local-disk-num "scsi" "block"
local scsiblocknum="${localdisknum}"
local i=0
for ssd in /dev/disk/by-id/google-local-ssd-*; do
if [ -e "${ssd}" ]; then
ssdnum=`echo ${ssd} | sed -e 's/\/dev\/disk\/by-id\/google-local-ssd-\([0-9]*\)/\1/'`
ssdmount="/mnt/disks/ssd${ssdnum}/"
mkdir -p ${ssdmount}
safe-format-and-mount "${ssd}" ${ssdmount}
echo "Mounted local SSD $ssd at ${ssdmount}"
chmod a+w ${ssdmount}
local devicenum=`echo ${ssd} | sed -e 's/\/dev\/disk\/by-id\/google-local-ssd-\([0-9]*\)/\1/'`
if [[ "${i}" -lt "${scsiblocknum}" ]]; then
mount-ext "${ssd}" "${devicenum}" "scsi" "block"
else
# GKE does not set NODE_LOCAL_SSDS so all non-block devices
# are assumed to be filesystem devices
mount-ext "${ssd}" "${devicenum}" "scsi" "fs"
fi
i=$((i+1))
else
echo "No local SSD disks found."
echo "No local SCSI SSD disks found."
fi
done
# The following mounts or symlinks NVMe devices
get-local-disk-num "nvme" "block"
local nvmeblocknum="${localdisknum}"
local i=0
for ssd in /dev/nvme*; do
if [ -e "${ssd}" ]; then
# This workaround to determine whether the NVMe device is a disk is required because
# the existing Google images do not expose NVMe devices in /dev/disk/by-id
if [[ `udevadm info --query=property --name=${ssd} | grep DEVTYPE | sed "s/DEVTYPE=//"` == "disk" ]]; then
local devicenum=`echo ${ssd} | sed -e 's/\/dev\/nvme0n\([0-9]*\)/\1/'`
if [[ "${i}" -lt "${nvmeblocknum}" ]]; then
mount-ext "${ssd}" "${devicenum}" "nvme" "block"
else
mount-ext "${ssd}" "${devicenum}" "nvme" "fs"
fi
i=$((i+1))
fi
else
echo "No local NVMe SSD disks found."
fi
done
}

View File

@ -18,6 +18,8 @@
# Use the config file specified in $KUBE_CONFIG_FILE, or default to
# config-default.sh.
readonly GCE_MAX_LOCAL_SSD=8
KUBE_ROOT=$(dirname "${BASH_SOURCE}")/../..
source "${KUBE_ROOT}/cluster/gce/${KUBE_CONFIG_FILE-"config-default.sh"}"
source "${KUBE_ROOT}/cluster/common.sh"
@ -37,6 +39,11 @@ else
exit 1
fi
if [[ ${NODE_LOCAL_SSDS:-} -ge 1 ]] && [[ ! -z ${NODE_LOCAL_SSDS_EXT:-} ]] ; then
echo -e "${color_red}Local SSD: Only one of NODE_LOCAL_SSDS and NODE_LOCAL_SSDS_EXT can be specified at once${color_norm}" >&2
exit 2
fi
if [[ "${MASTER_OS_DISTRIBUTION}" == "gci" ]]; then
DEFAULT_GCI_PROJECT=google-containers
if [[ "${GCI_VERSION}" == "cos"* ]]; then
@ -546,6 +553,29 @@ function get-template-name-from-version() {
echo "${NODE_INSTANCE_PREFIX}-template-${1}" | cut -c 1-63 | sed 's/[\.\+]/-/g;s/-*$//g'
}
# validates the NODE_LOCAL_SSDS_EXT variable
function validate-node-local-ssds-ext(){
ssdopts="${1}"
if [[ -z "${ssdopts[0]}" || -z "${ssdopts[1]}" || -z "${ssdopts[2]}" ]]; then
echo -e "${color_red}Local SSD: NODE_LOCAL_SSDS_EXT is malformed, found ${ssdopts[0]-_},${ssdopts[1]-_},${ssdopts[2]-_} ${color_norm}" >&2
exit 2
fi
if [[ "${ssdopts[1]}" != "scsi" && "${ssdopts[1]}" != "nvme" ]]; then
echo -e "${color_red}Local SSD: Interface must be scsi or nvme, found: ${ssdopts[1]} ${color_norm}" >&2
exit 2
fi
if [[ "${ssdopts[2]}" != "fs" && "${ssdopts[2]}" != "block" ]]; then
echo -e "${color_red}Local SSD: Filesystem type must be fs or block, found: ${ssdopts[2]} ${color_norm}" >&2
exit 2
fi
local_ssd_ext_count=$((local_ssd_ext_count+ssdopts[0]))
if [[ "${local_ssd_ext_count}" -gt "${GCE_MAX_LOCAL_SSD}" || "${local_ssd_ext_count}" -lt 1 ]]; then
echo -e "${color_red}Local SSD: Total number of local ssds must range from 1 to 8, found: ${local_ssd_ext_count} ${color_norm}" >&2
exit 2
fi
}
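# Illustrative values (assumed, for reference only):
#   NODE_LOCAL_SSDS_EXT="1,scsi,fs"                   passes validation
#   NODE_LOCAL_SSDS_EXT="2,scsi,fs;1,nvme,block"      passes; 3 SSDs total (<= GCE_MAX_LOCAL_SSD)
#   NODE_LOCAL_SSDS_EXT="1,sata,fs"                   rejected: interface must be scsi or nvme
#   NODE_LOCAL_SSDS_EXT="9,scsi,fs"                   rejected: exceeds GCE_MAX_LOCAL_SSD (8)
#   NODE_LOCAL_SSDS=1 with NODE_LOCAL_SSDS_EXT set    rejected earlier: the two are mutually exclusive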
# Robustly try to create an instance template.
# $1: The name of the instance template.
# $2: The scopes flag.
@ -587,6 +617,19 @@ function create-node-template() {
fi
local local_ssds=""
local_ssd_ext_count=0
if [[ ! -z ${NODE_LOCAL_SSDS_EXT:-} ]]; then
IFS=";" read -r -a ssdgroups <<< "${NODE_LOCAL_SSDS_EXT:-}"
for ssdgroup in "${ssdgroups[@]}"
do
IFS="," read -r -a ssdopts <<< "${ssdgroup}"
validate-node-local-ssds-ext "${ssdopts}"
for i in $(seq ${ssdopts[0]}); do
local_ssds="$local_ssds--local-ssd=interface=${ssdopts[1]} "
done
done
fi
if [[ ! -z ${NODE_LOCAL_SSDS+x} ]]; then
# The NODE_LOCAL_SSDS check below fixes issue #49171
# Some versions of seq will count down from 1 if "seq 0" is specified
@ -596,6 +639,7 @@ function create-node-template() {
done
fi
fi
local network=$(make-gcloud-network-argument \
"${NETWORK_PROJECT}" \

View File

@ -30,6 +30,7 @@ go_library(
"//plugin/pkg/admission/deny:go_default_library",
"//plugin/pkg/admission/eventratelimit:go_default_library",
"//plugin/pkg/admission/exec:go_default_library",
"//plugin/pkg/admission/extendedresourcetoleration:go_default_library",
"//plugin/pkg/admission/gc:go_default_library",
"//plugin/pkg/admission/imagepolicy:go_default_library",
"//plugin/pkg/admission/initialresources:go_default_library",

View File

@ -32,6 +32,7 @@ import (
"k8s.io/kubernetes/plugin/pkg/admission/deny"
"k8s.io/kubernetes/plugin/pkg/admission/eventratelimit"
"k8s.io/kubernetes/plugin/pkg/admission/exec"
"k8s.io/kubernetes/plugin/pkg/admission/extendedresourcetoleration"
"k8s.io/kubernetes/plugin/pkg/admission/gc"
"k8s.io/kubernetes/plugin/pkg/admission/imagepolicy"
"k8s.io/kubernetes/plugin/pkg/admission/initialresources"
@ -61,6 +62,7 @@ func RegisterAllAdmissionPlugins(plugins *admission.Plugins) {
deny.Register(plugins)
eventratelimit.Register(plugins)
exec.Register(plugins)
extendedresourcetoleration.Register(plugins)
gc.Register(plugins)
imagepolicy.Register(plugins)
initialresources.Register(plugins)

View File

@ -21,9 +21,13 @@ limitations under the License.
package app
import (
"fmt"
"os"
"github.com/golang/glog"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/kubernetes/cmd/kube-controller-manager/app/options"
"k8s.io/kubernetes/pkg/controller/certificates/approver"
"k8s.io/kubernetes/pkg/controller/certificates/cleaner"
"k8s.io/kubernetes/pkg/controller/certificates/signer"
@ -36,6 +40,45 @@ func startCSRSigningController(ctx ControllerContext) (bool, error) {
if ctx.Options.ClusterSigningCertFile == "" || ctx.Options.ClusterSigningKeyFile == "" {
return false, nil
}
// Deprecation warning for old defaults.
//
// * If the signing cert and key are the default paths but the files
// exist, warn that the paths need to be specified explicitly in a
// later release and the defaults will be removed. We don't expect this
// to be the case.
//
// * If the signing cert and key are default paths but the files don't exist,
// bail out of startController without logging.
var keyFileExists, keyUsesDefault, certFileExists, certUsesDefault bool
_, err := os.Stat(ctx.Options.ClusterSigningCertFile)
certFileExists = !os.IsNotExist(err)
certUsesDefault = (ctx.Options.ClusterSigningCertFile == options.DefaultClusterSigningCertFile)
_, err = os.Stat(ctx.Options.ClusterSigningKeyFile)
keyFileExists = !os.IsNotExist(err)
keyUsesDefault = (ctx.Options.ClusterSigningKeyFile == options.DefaultClusterSigningKeyFile)
switch {
case (keyFileExists && keyUsesDefault) || (certFileExists && certUsesDefault):
glog.Warningf("You might be using flag defaulting for --cluster-signing-cert-file and" +
" --cluster-signing-key-file. These defaults are deprecated and will be removed" +
" in a subsequent release. Please pass these options explicitly.")
case (!keyFileExists && keyUsesDefault) && (!certFileExists && certUsesDefault):
// This is what we expect right now if people aren't
// setting up the signing controller. This isn't
// actually a problem since the signer is not a
// required controller.
return false, nil
default:
// Note that '!filesExist && !usesDefaults' is obviously
// operator error. We don't handle this case here and instead
// allow it to be handled by NewCSR... below.
}
c := ctx.ClientBuilder.ClientOrDie("certificate-controller")
signer, err := signer.NewCSRSigningController(
@ -46,8 +89,7 @@ func startCSRSigningController(ctx ControllerContext) (bool, error) {
ctx.Options.ClusterSigningDuration.Duration,
)
if err != nil {
glog.Errorf("Failed to start certificate controller: %v", err)
return false, nil
return false, fmt.Errorf("failed to start certificate controller: %v", err)
}
go signer.Run(1, ctx.Stop)

View File

@ -39,6 +39,16 @@ import (
"github.com/spf13/pflag"
)
const (
// These defaults are deprecated and exported so that we can warn if
// they are being used.
// DefaultClusterSigningCertFile is deprecated. Do not use.
DefaultClusterSigningCertFile = "/etc/kubernetes/ca/ca.pem"
// DefaultClusterSigningKeyFile is deprecated. Do not use.
DefaultClusterSigningKeyFile = "/etc/kubernetes/ca/ca.key"
)
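// Illustrative only: operators should now pass the signing material explicitly,
// for example (the paths below are assumptions, not defaults):
//
//   kube-controller-manager \
//     --cluster-signing-cert-file=/etc/kubernetes/pki/ca.crt \
//     --cluster-signing-key-file=/etc/kubernetes/pki/ca.key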
// CMServer is the main context object for the controller manager.
type CMServer struct {
componentconfig.KubeControllerManagerConfiguration
@ -111,8 +121,8 @@ func NewCMServer() *CMServer {
EnableGarbageCollector: true,
ConcurrentGCSyncs: 20,
GCIgnoredResources: gcIgnoredResources,
ClusterSigningCertFile: "/etc/kubernetes/ca/ca.pem",
ClusterSigningKeyFile: "/etc/kubernetes/ca/ca.key",
ClusterSigningCertFile: DefaultClusterSigningCertFile,
ClusterSigningKeyFile: DefaultClusterSigningKeyFile,
ClusterSigningDuration: metav1.Duration{Duration: helpers.OneYear},
ReconcilerSyncLoopPeriod: metav1.Duration{Duration: 60 * time.Second},
EnableTaintManager: true,

View File

@ -140,7 +140,8 @@ func ValidateArgSelection(cfg *kubeadm.NodeConfiguration, fldPath *field.Path) f
allErrs = append(allErrs, field.Invalid(fldPath, "", "DiscoveryTokenCACertHashes cannot be used with DiscoveryFile"))
}
if len(cfg.DiscoveryFile) == 0 && len(cfg.DiscoveryTokenCACertHashes) == 0 && !cfg.DiscoveryTokenUnsafeSkipCAVerification {
if len(cfg.DiscoveryFile) == 0 && len(cfg.DiscoveryToken) != 0 &&
len(cfg.DiscoveryTokenCACertHashes) == 0 && !cfg.DiscoveryTokenUnsafeSkipCAVerification {
allErrs = append(allErrs, field.Invalid(fldPath, "", "using token-based discovery without DiscoveryTokenCACertHashes can be unsafe. set --discovery-token-unsafe-skip-ca-verification to continue"))
}
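// Illustrative only (flag names are assumed from the kubeadm CLI of this era):
// token-based discovery should either pin the cluster CA, e.g.
//   kubeadm join <master>:6443 --token <token> --discovery-token-ca-cert-hash sha256:<hash>
// or explicitly opt out of verification with --discovery-token-unsafe-skip-ca-verification.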

View File

@ -202,16 +202,16 @@ func EnforceVersionPolicies(flags *applyFlags, versionGetter upgrade.VersionGett
if versionSkewErrs != nil {
if len(versionSkewErrs.Mandatory) > 0 {
return fmt.Errorf("The --version argument is invalid due to these fatal errors: %v", versionSkewErrs.Mandatory)
return fmt.Errorf("The --version argument is invalid due to these fatal errors:\n\n%v\nPlease fix the misalignments highlighted above and try upgrading again", kubeadmutil.FormatErrMsg(versionSkewErrs.Mandatory))
}
if len(versionSkewErrs.Skippable) > 0 {
// Return the error if the user hasn't specified the --force flag
if !flags.force {
return fmt.Errorf("The --version argument is invalid due to these errors: %v. Can be bypassed if you pass the --force flag", versionSkewErrs.Skippable)
return fmt.Errorf("The --version argument is invalid due to these errors:\n\n%v\nCan be bypassed if you pass the --force flag", kubeadmutil.FormatErrMsg(versionSkewErrs.Skippable))
}
// Soft errors found, but --force was specified
fmt.Printf("[upgrade/version] Found %d potential version compatibility errors but skipping since the --force flag is set: %v\n", len(versionSkewErrs.Skippable), versionSkewErrs.Skippable)
fmt.Printf("[upgrade/version] Found %d potential version compatibility errors but skipping since the --force flag is set: \n\n%v", len(versionSkewErrs.Skippable), kubeadmutil.FormatErrMsg(versionSkewErrs.Skippable))
}
}
return nil

View File

@ -75,3 +75,12 @@ func checkErr(prefix string, err error, handleErr func(string, int)) {
handleErr(err.Error(), DefaultErrorExitCode)
}
}
// FormatErrMsg returns a human-readable string describing the slice of errors passed to the function
func FormatErrMsg(errs []error) string {
var errMsg string
for _, err := range errs {
errMsg = fmt.Sprintf("%s\t-%s\n", errMsg, err.Error())
}
return errMsg
}

View File

@ -50,3 +50,34 @@ func TestCheckErr(t *testing.T) {
}
}
}
func TestFormatErrMsg(t *testing.T) {
errMsg1 := "specified version to upgrade to v1.9.0-alpha.3 is equal to or lower than the cluster version v1.10.0-alpha.0.69+638add6ddfb6d2. Downgrades are not supported yet"
errMsg2 := "specified version to upgrade to v1.9.0-alpha.3 is higher than the kubeadm version v1.9.0-alpha.1.3121+84178212527295-dirty. Upgrade kubeadm first using the tool you used to install kubeadm"
testCases := []struct {
errs []error
expect string
}{
{
errs: []error{
fmt.Errorf(errMsg1),
fmt.Errorf(errMsg2),
},
expect: "\t-" + errMsg1 + "\n" + "\t-" + errMsg2 + "\n",
},
{
errs: []error{
fmt.Errorf(errMsg1),
},
expect: "\t-" + errMsg1 + "\n",
},
}
for _, testCase := range testCases {
got := FormatErrMsg(testCase.errs)
if got != testCase.expect {
t.Errorf("FormatErrMsg error, expect: %v, got: %v", testCase.expect, got)
}
}
}

View File

@ -20,6 +20,7 @@ package options
import (
"fmt"
_ "net/http/pprof"
"path/filepath"
"runtime"
"strings"
@ -154,6 +155,8 @@ type KubeletFlags struct {
// This will cause the kubelet to listen to inotify events on the lock file,
// releasing it and exiting when another process tries to open that file.
ExitOnLockContention bool
// seccompProfileRoot is the directory path for seccomp profiles.
SeccompProfileRoot string
// DEPRECATED FLAGS
// minimumGCAge is the minimum age for a finished container before it is
@ -214,6 +217,7 @@ func NewKubeletFlags() *KubeletFlags {
NodeLabels: make(map[string]string),
VolumePluginDir: "/usr/libexec/kubernetes/kubelet-plugins/volume/exec/",
RegisterNode: true,
SeccompProfileRoot: filepath.Join(v1alpha1.DefaultRootDir, "seccomp"),
}
}
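// Illustrative only: with the default above, SeccompProfileRoot resolves to
// <DefaultRootDir>/seccomp (typically /var/lib/kubelet/seccomp), and it can be
// overridden on the command line, e.g.
//   kubelet --seccomp-profile-root=/var/lib/kubelet/seccomp-profiles
// (the override path here is an assumption for illustration).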
@ -338,6 +342,7 @@ func (f *KubeletFlags) AddFlags(fs *pflag.FlagSet) {
fs.StringVar(&f.VolumePluginDir, "volume-plugin-dir", f.VolumePluginDir, "<Warning: Alpha feature> The full path of the directory in which to search for additional third party volume plugins")
fs.StringVar(&f.LockFilePath, "lock-file", f.LockFilePath, "<Warning: Alpha feature> The path to file for kubelet to use as a lock file.")
fs.BoolVar(&f.ExitOnLockContention, "exit-on-lock-contention", f.ExitOnLockContention, "Whether kubelet should exit upon lock-file contention.")
fs.StringVar(&f.SeccompProfileRoot, "seccomp-profile-root", f.SeccompProfileRoot, "<Warning: Alpha feature> Directory path for seccomp profiles.")
// DEPRECATED FLAGS
fs.DurationVar(&f.MinimumGCAge.Duration, "minimum-container-ttl-duration", f.MinimumGCAge.Duration, "Minimum age for a finished container before it is garbage collected. Examples: '300ms', '10s' or '2h45m'")
@ -405,7 +410,6 @@ func AddKubeletConfigFlags(fs *pflag.FlagSet, c *kubeletconfig.KubeletConfigurat
"are generated for the public address and saved to the directory passed to --cert-dir.")
fs.StringVar(&c.TLSPrivateKeyFile, "tls-private-key-file", c.TLSPrivateKeyFile, "File containing x509 private key matching --tls-cert-file.")
fs.StringVar(&c.SeccompProfileRoot, "seccomp-profile-root", c.SeccompProfileRoot, "Directory path for seccomp profiles.")
fs.BoolVar(&c.AllowPrivileged, "allow-privileged", c.AllowPrivileged, "If true, allow containers to request privileged mode.")
fs.StringSliceVar(&c.HostNetworkSources, "host-network-sources", c.HostNetworkSources, "Comma-separated list of sources from which the Kubelet allows pods to use of host network.")
fs.StringSliceVar(&c.HostPIDSources, "host-pid-sources", c.HostPIDSources, "Comma-separated list of sources from which the Kubelet allows pods to use the host pid namespace.")

View File

@ -728,7 +728,8 @@ func RunKubelet(kubeFlags *options.KubeletFlags, kubeCfg *kubeletconfiginternal.
kubeFlags.RegisterSchedulable,
kubeFlags.NonMasqueradeCIDR,
kubeFlags.KeepTerminatedPodVolumes,
kubeFlags.NodeLabels)
kubeFlags.NodeLabels,
kubeFlags.SeccompProfileRoot)
if err != nil {
return fmt.Errorf("failed to create kubelet: %v", err)
}
@ -800,7 +801,8 @@ func CreateAndInitKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration,
registerSchedulable bool,
nonMasqueradeCIDR string,
keepTerminatedPodVolumes bool,
nodeLabels map[string]string) (k kubelet.Bootstrap, err error) {
nodeLabels map[string]string,
seccompProfileRoot string) (k kubelet.Bootstrap, err error) {
// TODO: block until all sources have delivered at least one update to the channel, or break the sync loop
// up into "per source" synchronizations
@ -832,7 +834,8 @@ func CreateAndInitKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration,
registerSchedulable,
nonMasqueradeCIDR,
keepTerminatedPodVolumes,
nodeLabels)
nodeLabels,
seccompProfileRoot)
if err != nil {
return nil, err
}

View File

@ -13,6 +13,7 @@ go_library(
"azure_backoff.go",
"azure_blobDiskController.go",
"azure_controllerCommon.go",
"azure_fakes.go",
"azure_file.go",
"azure_instance_metadata.go",
"azure_instances.go",
@ -48,6 +49,7 @@ go_library(
"//vendor/k8s.io/api/core/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/types:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/errors:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/sets:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/wait:go_default_library",
"//vendor/k8s.io/client-go/util/flowcontrol:go_default_library",
],
@ -60,10 +62,14 @@ go_test(
library = ":go_default_library",
deps = [
"//pkg/api/v1/service:go_default_library",
"//pkg/kubelet/apis:go_default_library",
"//vendor/github.com/Azure/azure-sdk-for-go/arm/compute:go_default_library",
"//vendor/github.com/Azure/azure-sdk-for-go/arm/network:go_default_library",
"//vendor/github.com/Azure/go-autorest/autorest/to:go_default_library",
"//vendor/k8s.io/api/core/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/types:go_default_library",
"//vendor/k8s.io/client-go/util/flowcontrol:go_default_library",
],
)

View File

@ -44,13 +44,14 @@ import (
const (
// CloudProviderName is the value used for the --cloud-provider flag
CloudProviderName = "azure"
rateLimitQPSDefault = 1.0
rateLimitBucketDefault = 5
backoffRetriesDefault = 6
backoffExponentDefault = 1.5
backoffDurationDefault = 5 // in seconds
backoffJitterDefault = 1.0
maximumLoadBalancerRuleCount = 148 // According to Azure LB rule default limit
)
// Config holds the configuration parsed from the --cloud-config flag
@ -113,6 +114,57 @@ type Config struct {
// Use managed service identity for the virtual machine to access Azure ARM APIs
UseManagedIdentityExtension bool `json:"useManagedIdentityExtension"`
// Maximum allowed LoadBalancer Rule Count is the limit enforced by Azure Load balancer
MaximumLoadBalancerRuleCount int `json:"maximumLoadBalancerRuleCount"`
}
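// Illustrative only (values are assumptions): the limit can be tuned via the
// --cloud-config JSON, e.g. {"maximumLoadBalancerRuleCount": 250}; when the field
// is omitted or zero, NewCloud falls back to the maximumLoadBalancerRuleCount
// constant (148) below.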
// VirtualMachinesClient defines needed functions for azure compute.VirtualMachinesClient
type VirtualMachinesClient interface {
CreateOrUpdate(resourceGroupName string, VMName string, parameters compute.VirtualMachine, cancel <-chan struct{}) (<-chan compute.VirtualMachine, <-chan error)
Get(resourceGroupName string, VMName string, expand compute.InstanceViewTypes) (result compute.VirtualMachine, err error)
List(resourceGroupName string) (result compute.VirtualMachineListResult, err error)
ListNextResults(lastResults compute.VirtualMachineListResult) (result compute.VirtualMachineListResult, err error)
}
// InterfacesClient defines needed functions for azure network.InterfacesClient
type InterfacesClient interface {
CreateOrUpdate(resourceGroupName string, networkInterfaceName string, parameters network.Interface, cancel <-chan struct{}) (<-chan network.Interface, <-chan error)
Get(resourceGroupName string, networkInterfaceName string, expand string) (result network.Interface, err error)
}
// LoadBalancersClient defines needed functions for azure network.LoadBalancersClient
type LoadBalancersClient interface {
CreateOrUpdate(resourceGroupName string, loadBalancerName string, parameters network.LoadBalancer, cancel <-chan struct{}) (<-chan network.LoadBalancer, <-chan error)
Delete(resourceGroupName string, loadBalancerName string, cancel <-chan struct{}) (<-chan autorest.Response, <-chan error)
Get(resourceGroupName string, loadBalancerName string, expand string) (result network.LoadBalancer, err error)
List(resourceGroupName string) (result network.LoadBalancerListResult, err error)
ListNextResults(lastResult network.LoadBalancerListResult) (result network.LoadBalancerListResult, err error)
}
// PublicIPAddressesClient defines needed functions for azure network.PublicIPAddressesClient
type PublicIPAddressesClient interface {
CreateOrUpdate(resourceGroupName string, publicIPAddressName string, parameters network.PublicIPAddress, cancel <-chan struct{}) (<-chan network.PublicIPAddress, <-chan error)
Delete(resourceGroupName string, publicIPAddressName string, cancel <-chan struct{}) (<-chan autorest.Response, <-chan error)
Get(resourceGroupName string, publicIPAddressName string, expand string) (result network.PublicIPAddress, err error)
List(resourceGroupName string) (result network.PublicIPAddressListResult, err error)
ListNextResults(lastResults network.PublicIPAddressListResult) (result network.PublicIPAddressListResult, err error)
}
// SubnetsClient defines needed functions for azure network.SubnetsClient
type SubnetsClient interface {
CreateOrUpdate(resourceGroupName string, virtualNetworkName string, subnetName string, subnetParameters network.Subnet, cancel <-chan struct{}) (<-chan network.Subnet, <-chan error)
Delete(resourceGroupName string, virtualNetworkName string, subnetName string, cancel <-chan struct{}) (<-chan autorest.Response, <-chan error)
Get(resourceGroupName string, virtualNetworkName string, subnetName string, expand string) (result network.Subnet, err error)
List(resourceGroupName string, virtualNetworkName string) (result network.SubnetListResult, err error)
}
// SecurityGroupsClient defines needed functions for azure network.SecurityGroupsClient
type SecurityGroupsClient interface {
CreateOrUpdate(resourceGroupName string, networkSecurityGroupName string, parameters network.SecurityGroup, cancel <-chan struct{}) (<-chan network.SecurityGroup, <-chan error)
Delete(resourceGroupName string, networkSecurityGroupName string, cancel <-chan struct{}) (<-chan autorest.Response, <-chan error)
Get(resourceGroupName string, networkSecurityGroupName string, expand string) (result network.SecurityGroup, err error)
List(resourceGroupName string) (result network.SecurityGroupListResult, err error)
}
// Cloud holds the config and clients
@ -120,13 +172,13 @@ type Cloud struct {
Config
Environment azure.Environment
RoutesClient network.RoutesClient
SubnetsClient network.SubnetsClient
InterfacesClient network.InterfacesClient
SubnetsClient SubnetsClient
InterfacesClient InterfacesClient
RouteTablesClient network.RouteTablesClient
LoadBalancerClient network.LoadBalancersClient
PublicIPAddressesClient network.PublicIPAddressesClient
SecurityGroupsClient network.SecurityGroupsClient
VirtualMachinesClient compute.VirtualMachinesClient
LoadBalancerClient LoadBalancersClient
PublicIPAddressesClient PublicIPAddressesClient
SecurityGroupsClient SecurityGroupsClient
VirtualMachinesClient VirtualMachinesClient
StorageAccountClient storage.AccountsClient
DisksClient disk.DisksClient
operationPollRateLimiter flowcontrol.RateLimiter
@ -221,11 +273,12 @@ func NewCloud(configReader io.Reader) (cloudprovider.Interface, error) {
return nil, err
}
az.SubnetsClient = network.NewSubnetsClient(az.SubscriptionID)
az.SubnetsClient.BaseURI = az.Environment.ResourceManagerEndpoint
az.SubnetsClient.Authorizer = autorest.NewBearerAuthorizer(servicePrincipalToken)
az.SubnetsClient.PollingDelay = 5 * time.Second
configureUserAgent(&az.SubnetsClient.Client)
subnetsClient := network.NewSubnetsClient(az.SubscriptionID)
subnetsClient.BaseURI = az.Environment.ResourceManagerEndpoint
subnetsClient.Authorizer = autorest.NewBearerAuthorizer(servicePrincipalToken)
subnetsClient.PollingDelay = 5 * time.Second
configureUserAgent(&subnetsClient.Client)
az.SubnetsClient = subnetsClient
az.RouteTablesClient = network.NewRouteTablesClient(az.SubscriptionID)
az.RouteTablesClient.BaseURI = az.Environment.ResourceManagerEndpoint
@ -239,35 +292,40 @@ func NewCloud(configReader io.Reader) (cloudprovider.Interface, error) {
az.RoutesClient.PollingDelay = 5 * time.Second
configureUserAgent(&az.RoutesClient.Client)
az.InterfacesClient = network.NewInterfacesClient(az.SubscriptionID)
az.InterfacesClient.BaseURI = az.Environment.ResourceManagerEndpoint
az.InterfacesClient.Authorizer = autorest.NewBearerAuthorizer(servicePrincipalToken)
az.InterfacesClient.PollingDelay = 5 * time.Second
configureUserAgent(&az.InterfacesClient.Client)
interfacesClient := network.NewInterfacesClient(az.SubscriptionID)
interfacesClient.BaseURI = az.Environment.ResourceManagerEndpoint
interfacesClient.Authorizer = autorest.NewBearerAuthorizer(servicePrincipalToken)
interfacesClient.PollingDelay = 5 * time.Second
configureUserAgent(&interfacesClient.Client)
az.InterfacesClient = interfacesClient
az.LoadBalancerClient = network.NewLoadBalancersClient(az.SubscriptionID)
az.LoadBalancerClient.BaseURI = az.Environment.ResourceManagerEndpoint
az.LoadBalancerClient.Authorizer = autorest.NewBearerAuthorizer(servicePrincipalToken)
az.LoadBalancerClient.PollingDelay = 5 * time.Second
configureUserAgent(&az.LoadBalancerClient.Client)
loadBalancerClient := network.NewLoadBalancersClient(az.SubscriptionID)
loadBalancerClient.BaseURI = az.Environment.ResourceManagerEndpoint
loadBalancerClient.Authorizer = autorest.NewBearerAuthorizer(servicePrincipalToken)
loadBalancerClient.PollingDelay = 5 * time.Second
configureUserAgent(&loadBalancerClient.Client)
az.LoadBalancerClient = loadBalancerClient
az.VirtualMachinesClient = compute.NewVirtualMachinesClient(az.SubscriptionID)
az.VirtualMachinesClient.BaseURI = az.Environment.ResourceManagerEndpoint
az.VirtualMachinesClient.Authorizer = autorest.NewBearerAuthorizer(servicePrincipalToken)
az.VirtualMachinesClient.PollingDelay = 5 * time.Second
configureUserAgent(&az.VirtualMachinesClient.Client)
virtualMachinesClient := compute.NewVirtualMachinesClient(az.SubscriptionID)
virtualMachinesClient.BaseURI = az.Environment.ResourceManagerEndpoint
virtualMachinesClient.Authorizer = autorest.NewBearerAuthorizer(servicePrincipalToken)
virtualMachinesClient.PollingDelay = 5 * time.Second
configureUserAgent(&virtualMachinesClient.Client)
az.VirtualMachinesClient = virtualMachinesClient
az.PublicIPAddressesClient = network.NewPublicIPAddressesClient(az.SubscriptionID)
az.PublicIPAddressesClient.BaseURI = az.Environment.ResourceManagerEndpoint
az.PublicIPAddressesClient.Authorizer = autorest.NewBearerAuthorizer(servicePrincipalToken)
az.PublicIPAddressesClient.PollingDelay = 5 * time.Second
configureUserAgent(&az.PublicIPAddressesClient.Client)
publicIPAddressClient := network.NewPublicIPAddressesClient(az.SubscriptionID)
publicIPAddressClient.BaseURI = az.Environment.ResourceManagerEndpoint
publicIPAddressClient.Authorizer = autorest.NewBearerAuthorizer(servicePrincipalToken)
publicIPAddressClient.PollingDelay = 5 * time.Second
configureUserAgent(&publicIPAddressClient.Client)
az.PublicIPAddressesClient = publicIPAddressClient
az.SecurityGroupsClient = network.NewSecurityGroupsClient(az.SubscriptionID)
az.SecurityGroupsClient.BaseURI = az.Environment.ResourceManagerEndpoint
az.SecurityGroupsClient.Authorizer = autorest.NewBearerAuthorizer(servicePrincipalToken)
az.SecurityGroupsClient.PollingDelay = 5 * time.Second
configureUserAgent(&az.SecurityGroupsClient.Client)
securityGroupsClient := network.NewSecurityGroupsClient(az.SubscriptionID)
securityGroupsClient.BaseURI = az.Environment.ResourceManagerEndpoint
securityGroupsClient.Authorizer = autorest.NewBearerAuthorizer(servicePrincipalToken)
securityGroupsClient.PollingDelay = 5 * time.Second
configureUserAgent(&securityGroupsClient.Client)
az.SecurityGroupsClient = securityGroupsClient
az.StorageAccountClient = storage.NewAccountsClientWithBaseURI(az.Environment.ResourceManagerEndpoint, az.SubscriptionID)
az.StorageAccountClient.Authorizer = autorest.NewBearerAuthorizer(servicePrincipalToken)
@ -327,6 +385,10 @@ func NewCloud(configReader io.Reader) (cloudprovider.Interface, error) {
az.metadata = NewInstanceMetadata()
if az.MaximumLoadBalancerRuleCount == 0 {
az.MaximumLoadBalancerRuleCount = maximumLoadBalancerRuleCount
}
if err := initDiskControllers(&az); err != nil {
return nil, err
}

View File

@ -26,11 +26,26 @@ import (
"k8s.io/apimachinery/pkg/types"
)
// requestBackoff returns the configured resourceRequestBackoff when backoff is
// enabled in the cloud provider; otherwise it returns a new Backoff with Steps = 1
// so that the requested operation executes at least once.
func (az *Cloud) requestBackoff() (resourceRequestBackoff wait.Backoff) {
if az.CloudProviderBackoff {
return az.resourceRequestBackoff
}
resourceRequestBackoff = wait.Backoff{
Steps: 1,
}
return resourceRequestBackoff
}
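// Illustrative note: wait.ExponentialBackoff attempts its condition at most
// Backoff.Steps times, so the Steps = 1 object above turns every *WithRetry
// helper below into a single, non-retried attempt when CloudProviderBackoff
// is disabled.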
// GetVirtualMachineWithRetry invokes az.getVirtualMachine with exponential backoff retry
func (az *Cloud) GetVirtualMachineWithRetry(name types.NodeName) (compute.VirtualMachine, bool, error) {
var machine compute.VirtualMachine
var exists bool
err := wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) {
err := wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) {
var retryErr error
machine, exists, retryErr = az.getVirtualMachine(name)
if retryErr != nil {
@ -46,8 +61,9 @@ func (az *Cloud) GetVirtualMachineWithRetry(name types.NodeName) (compute.Virtua
// VirtualMachineClientGetWithRetry invokes az.VirtualMachinesClient.Get with exponential backoff retry
func (az *Cloud) VirtualMachineClientGetWithRetry(resourceGroup, vmName string, types compute.InstanceViewTypes) (compute.VirtualMachine, error) {
var machine compute.VirtualMachine
err := wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) {
err := wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) {
var retryErr error
az.operationPollRateLimiter.Accept()
machine, retryErr = az.VirtualMachinesClient.Get(resourceGroup, vmName, types)
if retryErr != nil {
glog.Errorf("backoff: failure, will retry,err=%v", retryErr)
@ -59,10 +75,63 @@ func (az *Cloud) VirtualMachineClientGetWithRetry(resourceGroup, vmName string,
return machine, err
}
// VirtualMachineClientListWithRetry invokes az.VirtualMachinesClient.List with exponential backoff retry
func (az *Cloud) VirtualMachineClientListWithRetry() ([]compute.VirtualMachine, error) {
allNodes := []compute.VirtualMachine{}
var result compute.VirtualMachineListResult
err := wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) {
var retryErr error
az.operationPollRateLimiter.Accept()
glog.V(10).Infof("VirtualMachinesClient.List(%v): start", az.ResourceGroup)
result, retryErr = az.VirtualMachinesClient.List(az.ResourceGroup)
glog.V(10).Infof("VirtualMachinesClient.List(%v): end", az.ResourceGroup)
if retryErr != nil {
glog.Errorf("VirtualMachinesClient.List(%v) - backoff: failure, will retry,err=%v",
az.ResourceGroup,
retryErr)
return false, retryErr
}
glog.V(2).Infof("VirtualMachinesClient.List(%v) - backoff: success", az.ResourceGroup)
return true, nil
})
if err != nil {
return nil, err
}
appendResults := (result.Value != nil && len(*result.Value) > 0)
for appendResults {
allNodes = append(allNodes, *result.Value...)
appendResults = false
// follow the next link to get all the VMs for the resource group
if result.NextLink != nil {
err := wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) {
var retryErr error
az.operationPollRateLimiter.Accept()
glog.V(10).Infof("VirtualMachinesClient.ListNextResults(%v): start", az.ResourceGroup)
result, retryErr = az.VirtualMachinesClient.ListNextResults(result)
glog.V(10).Infof("VirtualMachinesClient.ListNextResults(%v): end", az.ResourceGroup)
if retryErr != nil {
glog.Errorf("VirtualMachinesClient.ListNextResults(%v) - backoff: failure, will retry,err=%v",
az.ResourceGroup, retryErr)
return false, retryErr
}
glog.V(2).Infof("VirtualMachinesClient.ListNextResults(%v): success", az.ResourceGroup)
return true, nil
})
if err != nil {
return allNodes, err
}
appendResults = (result.Value != nil && len(*result.Value) > 0)
}
}
return allNodes, err
}
// GetIPForMachineWithRetry invokes az.getIPForMachine with exponential backoff retry
func (az *Cloud) GetIPForMachineWithRetry(name types.NodeName) (string, error) {
var ip string
err := wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) {
err := wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) {
var retryErr error
ip, retryErr = az.getIPForMachine(name)
if retryErr != nil {
@ -77,7 +146,7 @@ func (az *Cloud) GetIPForMachineWithRetry(name types.NodeName) (string, error) {
// CreateOrUpdateSGWithRetry invokes az.SecurityGroupsClient.CreateOrUpdate with exponential backoff retry
func (az *Cloud) CreateOrUpdateSGWithRetry(sg network.SecurityGroup) error {
return wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) {
return wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) {
az.operationPollRateLimiter.Accept()
glog.V(10).Infof("SecurityGroupsClient.CreateOrUpdate(%s): start", *sg.Name)
respChan, errChan := az.SecurityGroupsClient.CreateOrUpdate(az.ResourceGroup, *sg.Name, sg, nil)
@ -90,7 +159,7 @@ func (az *Cloud) CreateOrUpdateSGWithRetry(sg network.SecurityGroup) error {
// CreateOrUpdateLBWithRetry invokes az.LoadBalancerClient.CreateOrUpdate with exponential backoff retry
func (az *Cloud) CreateOrUpdateLBWithRetry(lb network.LoadBalancer) error {
return wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) {
return wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) {
az.operationPollRateLimiter.Accept()
glog.V(10).Infof("LoadBalancerClient.CreateOrUpdate(%s): start", *lb.Name)
respChan, errChan := az.LoadBalancerClient.CreateOrUpdate(az.ResourceGroup, *lb.Name, lb, nil)
@ -101,9 +170,120 @@ func (az *Cloud) CreateOrUpdateLBWithRetry(lb network.LoadBalancer) error {
})
}
// ListLBWithRetry invokes az.LoadBalancerClient.List with exponential backoff retry
func (az *Cloud) ListLBWithRetry() ([]network.LoadBalancer, error) {
allLBs := []network.LoadBalancer{}
var result network.LoadBalancerListResult
err := wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) {
var retryErr error
az.operationPollRateLimiter.Accept()
glog.V(10).Infof("LoadBalancerClient.List(%v): start", az.ResourceGroup)
result, retryErr = az.LoadBalancerClient.List(az.ResourceGroup)
glog.V(10).Infof("LoadBalancerClient.List(%v): end", az.ResourceGroup)
if retryErr != nil {
glog.Errorf("LoadBalancerClient.List(%v) - backoff: failure, will retry,err=%v",
az.ResourceGroup,
retryErr)
return false, retryErr
}
glog.V(2).Infof("LoadBalancerClient.List(%v) - backoff: success", az.ResourceGroup)
return true, nil
})
if err != nil {
return nil, err
}
appendResults := (result.Value != nil && len(*result.Value) > 0)
for appendResults {
allLBs = append(allLBs, *result.Value...)
appendResults = false
// follow the next link to get all the load balancers for the resource group
if result.NextLink != nil {
err := wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) {
var retryErr error
az.operationPollRateLimiter.Accept()
glog.V(10).Infof("LoadBalancerClient.ListNextResults(%v): start", az.ResourceGroup)
result, retryErr = az.LoadBalancerClient.ListNextResults(result)
glog.V(10).Infof("LoadBalancerClient.ListNextResults(%v): end", az.ResourceGroup)
if retryErr != nil {
glog.Errorf("LoadBalancerClient.ListNextResults(%v) - backoff: failure, will retry,err=%v",
az.ResourceGroup,
retryErr)
return false, retryErr
}
glog.V(2).Infof("LoadBalancerClient.ListNextResults(%v) - backoff: success", az.ResourceGroup)
return true, nil
})
if err != nil {
return allLBs, err
}
appendResults = (result.Value != nil && len(*result.Value) > 0)
}
}
return allLBs, nil
}
// ListPIPWithRetry lists the PIP resources in az.ResourceGroup with exponential backoff retry
func (az *Cloud) ListPIPWithRetry() ([]network.PublicIPAddress, error) {
allPIPs := []network.PublicIPAddress{}
var result network.PublicIPAddressListResult
err := wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) {
var retryErr error
az.operationPollRateLimiter.Accept()
glog.V(10).Infof("PublicIPAddressesClient.List(%v): start", az.ResourceGroup)
result, retryErr = az.PublicIPAddressesClient.List(az.ResourceGroup)
glog.V(10).Infof("PublicIPAddressesClient.List(%v): end", az.ResourceGroup)
if retryErr != nil {
glog.Errorf("PublicIPAddressesClient.List(%v) - backoff: failure, will retry,err=%v",
az.ResourceGroup,
retryErr)
return false, retryErr
}
glog.V(2).Infof("PublicIPAddressesClient.List(%v) - backoff: success", az.ResourceGroup)
return true, nil
})
if err != nil {
return nil, err
}
appendResults := (result.Value != nil && len(*result.Value) > 0)
for appendResults {
allPIPs = append(allPIPs, *result.Value...)
appendResults = false
// follow the next link to get all the public IP addresses for the resource group
if result.NextLink != nil {
err := wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) {
var retryErr error
az.operationPollRateLimiter.Accept()
glog.V(10).Infof("PublicIPAddressesClient.ListNextResults(%v): start", az.ResourceGroup)
result, retryErr = az.PublicIPAddressesClient.ListNextResults(result)
glog.V(10).Infof("PublicIPAddressesClient.ListNextResults(%v): end", az.ResourceGroup)
if retryErr != nil {
glog.Errorf("PublicIPAddressesClient.ListNextResults(%v) - backoff: failure, will retry,err=%v",
az.ResourceGroup,
retryErr)
return false, retryErr
}
glog.V(2).Infof("PublicIPAddressesClient.ListNextResults(%v) - backoff: success", az.ResourceGroup)
return true, nil
})
if err != nil {
return allPIPs, err
}
appendResults = (result.Value != nil && len(*result.Value) > 0)
}
}
return allPIPs, nil
}
// CreateOrUpdatePIPWithRetry invokes az.PublicIPAddressesClient.CreateOrUpdate with exponential backoff retry
func (az *Cloud) CreateOrUpdatePIPWithRetry(pip network.PublicIPAddress) error {
return wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) {
return wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) {
az.operationPollRateLimiter.Accept()
glog.V(10).Infof("PublicIPAddressesClient.CreateOrUpdate(%s): start", *pip.Name)
respChan, errChan := az.PublicIPAddressesClient.CreateOrUpdate(az.ResourceGroup, *pip.Name, pip, nil)
@ -116,7 +296,7 @@ func (az *Cloud) CreateOrUpdatePIPWithRetry(pip network.PublicIPAddress) error {
// CreateOrUpdateInterfaceWithRetry invokes az.PublicIPAddressesClient.CreateOrUpdate with exponential backoff retry
func (az *Cloud) CreateOrUpdateInterfaceWithRetry(nic network.Interface) error {
return wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) {
return wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) {
az.operationPollRateLimiter.Accept()
glog.V(10).Infof("InterfacesClient.CreateOrUpdate(%s): start", *nic.Name)
respChan, errChan := az.InterfacesClient.CreateOrUpdate(az.ResourceGroup, *nic.Name, nic, nil)
@ -129,7 +309,7 @@ func (az *Cloud) CreateOrUpdateInterfaceWithRetry(nic network.Interface) error {
// DeletePublicIPWithRetry invokes az.PublicIPAddressesClient.Delete with exponential backoff retry
func (az *Cloud) DeletePublicIPWithRetry(pipName string) error {
return wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) {
return wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) {
az.operationPollRateLimiter.Accept()
glog.V(10).Infof("PublicIPAddressesClient.Delete(%s): start", pipName)
respChan, errChan := az.PublicIPAddressesClient.Delete(az.ResourceGroup, pipName, nil)
@ -142,7 +322,7 @@ func (az *Cloud) DeletePublicIPWithRetry(pipName string) error {
// DeleteLBWithRetry invokes az.LoadBalancerClient.Delete with exponential backoff retry
func (az *Cloud) DeleteLBWithRetry(lbName string) error {
return wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) {
return wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) {
az.operationPollRateLimiter.Accept()
glog.V(10).Infof("LoadBalancerClient.Delete(%s): start", lbName)
respChan, errChan := az.LoadBalancerClient.Delete(az.ResourceGroup, lbName, nil)
@ -155,7 +335,7 @@ func (az *Cloud) DeleteLBWithRetry(lbName string) error {
// CreateOrUpdateRouteTableWithRetry invokes az.RouteTablesClient.CreateOrUpdate with exponential backoff retry
func (az *Cloud) CreateOrUpdateRouteTableWithRetry(routeTable network.RouteTable) error {
return wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) {
return wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) {
az.operationPollRateLimiter.Accept()
glog.V(10).Infof("RouteTablesClient.CreateOrUpdate(%s): start", *routeTable.Name)
respChan, errChan := az.RouteTablesClient.CreateOrUpdate(az.ResourceGroup, az.RouteTableName, routeTable, nil)
@ -168,7 +348,7 @@ func (az *Cloud) CreateOrUpdateRouteTableWithRetry(routeTable network.RouteTable
// CreateOrUpdateRouteWithRetry invokes az.RoutesClient.CreateOrUpdate with exponential backoff retry
func (az *Cloud) CreateOrUpdateRouteWithRetry(route network.Route) error {
return wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) {
return wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) {
az.operationPollRateLimiter.Accept()
glog.V(10).Infof("RoutesClient.CreateOrUpdate(%s): start", *route.Name)
respChan, errChan := az.RoutesClient.CreateOrUpdate(az.ResourceGroup, az.RouteTableName, *route.Name, route, nil)
@ -181,7 +361,7 @@ func (az *Cloud) CreateOrUpdateRouteWithRetry(route network.Route) error {
// DeleteRouteWithRetry invokes az.RoutesClient.Delete with exponential backoff retry
func (az *Cloud) DeleteRouteWithRetry(routeName string) error {
return wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) {
return wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) {
az.operationPollRateLimiter.Accept()
glog.V(10).Infof("RoutesClient.Delete(%s): start", az.RouteTableName)
respChan, errChan := az.RoutesClient.Delete(az.ResourceGroup, az.RouteTableName, routeName, nil)
@ -194,7 +374,7 @@ func (az *Cloud) DeleteRouteWithRetry(routeName string) error {
// CreateOrUpdateVMWithRetry invokes az.VirtualMachinesClient.CreateOrUpdate with exponential backoff retry
func (az *Cloud) CreateOrUpdateVMWithRetry(vmName string, newVM compute.VirtualMachine) error {
return wait.ExponentialBackoff(az.resourceRequestBackoff, func() (bool, error) {
return wait.ExponentialBackoff(az.requestBackoff(), func() (bool, error) {
az.operationPollRateLimiter.Accept()
glog.V(10).Infof("VirtualMachinesClient.CreateOrUpdate(%s): start", vmName)
respChan, errChan := az.VirtualMachinesClient.CreateOrUpdate(az.ResourceGroup, vmName, newVM, nil)

View File

@ -0,0 +1,623 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package azure
import (
"fmt"
"math/rand"
"net/http"
"strings"
"sync"
"time"
"github.com/Azure/go-autorest/autorest/to"
"github.com/Azure/azure-sdk-for-go/arm/compute"
"github.com/Azure/azure-sdk-for-go/arm/network"
"github.com/Azure/go-autorest/autorest"
)
type fakeAzureLBClient struct {
mutex *sync.Mutex
FakeStore map[string]map[string]network.LoadBalancer
}
func newFakeAzureLBClient() fakeAzureLBClient {
fLBC := fakeAzureLBClient{}
fLBC.FakeStore = make(map[string]map[string]network.LoadBalancer)
fLBC.mutex = &sync.Mutex{}
return fLBC
}
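// Illustrative only (test wiring is an assumption, not part of this file's API):
// because Cloud.LoadBalancerClient is now the LoadBalancersClient interface,
// unit tests can substitute this fake directly, e.g.
//   az := &Cloud{}
//   az.LoadBalancerClient = newFakeAzureLBClient()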
func (fLBC fakeAzureLBClient) CreateOrUpdate(resourceGroupName string, loadBalancerName string, parameters network.LoadBalancer, cancel <-chan struct{}) (<-chan network.LoadBalancer, <-chan error) {
fLBC.mutex.Lock()
defer fLBC.mutex.Unlock()
resultChan := make(chan network.LoadBalancer, 1)
errChan := make(chan error, 1)
var result network.LoadBalancer
var err error
defer func() {
resultChan <- result
errChan <- err
close(resultChan)
close(errChan)
}()
if _, ok := fLBC.FakeStore[resourceGroupName]; !ok {
fLBC.FakeStore[resourceGroupName] = make(map[string]network.LoadBalancer)
}
// For dynamic ip allocation, just fill in the PrivateIPAddress
if parameters.FrontendIPConfigurations != nil {
for idx, config := range *parameters.FrontendIPConfigurations {
if config.PrivateIPAllocationMethod == network.Dynamic {
// Here we randomly assign an IP address as the private IP;
// it is not smart enough to know whether it is within the subnet's range
(*parameters.FrontendIPConfigurations)[idx].PrivateIPAddress = getRandomIPPtr()
}
}
}
fLBC.FakeStore[resourceGroupName][loadBalancerName] = parameters
result = fLBC.FakeStore[resourceGroupName][loadBalancerName]
result.Response.Response = &http.Response{
StatusCode: http.StatusOK,
}
err = nil
return resultChan, errChan
}
func (fLBC fakeAzureLBClient) Delete(resourceGroupName string, loadBalancerName string, cancel <-chan struct{}) (<-chan autorest.Response, <-chan error) {
fLBC.mutex.Lock()
defer fLBC.mutex.Unlock()
respChan := make(chan autorest.Response, 1)
errChan := make(chan error, 1)
var resp autorest.Response
var err error
defer func() {
respChan <- resp
errChan <- err
close(respChan)
close(errChan)
}()
if rgLBs, ok := fLBC.FakeStore[resourceGroupName]; ok {
if _, ok := rgLBs[loadBalancerName]; ok {
delete(rgLBs, loadBalancerName)
resp.Response = &http.Response{
StatusCode: http.StatusAccepted,
}
err = nil
return respChan, errChan
}
}
resp.Response = &http.Response{
StatusCode: http.StatusNotFound,
}
err = autorest.DetailedError{
StatusCode: http.StatusNotFound,
Message: "Not such LB",
}
return respChan, errChan
}
func (fLBC fakeAzureLBClient) Get(resourceGroupName string, loadBalancerName string, expand string) (result network.LoadBalancer, err error) {
fLBC.mutex.Lock()
defer fLBC.mutex.Unlock()
if _, ok := fLBC.FakeStore[resourceGroupName]; ok {
if entity, ok := fLBC.FakeStore[resourceGroupName][loadBalancerName]; ok {
return entity, nil
}
}
return result, autorest.DetailedError{
StatusCode: http.StatusNotFound,
Message: "Not such LB",
}
}
func (fLBC fakeAzureLBClient) List(resourceGroupName string) (result network.LoadBalancerListResult, err error) {
fLBC.mutex.Lock()
defer fLBC.mutex.Unlock()
var value []network.LoadBalancer
if _, ok := fLBC.FakeStore[resourceGroupName]; ok {
for _, v := range fLBC.FakeStore[resourceGroupName] {
value = append(value, v)
}
}
result.Response.Response = &http.Response{
StatusCode: http.StatusOK,
}
result.NextLink = nil
result.Value = &value
return result, nil
}
func (fLBC fakeAzureLBClient) ListNextResults(lastResult network.LoadBalancerListResult) (result network.LoadBalancerListResult, err error) {
fLBC.mutex.Lock()
defer fLBC.mutex.Unlock()
result.Response.Response = &http.Response{
StatusCode: http.StatusOK,
}
result.NextLink = nil
result.Value = nil
return result, nil
}
type fakeAzurePIPClient struct {
mutex *sync.Mutex
FakeStore map[string]map[string]network.PublicIPAddress
SubscriptionID string
}
const publicIPAddressIDTemplate = "/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Network/publicIPAddresses/%s"
// returns the full identifier of a publicIPAddress.
func getpublicIPAddressID(subscriptionID string, resourceGroupName, pipName string) string {
return fmt.Sprintf(
publicIPAddressIDTemplate,
subscriptionID,
resourceGroupName,
pipName)
}
func newFakeAzurePIPClient(subscriptionID string) fakeAzurePIPClient {
fAPC := fakeAzurePIPClient{}
fAPC.FakeStore = make(map[string]map[string]network.PublicIPAddress)
fAPC.SubscriptionID = subscriptionID
fAPC.mutex = &sync.Mutex{}
return fAPC
}
func (fAPC fakeAzurePIPClient) CreateOrUpdate(resourceGroupName string, publicIPAddressName string, parameters network.PublicIPAddress, cancel <-chan struct{}) (<-chan network.PublicIPAddress, <-chan error) {
fAPC.mutex.Lock()
defer fAPC.mutex.Unlock()
resultChan := make(chan network.PublicIPAddress, 1)
errChan := make(chan error, 1)
var result network.PublicIPAddress
var err error
defer func() {
resultChan <- result
errChan <- err
close(resultChan)
close(errChan)
}()
if _, ok := fAPC.FakeStore[resourceGroupName]; !ok {
fAPC.FakeStore[resourceGroupName] = make(map[string]network.PublicIPAddress)
}
// assign id
pipID := getpublicIPAddressID(fAPC.SubscriptionID, resourceGroupName, publicIPAddressName)
parameters.ID = &pipID
// only create an address in the case the user has not provided one
if parameters.PublicIPAddressPropertiesFormat != nil &&
parameters.PublicIPAddressPropertiesFormat.PublicIPAllocationMethod == network.Static {
// assign ip
parameters.IPAddress = getRandomIPPtr()
}
fAPC.FakeStore[resourceGroupName][publicIPAddressName] = parameters
result = fAPC.FakeStore[resourceGroupName][publicIPAddressName]
result.Response.Response = &http.Response{
StatusCode: http.StatusOK,
}
err = nil
return resultChan, errChan
}
func (fAPC fakeAzurePIPClient) Delete(resourceGroupName string, publicIPAddressName string, cancel <-chan struct{}) (<-chan autorest.Response, <-chan error) {
fAPC.mutex.Lock()
defer fAPC.mutex.Unlock()
respChan := make(chan autorest.Response, 1)
errChan := make(chan error, 1)
var resp autorest.Response
var err error
defer func() {
respChan <- resp
errChan <- err
close(respChan)
close(errChan)
}()
if rgPIPs, ok := fAPC.FakeStore[resourceGroupName]; ok {
if _, ok := rgPIPs[publicIPAddressName]; ok {
delete(rgPIPs, publicIPAddressName)
resp.Response = &http.Response{
StatusCode: http.StatusAccepted,
}
err = nil
return respChan, errChan
}
}
resp.Response = &http.Response{
StatusCode: http.StatusNotFound,
}
err = autorest.DetailedError{
StatusCode: http.StatusNotFound,
Message:    "No such PIP",
}
return respChan, errChan
}
func (fAPC fakeAzurePIPClient) Get(resourceGroupName string, publicIPAddressName string, expand string) (result network.PublicIPAddress, err error) {
fAPC.mutex.Lock()
defer fAPC.mutex.Unlock()
if _, ok := fAPC.FakeStore[resourceGroupName]; ok {
if entity, ok := fAPC.FakeStore[resourceGroupName][publicIPAddressName]; ok {
return entity, nil
}
}
return result, autorest.DetailedError{
StatusCode: http.StatusNotFound,
Message:    "No such PIP",
}
}
func (fAPC fakeAzurePIPClient) ListNextResults(lastResults network.PublicIPAddressListResult) (result network.PublicIPAddressListResult, err error) {
fAPC.mutex.Lock()
defer fAPC.mutex.Unlock()
return network.PublicIPAddressListResult{}, nil
}
func (fAPC fakeAzurePIPClient) List(resourceGroupName string) (result network.PublicIPAddressListResult, err error) {
fAPC.mutex.Lock()
defer fAPC.mutex.Unlock()
var value []network.PublicIPAddress
if _, ok := fAPC.FakeStore[resourceGroupName]; ok {
for _, v := range fAPC.FakeStore[resourceGroupName] {
value = append(value, v)
}
}
result.Response.Response = &http.Response{
StatusCode: http.StatusOK,
}
result.NextLink = nil
result.Value = &value
return result, nil
}
type fakeAzureInterfacesClient struct {
mutex *sync.Mutex
FakeStore map[string]map[string]network.Interface
}
func newFakeAzureInterfacesClient() fakeAzureInterfacesClient {
fIC := fakeAzureInterfacesClient{}
fIC.FakeStore = make(map[string]map[string]network.Interface)
fIC.mutex = &sync.Mutex{}
return fIC
}
func (fIC fakeAzureInterfacesClient) CreateOrUpdate(resourceGroupName string, networkInterfaceName string, parameters network.Interface, cancel <-chan struct{}) (<-chan network.Interface, <-chan error) {
fIC.mutex.Lock()
defer fIC.mutex.Unlock()
resultChan := make(chan network.Interface, 1)
errChan := make(chan error, 1)
var result network.Interface
var err error
defer func() {
resultChan <- result
errChan <- err
close(resultChan)
close(errChan)
}()
if _, ok := fIC.FakeStore[resourceGroupName]; !ok {
fIC.FakeStore[resourceGroupName] = make(map[string]network.Interface)
}
fIC.FakeStore[resourceGroupName][networkInterfaceName] = parameters
result = fIC.FakeStore[resourceGroupName][networkInterfaceName]
result.Response.Response = &http.Response{
StatusCode: http.StatusOK,
}
err = nil
return resultChan, errChan
}
func (fIC fakeAzureInterfacesClient) Get(resourceGroupName string, networkInterfaceName string, expand string) (result network.Interface, err error) {
fIC.mutex.Lock()
defer fIC.mutex.Unlock()
if _, ok := fIC.FakeStore[resourceGroupName]; ok {
if entity, ok := fIC.FakeStore[resourceGroupName][networkInterfaceName]; ok {
return entity, nil
}
}
return result, autorest.DetailedError{
StatusCode: http.StatusNotFound,
Message:    "No such Interface",
}
}
type fakeAzureVirtualMachinesClient struct {
mutex *sync.Mutex
FakeStore map[string]map[string]compute.VirtualMachine
}
func newFakeAzureVirtualMachinesClient() fakeAzureVirtualMachinesClient {
fVMC := fakeAzureVirtualMachinesClient{}
fVMC.FakeStore = make(map[string]map[string]compute.VirtualMachine)
fVMC.mutex = &sync.Mutex{}
return fVMC
}
func (fVMC fakeAzureVirtualMachinesClient) CreateOrUpdate(resourceGroupName string, VMName string, parameters compute.VirtualMachine, cancel <-chan struct{}) (<-chan compute.VirtualMachine, <-chan error) {
fVMC.mutex.Lock()
defer fVMC.mutex.Unlock()
resultChan := make(chan compute.VirtualMachine, 1)
errChan := make(chan error, 1)
var result compute.VirtualMachine
var err error
defer func() {
resultChan <- result
errChan <- err
close(resultChan)
close(errChan)
}()
if _, ok := fVMC.FakeStore[resourceGroupName]; !ok {
fVMC.FakeStore[resourceGroupName] = make(map[string]compute.VirtualMachine)
}
fVMC.FakeStore[resourceGroupName][VMName] = parameters
result = fVMC.FakeStore[resourceGroupName][VMName]
result.Response.Response = &http.Response{
StatusCode: http.StatusOK,
}
err = nil
return resultChan, errChan
}
func (fVMC fakeAzureVirtualMachinesClient) Get(resourceGroupName string, VMName string, expand compute.InstanceViewTypes) (result compute.VirtualMachine, err error) {
fVMC.mutex.Lock()
defer fVMC.mutex.Unlock()
if _, ok := fVMC.FakeStore[resourceGroupName]; ok {
if entity, ok := fVMC.FakeStore[resourceGroupName][VMName]; ok {
return entity, nil
}
}
return result, autorest.DetailedError{
StatusCode: http.StatusNotFound,
Message:    "No such VM",
}
}
func (fVMC fakeAzureVirtualMachinesClient) List(resourceGroupName string) (result compute.VirtualMachineListResult, err error) {
fVMC.mutex.Lock()
defer fVMC.mutex.Unlock()
var value []compute.VirtualMachine
if _, ok := fVMC.FakeStore[resourceGroupName]; ok {
for _, v := range fVMC.FakeStore[resourceGroupName] {
value = append(value, v)
}
}
result.Response.Response = &http.Response{
StatusCode: http.StatusOK,
}
result.NextLink = nil
result.Value = &value
return result, nil
}
func (fVMC fakeAzureVirtualMachinesClient) ListNextResults(lastResults compute.VirtualMachineListResult) (result compute.VirtualMachineListResult, err error) {
fVMC.mutex.Lock()
defer fVMC.mutex.Unlock()
return compute.VirtualMachineListResult{}, nil
}
type fakeAzureSubnetsClient struct {
mutex *sync.Mutex
FakeStore map[string]map[string]network.Subnet
}
func newFakeAzureSubnetsClient() fakeAzureSubnetsClient {
fASC := fakeAzureSubnetsClient{}
fASC.FakeStore = make(map[string]map[string]network.Subnet)
fASC.mutex = &sync.Mutex{}
return fASC
}
func (fASC fakeAzureSubnetsClient) CreateOrUpdate(resourceGroupName string, virtualNetworkName string, subnetName string, subnetParameters network.Subnet, cancel <-chan struct{}) (<-chan network.Subnet, <-chan error) {
fASC.mutex.Lock()
defer fASC.mutex.Unlock()
resultChan := make(chan network.Subnet, 1)
errChan := make(chan error, 1)
var result network.Subnet
var err error
defer func() {
resultChan <- result
errChan <- err
close(resultChan)
close(errChan)
}()
rgVnet := strings.Join([]string{resourceGroupName, virtualNetworkName}, "AND")
if _, ok := fASC.FakeStore[rgVnet]; !ok {
fASC.FakeStore[rgVnet] = make(map[string]network.Subnet)
}
fASC.FakeStore[rgVnet][subnetName] = subnetParameters
result = fASC.FakeStore[rgVnet][subnetName]
result.Response.Response = &http.Response{
StatusCode: http.StatusOK,
}
err = nil
return resultChan, errChan
}
func (fASC fakeAzureSubnetsClient) Delete(resourceGroupName string, virtualNetworkName string, subnetName string, cancel <-chan struct{}) (<-chan autorest.Response, <-chan error) {
fASC.mutex.Lock()
defer fASC.mutex.Unlock()
respChan := make(chan autorest.Response, 1)
errChan := make(chan error, 1)
var resp autorest.Response
var err error
defer func() {
respChan <- resp
errChan <- err
close(respChan)
close(errChan)
}()
rgVnet := strings.Join([]string{resourceGroupName, virtualNetworkName}, "AND")
if rgSubnets, ok := fASC.FakeStore[rgVnet]; ok {
if _, ok := rgSubnets[subnetName]; ok {
delete(rgSubnets, subnetName)
resp.Response = &http.Response{
StatusCode: http.StatusAccepted,
}
err = nil
return respChan, errChan
}
}
resp.Response = &http.Response{
StatusCode: http.StatusNotFound,
}
err = autorest.DetailedError{
StatusCode: http.StatusNotFound,
Message:    "No such Subnet",
}
return respChan, errChan
}
func (fASC fakeAzureSubnetsClient) Get(resourceGroupName string, virtualNetworkName string, subnetName string, expand string) (result network.Subnet, err error) {
fASC.mutex.Lock()
defer fASC.mutex.Unlock()
rgVnet := strings.Join([]string{resourceGroupName, virtualNetworkName}, "AND")
if _, ok := fASC.FakeStore[rgVnet]; ok {
if entity, ok := fASC.FakeStore[rgVnet][subnetName]; ok {
return entity, nil
}
}
return result, autorest.DetailedError{
StatusCode: http.StatusNotFound,
Message:    "No such Subnet",
}
}
func (fASC fakeAzureSubnetsClient) List(resourceGroupName string, virtualNetworkName string) (result network.SubnetListResult, err error) {
fASC.mutex.Lock()
defer fASC.mutex.Unlock()
rgVnet := strings.Join([]string{resourceGroupName, virtualNetworkName}, "AND")
var value []network.Subnet
if _, ok := fASC.FakeStore[rgVnet]; ok {
for _, v := range fASC.FakeStore[rgVnet] {
value = append(value, v)
}
}
result.Response.Response = &http.Response{
StatusCode: http.StatusOK,
}
result.NextLink = nil
result.Value = &value
return result, nil
}
type fakeAzureNSGClient struct {
mutex *sync.Mutex
FakeStore map[string]map[string]network.SecurityGroup
}
func newFakeAzureNSGClient() fakeAzureNSGClient {
fNSG := fakeAzureNSGClient{}
fNSG.FakeStore = make(map[string]map[string]network.SecurityGroup)
fNSG.mutex = &sync.Mutex{}
return fNSG
}
func (fNSG fakeAzureNSGClient) CreateOrUpdate(resourceGroupName string, networkSecurityGroupName string, parameters network.SecurityGroup, cancel <-chan struct{}) (<-chan network.SecurityGroup, <-chan error) {
fNSG.mutex.Lock()
defer fNSG.mutex.Unlock()
resultChan := make(chan network.SecurityGroup, 1)
errChan := make(chan error, 1)
var result network.SecurityGroup
var err error
defer func() {
resultChan <- result
errChan <- err
close(resultChan)
close(errChan)
}()
if _, ok := fNSG.FakeStore[resourceGroupName]; !ok {
fNSG.FakeStore[resourceGroupName] = make(map[string]network.SecurityGroup)
}
fNSG.FakeStore[resourceGroupName][networkSecurityGroupName] = parameters
result = fNSG.FakeStore[resourceGroupName][networkSecurityGroupName]
result.Response.Response = &http.Response{
StatusCode: http.StatusOK,
}
err = nil
return resultChan, errChan
}
func (fNSG fakeAzureNSGClient) Delete(resourceGroupName string, networkSecurityGroupName string, cancel <-chan struct{}) (<-chan autorest.Response, <-chan error) {
fNSG.mutex.Lock()
defer fNSG.mutex.Unlock()
respChan := make(chan autorest.Response, 1)
errChan := make(chan error, 1)
var resp autorest.Response
var err error
defer func() {
respChan <- resp
errChan <- err
close(respChan)
close(errChan)
}()
if rgSGs, ok := fNSG.FakeStore[resourceGroupName]; ok {
if _, ok := rgSGs[networkSecurityGroupName]; ok {
delete(rgSGs, networkSecurityGroupName)
resp.Response = &http.Response{
StatusCode: http.StatusAccepted,
}
err = nil
return respChan, errChan
}
}
resp.Response = &http.Response{
StatusCode: http.StatusNotFound,
}
err = autorest.DetailedError{
StatusCode: http.StatusNotFound,
Message:    "No such NSG",
}
return respChan, errChan
}
func (fNSG fakeAzureNSGClient) Get(resourceGroupName string, networkSecurityGroupName string, expand string) (result network.SecurityGroup, err error) {
fNSG.mutex.Lock()
defer fNSG.mutex.Unlock()
if _, ok := fNSG.FakeStore[resourceGroupName]; ok {
if entity, ok := fNSG.FakeStore[resourceGroupName][networkSecurityGroupName]; ok {
return entity, nil
}
}
return result, autorest.DetailedError{
StatusCode: http.StatusNotFound,
Message:    "No such NSG",
}
}
func (fNSG fakeAzureNSGClient) List(resourceGroupName string) (result network.SecurityGroupListResult, err error) {
fNSG.mutex.Lock()
defer fNSG.mutex.Unlock()
var value []network.SecurityGroup
if _, ok := fNSG.FakeStore[resourceGroupName]; ok {
for _, v := range fNSG.FakeStore[resourceGroupName] {
value = append(value, v)
}
}
result.Response.Response = &http.Response{
StatusCode: http.StatusOK,
}
result.NextLink = nil
result.Value = &value
return result, nil
}
func getRandomIPPtr() *string {
rand.Seed(time.Now().UnixNano())
return to.StringPtr(fmt.Sprintf("%d.%d.%d.%d", rand.Intn(256), rand.Intn(256), rand.Intn(256), rand.Intn(256)))
}

View File

@ -48,19 +48,10 @@ func (az *Cloud) NodeAddresses(name types.NodeName) ([]v1.NodeAddress, error) {
}
return addresses, nil
}
ip, err := az.getIPForMachine(name)
ip, err := az.GetIPForMachineWithRetry(name)
if err != nil {
if az.CloudProviderBackoff {
glog.V(2).Infof("NodeAddresses(%s) backing off", name)
ip, err = az.GetIPForMachineWithRetry(name)
if err != nil {
glog.V(2).Infof("NodeAddresses(%s) abort backoff", name)
return nil, err
}
} else {
glog.Errorf("error: az.NodeAddresses, az.getIPForMachine(%s), err=%v", name, err)
return nil, err
}
glog.V(2).Infof("NodeAddresses(%s) abort backoff", name)
return nil, err
}
return []v1.NodeAddress{
@ -199,39 +190,6 @@ func (az *Cloud) CurrentNodeName(hostname string) (types.NodeName, error) {
return types.NodeName(hostname), nil
}
func (az *Cloud) listAllNodesInResourceGroup() ([]compute.VirtualMachine, error) {
allNodes := []compute.VirtualMachine{}
az.operationPollRateLimiter.Accept()
glog.V(10).Infof("VirtualMachinesClient.List(%s): start", az.ResourceGroup)
result, err := az.VirtualMachinesClient.List(az.ResourceGroup)
glog.V(10).Infof("VirtualMachinesClient.List(%s): end", az.ResourceGroup)
if err != nil {
glog.Errorf("error: az.listAllNodesInResourceGroup(), az.VirtualMachinesClient.List(%s), err=%v", az.ResourceGroup, err)
return nil, err
}
morePages := (result.Value != nil && len(*result.Value) > 1)
for morePages {
allNodes = append(allNodes, *result.Value...)
az.operationPollRateLimiter.Accept()
glog.V(10).Infof("VirtualMachinesClient.ListAllNextResults(%v): start", az.ResourceGroup)
result, err = az.VirtualMachinesClient.ListAllNextResults(result)
glog.V(10).Infof("VirtualMachinesClient.ListAllNextResults(%v): end", az.ResourceGroup)
if err != nil {
glog.Errorf("error: az.listAllNodesInResourceGroup(), az.VirtualMachinesClient.ListAllNextResults(%v), err=%v", result, err)
return nil, err
}
morePages = (result.Value != nil && len(*result.Value) > 1)
}
return allNodes, nil
}
// mapNodeNameToVMName maps a k8s NodeName to an Azure VM Name
// This is a simple string cast.
func mapNodeNameToVMName(nodeName types.NodeName) string {

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,77 @@
# Azure LoadBalancer
Azure defines load balancers differently from GCE and AWS. An Azure LB can have multiple frontend IP references, whereas GCE and AWS allow only one per LB; if you want more, you need another LB. Because of this, the Public IP is not part of the LB in Azure, and neither is the NSG. However, they cannot be deleted in parallel: a Public IP can only be deleted after the LB's frontend IP reference to it has been removed.
LB, Public IP and NSG are top-level Azure resources of the same tier. We need to make sure there is no coupling between their ensure loops; in other words, each resource is eventually reconciled regardless of the state of the others, and depends only on the service state.
Despite the ideal philosophy above, we have to face reality: the NSG depends on the LB's frontend IP to adjust its rules, so when we reconcile the NSG, the LB should already contain the corresponding frontend IP configuration.
Also, on Azure we cannot afford more than one service_controller worker, because different services can operate on the same LB and concurrent execution could cause conflicts or unexpected results. AWS and GCE apparently do not have this problem: they use one LB per service, so there is no such conflict.
There are two load balancers per availability set: internal and external. There is a limit on the number of services that can be associated with a single load balancer.
By default the primary load balancer is selected. Services can be annotated to allow automatic selection among the available load balancers, or to name specific availability sets that host the load balancers. Note that with auto selection or a specific availability set selection, if the availability set is lost due to downtime or a cluster scale-down, the services are currently not automatically reassigned to an available load balancer.
Service annotation for auto and specific load balancer mode
- service.beta.kubernetes.io/azure-load-balancer-mode (__auto__ | as1,as2...); a minimal parsing sketch is shown below
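The sketch below shows one way this annotation could be read and interpreted. The helper is an illustrative stand-in written for this document; it is not the provider's actual getServiceLoadBalancerMode implementation, and the parsing details are assumptions.
```go
package main

import (
	"fmt"
	"strings"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/api/core/v1"
)

// loadBalancerModeAnnotation is the annotation key described above.
const loadBalancerModeAnnotation = "service.beta.kubernetes.io/azure-load-balancer-mode"

// getMode is an illustrative stand-in for the provider's annotation parsing:
// it reports whether the annotation is present, whether auto selection was
// requested, and otherwise the requested availability set names.
func getMode(svc *v1.Service) (hasMode bool, isAuto bool, availabilitySets []string) {
	mode, ok := svc.Annotations[loadBalancerModeAnnotation]
	if !ok {
		return false, false, nil
	}
	if strings.EqualFold(strings.TrimSpace(mode), "__auto__") {
		return true, true, nil
	}
	var sets []string
	for _, as := range strings.Split(mode, ",") {
		sets = append(sets, strings.ToLower(strings.TrimSpace(as)))
	}
	return true, false, sets
}

func main() {
	svc := &v1.Service{ObjectMeta: metav1.ObjectMeta{
		Annotations: map[string]string{loadBalancerModeAnnotation: "as1,as2"},
	}}
	fmt.Println(getMode(svc)) // true false [as1 as2]
}
```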
## Introduce Functions
- reconcileLoadBalancer(clusterName string, service *v1.Service, nodes []*v1.Node, wantLb bool) (*network.LoadBalancer, error)
- Go through the LB's properties and update them based on wantLb
- If there is any change to the LB, whether or not the LB already exists
- Call the Azure cloud API to CreateOrUpdate this LB, or Delete it if nothing is left
- return lb, err
- reconcileSecurityGroup(clusterName string, service *v1.Service, lbIP *string, wantLb bool) (*network.SecurityGroup, error)
- Go through the NSG's properties and update them based on wantLb
- Use destinationIPAddress as target address if possible
- Consolidate NSG rules if possible
- If there is any change to the NSG (the NSG should always exist)
- Call az cloud to CreateOrUpdate on this NSG
- return sg, err
- reconcilePublicIP(clusterName string, service *v1.Service, wantLb bool) (*network.PublicIPAddress, error)
- List all the public IPs in the resource group
- Make sure we only touch Public IP resources that have tags[service] = "namespace/serviceName"
- skip for wantLb && !isInternal && pipName == desiredPipName
- delete any other public IP resources
- if !isInternal && wantLb
- ensure the Public IP with desiredPipName exists
- getServiceLoadBalancer(service *v1.Service, clusterName string, nodes []*v1.Node, wantLb bool) (lb, status, exists, error)
- Gets the load balancer for the service if it already exists
- If wantLb is TRUE then it selects a new load balancer; the selection helps distribute the services across load balancers
- In case the selected load balancer does not exist it returns a network.LoadBalancer struct with added metadata (such as name and location) and existsLB set to FALSE
- By default the cluster's default LB is returned
## Define interface behaviors
### GetLoadBalancer
- Get LoadBalancer status, return status, error
- return the load balancer status for this service
- it will not create or update or delete any resource
### EnsureLoadBalancer
- Reconcile LB for the flipped service
- Call reconcileLoadBalancer(clusterName, flipedService, nil, false/* wantLb */)
- Reconcile Public IP
- Call reconcilePublicIP(cluster, service, true)
- Reconcile LB's related and owned resources, such as FrontEndIPConfig, Rules, Probe.
- Call reconcileLoadBalancer(clusterName, service, nodes, true /* wantLb */)
- Reconcile NSG rules; this needs to be called after the LB is reconciled
- Call reconcileSecurityGroup(clusterName, service, lbStatus, true /* wantLb */)
### UpdateLoadBalancer
- Behaves the same as EnsureLoadBalancer
### EnsureLoadBalancerDeleted
- Reconcile the NSG first, before reconciling the LB, because the NSG rules need the LB to still be there
- Call reconcileSecurityGroup(clusterName, service, nil, false /* wantLb */)
- Reconcile LB's related and owned resources, such as FrontEndIPConfig, Rules, Probe.
- Call reconcileLoadBalancer(clusterName, service, nodes, false)
- Reconcile the Public IP; the Public IP needs the related LB to be reconciled first (the overall call ordering is sketched below)
- Call reconcilePublicIP(cluster, service, false)
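To make these ordering constraints concrete, here is a minimal, self-contained sketch. The stub functions only print the order in which the real reconcile functions would be invoked under the rules above; they are placeholders written for this document, not the cloud provider implementation.
```go
package main

import "fmt"

// Stubs standing in for the provider's reconcile functions (illustrative only).
func reconcileLoadBalancer(wantLb bool)  { fmt.Println("reconcileLoadBalancer, wantLb =", wantLb) }
func reconcileSecurityGroup(wantLb bool) { fmt.Println("reconcileSecurityGroup, wantLb =", wantLb) }
func reconcilePublicIP(wantLb bool)      { fmt.Println("reconcilePublicIP, wantLb =", wantLb) }

// ensureLoadBalancer mirrors the EnsureLoadBalancer ordering described above:
// clean up the flipped service's LB, then Public IP and LB, and the NSG last,
// because the NSG rules need the LB frontend IP to exist.
func ensureLoadBalancer() {
	reconcileLoadBalancer(false) // flipped service cleanup
	reconcilePublicIP(true)
	reconcileLoadBalancer(true)
	reconcileSecurityGroup(true)
}

// ensureLoadBalancerDeleted mirrors the reverse ordering: the NSG first (it
// needs the LB to still be there), then the LB, and the Public IP only after
// the LB's frontend IP reference has been removed.
func ensureLoadBalancerDeleted() {
	reconcileSecurityGroup(false)
	reconcileLoadBalancer(false)
	reconcilePublicIP(false)
}

func main() {
	ensureLoadBalancer()
	ensureLoadBalancerDeleted()
}
```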

File diff suppressed because it is too large Load Diff

View File

@ -21,6 +21,7 @@ import (
"fmt"
"hash/crc32"
"regexp"
"sort"
"strconv"
"strings"
@ -31,6 +32,7 @@ import (
"github.com/Azure/azure-sdk-for-go/arm/network"
"github.com/golang/glog"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/sets"
)
const (
@ -44,6 +46,12 @@ const (
loadBalancerRuleIDTemplate = "/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Network/loadBalancers/%s/loadBalancingRules/%s"
loadBalancerProbeIDTemplate = "/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Network/loadBalancers/%s/probes/%s"
securityRuleIDTemplate = "/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Network/networkSecurityGroups/%s/securityRules/%s"
// InternalLoadBalancerNameSuffix is the suffix appended to the internal load balancer name
InternalLoadBalancerNameSuffix = "-internal"
// nodeLabelRole specifies the role of a node
nodeLabelRole = "kubernetes.io/role"
)
var providerIDRE = regexp.MustCompile(`^` + CloudProviderName + `://(?:.*)/Microsoft.Compute/virtualMachines/(.+)$`)
@ -116,6 +124,143 @@ func (az *Cloud) getSecurityRuleID(securityRuleName string) string {
securityRuleName)
}
// returns the full identifier of a publicIPAddress.
func (az *Cloud) getpublicIPAddressID(pipName string) string {
return fmt.Sprintf(
publicIPAddressIDTemplate,
az.SubscriptionID,
az.ResourceGroup,
pipName)
}
// getLoadBalancerAvailabilitySetNames selects all possible availability sets for the
// service load balancer. If the service has no load balancer mode annotation, it returns
// the primary availability set. If the annotation requests auto selection, it returns all
// agent pool availability sets; if it names specific availability sets, it returns the eligible ones among them.
func (az *Cloud) getLoadBalancerAvailabilitySetNames(service *v1.Service, nodes []*v1.Node) (availabilitySetNames *[]string, err error) {
hasMode, isAuto, serviceAvailabilitySetNames := getServiceLoadBalancerMode(service)
if !hasMode {
// no mode specified in service annotation default to PrimaryAvailabilitySetName
availabilitySetNames = &[]string{az.Config.PrimaryAvailabilitySetName}
return availabilitySetNames, nil
}
availabilitySetNames, err = az.getAgentPoolAvailabiliySets(nodes)
if err != nil {
glog.Errorf("az.getLoadBalancerAvailabilitySetNames - getAgentPoolAvailabiliySets failed err=(%v)", err)
return nil, err
}
if len(*availabilitySetNames) == 0 {
glog.Errorf("az.getLoadBalancerAvailabilitySetNames - No availability sets found for nodes in the cluster, node count(%d)", len(nodes))
return nil, fmt.Errorf("No availability sets found for nodes, node count(%d)", len(nodes))
}
// sort the list to have deterministic selection
sort.Strings(*availabilitySetNames)
if !isAuto {
if serviceAvailabilitySetNames == nil || len(serviceAvailabilitySetNames) == 0 {
return nil, fmt.Errorf("service annotation for LoadBalancerMode is empty, it should have __auto__ or availability sets value")
}
// validate that each requested availability set exists
for sasx := range serviceAvailabilitySetNames {
var found bool
for asx := range *availabilitySetNames {
if strings.EqualFold((*availabilitySetNames)[asx], serviceAvailabilitySetNames[sasx]) {
found = true
serviceAvailabilitySetNames[sasx] = (*availabilitySetNames)[asx]
break
}
}
if !found {
glog.Errorf("az.getLoadBalancerAvailabilitySetNames - Availability set (%s) in service annotation not found", serviceAvailabilitySetNames[sasx])
return nil, fmt.Errorf("availability set (%s) - not found", serviceAvailabilitySetNames[sasx])
}
}
availabilitySetNames = &serviceAvailabilitySetNames
}
return availabilitySetNames, nil
}
// lists the virtual machines for the resource group and then builds
// a list of availability sets that match the nodes available to k8s
func (az *Cloud) getAgentPoolAvailabiliySets(nodes []*v1.Node) (agentPoolAvailabilitySets *[]string, err error) {
vms, err := az.VirtualMachineClientListWithRetry()
if err != nil {
glog.Errorf("az.getNodeAvailabilitySet - VirtualMachineClientListWithRetry failed, err=%v", err)
return nil, err
}
vmNameToAvailabilitySetID := make(map[string]string, len(vms))
for vmx := range vms {
vm := vms[vmx]
if vm.AvailabilitySet != nil {
vmNameToAvailabilitySetID[*vm.Name] = *vm.AvailabilitySet.ID
}
}
availabilitySetIDs := sets.NewString()
agentPoolAvailabilitySets = &[]string{}
for nx := range nodes {
nodeName := (*nodes[nx]).Name
if isMasterNode(nodes[nx]) {
continue
}
asID, ok := vmNameToAvailabilitySetID[nodeName]
if !ok {
glog.Errorf("az.getNodeAvailabilitySet - Node(%s) has no availability sets", nodeName)
return nil, fmt.Errorf("Node (%s) - has no availability sets", nodeName)
}
if availabilitySetIDs.Has(asID) {
// already added in the list
continue
}
asName, err := getLastSegment(asID)
if err != nil {
glog.Errorf("az.getNodeAvailabilitySet - Node (%s)- getLastSegment(%s), err=%v", nodeName, asID, err)
return nil, err
}
// AvailabilitySet IDs are currently upper-cased in a non-deterministic way.
// We want to keep them lower case until the ID casing is fixed
asName = strings.ToLower(asName)
*agentPoolAvailabilitySets = append(*agentPoolAvailabilitySets, asName)
}
return agentPoolAvailabilitySets, nil
}
func (az *Cloud) mapLoadBalancerNameToAvailabilitySet(lbName string, clusterName string) (availabilitySetName string) {
availabilitySetName = strings.TrimSuffix(lbName, InternalLoadBalancerNameSuffix)
if strings.EqualFold(clusterName, lbName) {
availabilitySetName = az.Config.PrimaryAvailabilitySetName
}
return availabilitySetName
}
// For a load balancer, all frontend IP configurations should reference either a subnet or a publicIpAddress.
// Azure therefore does not allow a mixed-type (public and internal) load balancer.
// So we use a separate name for the internal load balancer.
// This is the name of the Azure LoadBalancer resource.
func (az *Cloud) getLoadBalancerName(clusterName string, availabilitySetName string, isInternal bool) string {
lbNamePrefix := availabilitySetName
if strings.EqualFold(availabilitySetName, az.Config.PrimaryAvailabilitySetName) {
lbNamePrefix = clusterName
}
if isInternal {
return fmt.Sprintf("%s%s", lbNamePrefix, InternalLoadBalancerNameSuffix)
}
return lbNamePrefix
}
// isMasterNode returns true if the node has a master role label.
// The master role is determined by looking for:
// * a kubernetes.io/role="master" label
func isMasterNode(node *v1.Node) bool {
if val, ok := node.Labels[nodeLabelRole]; ok && val == "master" {
return true
}
return false
}
// returns the deepest child's identifier from a full identifier string.
func getLastSegment(ID string) (string, error) {
parts := strings.Split(ID, "/")
@ -179,16 +324,8 @@ func getPrimaryIPConfig(nic network.Interface) (*network.InterfaceIPConfiguratio
return nil, fmt.Errorf("failed to determine the primary ipconfig. nicname=%q", *nic.Name)
}
// For a load balancer, all frontend ip should reference either a subnet or publicIpAddress.
// Thus Azure do not allow mixed type (public and internal) load balancer.
// So we'd have a separate name for internal load balancer.
// This would be the name for Azure LoadBalancer resource.
func getLoadBalancerName(clusterName string, isInternal bool) string {
if isInternal {
return fmt.Sprintf("%s-internal", clusterName)
}
return clusterName
func isInternalLoadBalancer(lb *network.LoadBalancer) bool {
return strings.HasSuffix(*lb.Name, InternalLoadBalancerNameSuffix)
}
func getBackendPoolName(clusterName string) string {
@ -203,6 +340,10 @@ func getLoadBalancerRuleName(service *v1.Service, port v1.ServicePort, subnetNam
}
func getSecurityRuleName(service *v1.Service, port v1.ServicePort, sourceAddrPrefix string) string {
if useSharedSecurityRule(service) {
safePrefix := strings.Replace(sourceAddrPrefix, "/", "_", -1)
return fmt.Sprintf("shared-%s-%d-%s", port.Protocol, port.Port, safePrefix)
}
safePrefix := strings.Replace(sourceAddrPrefix, "/", "_", -1)
return fmt.Sprintf("%s-%s-%d-%s", getRulePrefix(service), port.Protocol, port.Port, safePrefix)
}

View File

@ -40,6 +40,19 @@ func checkResourceExistsFromError(err error) (bool, error) {
return false, v
}
// ignoreStatusNotFoundFromError returns nil if err is an autorest.DetailedError
// with StatusCode StatusNotFound; otherwise it returns err unchanged.
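// Callers can use this to treat a NotFound response as success, for example
// when deleting a resource that is already gone.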
func ignoreStatusNotFoundFromError(err error) error {
if err == nil {
return nil
}
v, ok := err.(autorest.DetailedError)
if ok && v.StatusCode == http.StatusNotFound {
return nil
}
return err
}
func (az *Cloud) getVirtualMachine(nodeName types.NodeName) (vm compute.VirtualMachine, exists bool, err error) {
var realErr error
@ -103,7 +116,6 @@ func (az *Cloud) getSecurityGroup() (sg network.SecurityGroup, exists bool, err
func (az *Cloud) getAzureLoadBalancer(name string) (lb network.LoadBalancer, exists bool, err error) {
var realErr error
az.operationPollRateLimiter.Accept()
glog.V(10).Infof("LoadBalancerClient.Get(%s): start", name)
lb, err = az.LoadBalancerClient.Get(az.ResourceGroup, name, "")
@ -121,6 +133,25 @@ func (az *Cloud) getAzureLoadBalancer(name string) (lb network.LoadBalancer, exi
return lb, exists, err
}
func (az *Cloud) listLoadBalancers() (lbListResult network.LoadBalancerListResult, exists bool, err error) {
var realErr error
az.operationPollRateLimiter.Accept()
glog.V(10).Infof("LoadBalancerClient.List(%s): start", az.ResourceGroup)
lbListResult, err = az.LoadBalancerClient.List(az.ResourceGroup)
glog.V(10).Infof("LoadBalancerClient.List(%s): end", az.ResourceGroup)
exists, realErr = checkResourceExistsFromError(err)
if realErr != nil {
return lbListResult, false, realErr
}
if !exists {
return lbListResult, false, nil
}
return lbListResult, exists, err
}
func (az *Cloud) getPublicIPAddress(name string) (pip network.PublicIPAddress, exists bool, err error) {
var realErr error

View File

@ -9,6 +9,7 @@ load(
go_library(
name = "go_default_library",
srcs = [
"nodemanager.go",
"vsphere.go",
"vsphere_util.go",
],
@ -21,13 +22,15 @@ go_library(
"//pkg/controller:go_default_library",
"//vendor/github.com/golang/glog:go_default_library",
"//vendor/github.com/vmware/govmomi:go_default_library",
"//vendor/github.com/vmware/govmomi/object:go_default_library",
"//vendor/github.com/vmware/govmomi/vim25:go_default_library",
"//vendor/github.com/vmware/govmomi/vim25/mo:go_default_library",
"//vendor/golang.org/x/net/context:go_default_library",
"//vendor/gopkg.in/gcfg.v1:go_default_library",
"//vendor/k8s.io/api/core/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/types:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/wait:go_default_library",
"//vendor/k8s.io/client-go/informers:go_default_library",
"//vendor/k8s.io/client-go/tools/cache:go_default_library",
],
)

View File

@ -0,0 +1,295 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package vsphere
import (
"fmt"
"github.com/golang/glog"
"golang.org/x/net/context"
"k8s.io/api/core/v1"
k8stypes "k8s.io/apimachinery/pkg/types"
"k8s.io/kubernetes/pkg/cloudprovider/providers/vsphere/vclib"
"strings"
"sync"
)
// NodeInfo stores info about the kubernetes node
type NodeInfo struct {
dataCenter *vclib.Datacenter
vm *vclib.VirtualMachine
vcServer string
}
type NodeManager struct {
// TODO: replace map with concurrent map when k8s supports go v1.9
// Maps the VC server to VSphereInstance
vsphereInstanceMap map[string]*VSphereInstance
// Maps node name to node info.
nodeInfoMap map[string]*NodeInfo
// Maps node name to node structure
registeredNodes map[string]*v1.Node
// Mutexes
registeredNodesLock sync.RWMutex
nodeInfoLock sync.RWMutex
}
type NodeDetails struct {
NodeName string
vm *vclib.VirtualMachine
}
// TODO: Make it configurable in vsphere.conf
const (
POOL_SIZE = 8
QUEUE_SIZE = POOL_SIZE * 10
)
func (nm *NodeManager) DiscoverNode(node *v1.Node) error {
type VmSearch struct {
vc string
datacenter *vclib.Datacenter
}
var mutex = &sync.Mutex{}
var globalErrMutex = &sync.Mutex{}
var queueChannel chan *VmSearch
var wg sync.WaitGroup
var globalErr *error
queueChannel = make(chan *VmSearch, QUEUE_SIZE)
nodeUUID := node.Status.NodeInfo.SystemUUID
vmFound := false
globalErr = nil
setGlobalErr := func(err error) {
globalErrMutex.Lock()
globalErr = &err
globalErrMutex.Unlock()
}
setVMFound := func(found bool) {
mutex.Lock()
vmFound = found
mutex.Unlock()
}
getVMFound := func() bool {
mutex.Lock()
found := vmFound
mutex.Unlock()
return found
}
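// Producer goroutine: walk every configured vCenter and its datacenters,
// queueing one search target per datacenter, and stop early once the VM is found.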
go func() {
var datacenterObjs []*vclib.Datacenter
for vc, vsi := range nm.vsphereInstanceMap {
found := getVMFound()
if found {
break
}
// Create context
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
err := vsi.conn.Connect(ctx)
if err != nil {
glog.V(4).Info("Discovering node error vc:", err)
setGlobalErr(err)
continue
}
if vsi.cfg.Datacenters == "" {
datacenterObjs, err = vclib.GetAllDatacenter(ctx, vsi.conn)
if err != nil {
glog.V(4).Info("Discovering node error dc:", err)
setGlobalErr(err)
continue
}
} else {
datacenters := strings.Split(vsi.cfg.Datacenters, ",")
for _, dc := range datacenters {
dc = strings.TrimSpace(dc)
if dc == "" {
continue
}
datacenterObj, err := vclib.GetDatacenter(ctx, vsi.conn, dc)
if err != nil {
glog.V(4).Info("Discovering node error dc:", err)
setGlobalErr(err)
continue
}
datacenterObjs = append(datacenterObjs, datacenterObj)
}
}
for _, datacenterObj := range datacenterObjs {
found := getVMFound()
if found {
break
}
glog.V(4).Infof("Finding node %s in vc=%s and datacenter=%s", node.Name, vc, datacenterObj.Name())
queueChannel <- &VmSearch{
vc: vc,
datacenter: datacenterObj,
}
}
}
close(queueChannel)
}()
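// Consumer workers: POOL_SIZE goroutines take datacenters off the queue and
// search each one for the node's VM by UUID.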
for i := 0; i < POOL_SIZE; i++ {
wg.Add(1)
go func() {
for res := range queueChannel {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
vm, err := res.datacenter.GetVMByUUID(ctx, nodeUUID)
if err != nil {
glog.V(4).Infof("Error %q while looking for vm=%+v in vc=%s and datacenter=%s",
err, node.Name, vm, res.vc, res.datacenter.Name())
if err != vclib.ErrNoVMFound {
setGlobalErr(err)
} else {
glog.V(4).Infof("Did not find node %s in vc=%s and datacenter=%s",
node.Name, res.vc, res.datacenter.Name(), err)
}
continue
}
if vm != nil {
glog.V(4).Infof("Found node %s as vm=%+v in vc=%s and datacenter=%s",
node.Name, vm, res.vc, res.datacenter.Name())
nodeInfo := &NodeInfo{dataCenter: res.datacenter, vm: vm, vcServer: res.vc}
nm.addNodeInfo(node.ObjectMeta.Name, nodeInfo)
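// Drain the remaining queue so the producer is not blocked on sends and the
// other workers exit once the channel is closed.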
for range queueChannel {
}
setVMFound(true)
break
}
}
wg.Done()
}()
}
wg.Wait()
if vmFound {
return nil
}
if globalErr != nil {
return *globalErr
}
glog.V(4).Infof("Discovery Node: %q vm not found", node.Name)
return vclib.ErrNoVMFound
}
func (nm *NodeManager) RegisterNode(node *v1.Node) error {
nm.addNode(node)
nm.DiscoverNode(node)
return nil
}
func (nm *NodeManager) UnRegisterNode(node *v1.Node) error {
nm.removeNode(node)
return nil
}
func (nm *NodeManager) RediscoverNode(nodeName k8stypes.NodeName) error {
node, err := nm.GetNode(nodeName)
if err != nil {
return err
}
return nm.DiscoverNode(&node)
}
func (nm *NodeManager) GetNode(nodeName k8stypes.NodeName) (v1.Node, error) {
nm.registeredNodesLock.RLock()
node := nm.registeredNodes[convertToString(nodeName)]
nm.registeredNodesLock.RUnlock()
if node == nil {
return v1.Node{}, vclib.ErrNoVMFound
}
return *node, nil
}
func (nm *NodeManager) addNode(node *v1.Node) {
nm.registeredNodesLock.Lock()
nm.registeredNodes[node.ObjectMeta.Name] = node
nm.registeredNodesLock.Unlock()
}
func (nm *NodeManager) removeNode(node *v1.Node) {
nm.registeredNodesLock.Lock()
delete(nm.registeredNodes, node.ObjectMeta.Name)
nm.registeredNodesLock.Unlock()
}
// GetNodeInfo returns a NodeInfo containing the datacenter, VM and vCenter server address for the given node.
// This method returns an error if it is unable to find the node in the VCs and DCs listed in vsphere.conf.
// The returned NodeInfo may not be up to date with the VM's current location.
func (nm *NodeManager) GetNodeInfo(nodeName k8stypes.NodeName) (NodeInfo, error) {
getNodeInfo := func(nodeName k8stypes.NodeName) *NodeInfo {
nm.nodeInfoLock.RLock()
nodeInfo := nm.nodeInfoMap[convertToString(nodeName)]
nm.nodeInfoLock.RUnlock()
return nodeInfo
}
nodeInfo := getNodeInfo(nodeName)
if nodeInfo == nil {
err := nm.RediscoverNode(nodeName)
if err != nil {
glog.V(4).Infof("error %q node info for node %q not found", err, convertToString(nodeName))
return NodeInfo{}, err
}
nodeInfo = getNodeInfo(nodeName)
}
return *nodeInfo, nil
}
func (nm *NodeManager) GetNodeDetails() []NodeDetails {
nm.nodeInfoLock.RLock()
defer nm.nodeInfoLock.RUnlock()
var nodeDetails []NodeDetails
for nodeName, nodeInfo := range nm.nodeInfoMap {
nodeDetails = append(nodeDetails, NodeDetails{nodeName, nodeInfo.vm})
}
return nodeDetails
}
func (nm *NodeManager) addNodeInfo(nodeName string, nodeInfo *NodeInfo) {
nm.nodeInfoLock.Lock()
nm.nodeInfoMap[nodeName] = nodeInfo
nm.nodeInfoLock.Unlock()
}
func (nm *NodeManager) GetVSphereInstance(nodeName k8stypes.NodeName) (VSphereInstance, error) {
nodeInfo, err := nm.GetNodeInfo(nodeName)
if err != nil {
glog.V(4).Infof("node info for node %q not found", convertToString(nodeName))
return VSphereInstance{}, err
}
vsphereInstance := nm.vsphereInstanceMap[nodeInfo.vcServer]
if vsphereInstance == nil {
return VSphereInstance{}, fmt.Errorf("vSphereInstance for vc server %q not found while looking for node %q", nodeInfo.vcServer, convertToString(nodeName))
}
return *vsphereInstance, nil
}

View File

@ -25,6 +25,7 @@ const (
NoDevicesFoundErrMsg = "No devices found"
DiskNotFoundErrMsg = "No vSphere disk ID found"
InvalidVolumeOptionsErrMsg = "VolumeOptions verification failed"
NoVMFoundErrMsg = "No VM found"
)
// Error constants
@ -34,4 +35,5 @@ var (
ErrNoDevicesFound = errors.New(NoDevicesFoundErrMsg)
ErrNoDiskIDFound = errors.New(DiskNotFoundErrMsg)
ErrInvalidVolumeOptions = errors.New(InvalidVolumeOptionsErrMsg)
ErrNoVMFound = errors.New(NoVMFoundErrMsg)
)

View File

@ -49,6 +49,22 @@ func GetDatacenter(ctx context.Context, connection *VSphereConnection, datacente
return &dc, nil
}
// GetAllDatacenter returns all the Datacenter objects
func GetAllDatacenter(ctx context.Context, connection *VSphereConnection) ([]*Datacenter, error) {
var dc []*Datacenter
finder := find.NewFinder(connection.GoVmomiClient.Client, true)
datacenters, err := finder.DatacenterList(ctx, "*")
if err != nil {
glog.Errorf("Failed to find the datacenter. err: %+v", err)
return nil, err
}
for _, datacenter := range datacenters {
dc = append(dc, &(Datacenter{datacenter}))
}
return dc, nil
}
// GetVMByUUID gets the VM object from the given vmUUID
func (dc *Datacenter) GetVMByUUID(ctx context.Context, vmUUID string) (*VirtualMachine, error) {
s := object.NewSearchIndex(dc.Client())
@ -60,7 +76,7 @@ func (dc *Datacenter) GetVMByUUID(ctx context.Context, vmUUID string) (*VirtualM
}
if svm == nil {
glog.Errorf("Unable to find VM by UUID. VM UUID: %s", vmUUID)
return nil, fmt.Errorf("Failed to find VM by UUID: %s", vmUUID)
return nil, ErrNoVMFound
}
virtualMachine := VirtualMachine{object.NewVirtualMachine(dc.Client(), svm.Reference()), dc}
return &virtualMachine, nil
@ -79,6 +95,41 @@ func (dc *Datacenter) GetVMByPath(ctx context.Context, vmPath string) (*VirtualM
return &virtualMachine, nil
}
// GetAllDatastores gets the datastore URL to DatastoreInfo map for all the datastores in
// the datacenter.
func (dc *Datacenter) GetAllDatastores(ctx context.Context) (map[string]*DatastoreInfo, error) {
finder := getFinder(dc)
datastores, err := finder.DatastoreList(ctx, "*")
if err != nil {
glog.Errorf("Failed to get all the datastores. err: %+v", err)
return nil, err
}
var dsList []types.ManagedObjectReference
for _, ds := range datastores {
dsList = append(dsList, ds.Reference())
}
var dsMoList []mo.Datastore
pc := property.DefaultCollector(dc.Client())
properties := []string{DatastoreInfoProperty}
err = pc.Retrieve(ctx, dsList, properties, &dsMoList)
if err != nil {
glog.Errorf("Failed to get Datastore managed objects from datastore objects."+
" dsObjList: %+v, properties: %+v, err: %v", dsList, properties, err)
return nil, err
}
dsURLInfoMap := make(map[string]*DatastoreInfo)
for _, dsMo := range dsMoList {
dsURLInfoMap[dsMo.Info.GetDatastoreInfo().Url] = &DatastoreInfo{
&Datastore{object.NewDatastore(dc.Client(), dsMo.Reference()),
dc},
dsMo.Info.GetDatastoreInfo()}
}
glog.V(9).Infof("dsURLInfoMap : %+v", dsURLInfoMap)
return dsURLInfoMap, nil
}
// GetDatastoreByPath gets the Datastore object from the given vmDiskPath
func (dc *Datacenter) GetDatastoreByPath(ctx context.Context, vmDiskPath string) (*Datastore, error) {
datastorePathObj := new(object.DatastorePath)
@ -109,6 +160,23 @@ func (dc *Datacenter) GetDatastoreByName(ctx context.Context, name string) (*Dat
return &datastore, nil
}
// GetResourcePool gets the resource pool for the given path
func (dc *Datacenter) GetResourcePool(ctx context.Context, computePath string) (*object.ResourcePool, error) {
finder := getFinder(dc)
var computeResource *object.ComputeResource
var err error
if computePath == "" {
computeResource, err = finder.DefaultComputeResource(ctx)
} else {
computeResource, err = finder.ComputeResource(ctx, computePath)
}
if err != nil {
glog.Errorf("Failed to get the ResourcePool for computePath '%s'. err: %+v", computePath, err)
return nil, err
}
return computeResource.ResourcePool(ctx)
}
// GetFolderByPath gets the Folder Object from the given folder path
// folderPath should be the full path to folder
func (dc *Datacenter) GetFolderByPath(ctx context.Context, folderPath string) (*Folder, error) {

View File

@ -17,6 +17,7 @@ limitations under the License.
package vclib
import (
"fmt"
"github.com/golang/glog"
"github.com/vmware/govmomi/object"
"github.com/vmware/govmomi/property"
@ -32,6 +33,16 @@ type Datastore struct {
Datacenter *Datacenter
}
// DatastoreInfo is a structure to store the Datastore and its Info.
type DatastoreInfo struct {
*Datastore
Info *types.DatastoreInfo
}
func (di DatastoreInfo) String() string {
return fmt.Sprintf("Datastore: %+v, datastore URL: %s", di.Datastore, di.Info.Url)
}
// CreateDirectory creates the directory at location specified by directoryPath.
// If the intermediate level folders do not exist, and the parameter createParents is true, all the non-existent folders are created.
// directoryPath must be in the format "[vsanDatastore] kubevols"

View File

@ -70,13 +70,13 @@ func (diskManager virtualDiskManager) Create(ctx context.Context, datastore *vcl
}
// Delete implements Disk's Delete interface
func (diskManager virtualDiskManager) Delete(ctx context.Context, datastore *vclib.Datastore) error {
func (diskManager virtualDiskManager) Delete(ctx context.Context, datacenter *vclib.Datacenter) error {
// Create a virtual disk manager
virtualDiskManager := object.NewVirtualDiskManager(datastore.Client())
diskPath := vclib.RemoveClusterFromVDiskPath(diskManager.diskPath)
virtualDiskManager := object.NewVirtualDiskManager(datacenter.Client())
diskPath := vclib.RemoveStorageClusterORFolderNameFromVDiskPath(diskManager.diskPath)
requestTime := time.Now()
// Delete virtual disk
task, err := virtualDiskManager.DeleteVirtualDisk(ctx, diskPath, datastore.Datacenter.Datacenter)
task, err := virtualDiskManager.DeleteVirtualDisk(ctx, diskPath, datacenter.Datacenter)
if err != nil {
glog.Errorf("Failed to delete virtual disk. err: %v", err)
vclib.RecordvSphereMetric(vclib.APIDeleteVolume, requestTime, err)

View File

@ -40,7 +40,7 @@ const (
// VirtualDiskProvider defines interfaces for creating disk
type VirtualDiskProvider interface {
Create(ctx context.Context, datastore *vclib.Datastore) (string, error)
Delete(ctx context.Context, datastore *vclib.Datastore) error
Delete(ctx context.Context, datacenter *vclib.Datacenter) error
}
// getDiskManager returns vmDiskManager or vdmDiskManager based on given volumeoptions
@ -75,6 +75,6 @@ func (virtualDisk *VirtualDisk) Create(ctx context.Context, datastore *vclib.Dat
}
// Delete gets appropriate disk manager and calls respective delete method
func (virtualDisk *VirtualDisk) Delete(ctx context.Context, datastore *vclib.Datastore) error {
return getDiskManager(virtualDisk, VirtualDiskDeleteOperation).Delete(ctx, datastore)
func (virtualDisk *VirtualDisk) Delete(ctx context.Context, datacenter *vclib.Datacenter) error {
return getDiskManager(virtualDisk, VirtualDiskDeleteOperation).Delete(ctx, datacenter)
}

View File

@ -157,7 +157,7 @@ func (vmdisk vmDiskManager) Create(ctx context.Context, datastore *vclib.Datasto
return vmdisk.diskPath, nil
}
func (vmdisk vmDiskManager) Delete(ctx context.Context, datastore *vclib.Datastore) error {
func (vmdisk vmDiskManager) Delete(ctx context.Context, datacenter *vclib.Datacenter) error {
return fmt.Errorf("vmDiskManager.Delete is not supported")
}

View File

@ -85,7 +85,7 @@ func (pbmClient *PbmClient) IsDatastoreCompatible(ctx context.Context, storagePo
// GetCompatibleDatastores filters and returns compatible list of datastores for given storage policy id
// For Non Compatible Datastores, fault message with the Datastore Name is also returned
func (pbmClient *PbmClient) GetCompatibleDatastores(ctx context.Context, storagePolicyID string, datastores []*Datastore) ([]*Datastore, string, error) {
func (pbmClient *PbmClient) GetCompatibleDatastores(ctx context.Context, dc *Datacenter, storagePolicyID string, datastores []*DatastoreInfo) ([]*DatastoreInfo, string, error) {
var (
dsMorNameMap = getDsMorNameMap(ctx, datastores)
localizedMessagesForNotCompatibleDatastores = ""
@ -96,7 +96,7 @@ func (pbmClient *PbmClient) GetCompatibleDatastores(ctx context.Context, storage
return nil, "", err
}
compatibleHubs := compatibilityResult.CompatibleDatastores()
var compatibleDatastoreList []*Datastore
var compatibleDatastoreList []*DatastoreInfo
for _, hub := range compatibleHubs {
compatibleDatastoreList = append(compatibleDatastoreList, getDatastoreFromPlacementHub(datastores, hub))
}
@ -121,7 +121,7 @@ func (pbmClient *PbmClient) GetCompatibleDatastores(ctx context.Context, storage
}
// GetPlacementCompatibilityResult gets placement compatibility result based on storage policy requirements.
func (pbmClient *PbmClient) GetPlacementCompatibilityResult(ctx context.Context, storagePolicyID string, datastore []*Datastore) (pbm.PlacementCompatibilityResult, error) {
func (pbmClient *PbmClient) GetPlacementCompatibilityResult(ctx context.Context, storagePolicyID string, datastore []*DatastoreInfo) (pbm.PlacementCompatibilityResult, error) {
var hubs []pbmtypes.PbmPlacementHub
for _, ds := range datastore {
hubs = append(hubs, pbmtypes.PbmPlacementHub{
@ -145,7 +145,7 @@ func (pbmClient *PbmClient) GetPlacementCompatibilityResult(ctx context.Context,
}
// getDatastoreFromPlacementHub returns the matching datastore associated with the given pbmPlacementHub
func getDatastoreFromPlacementHub(datastore []*Datastore, pbmPlacementHub pbmtypes.PbmPlacementHub) *Datastore {
func getDatastoreFromPlacementHub(datastore []*DatastoreInfo, pbmPlacementHub pbmtypes.PbmPlacementHub) *DatastoreInfo {
for _, ds := range datastore {
if ds.Reference().Type == pbmPlacementHub.HubType && ds.Reference().Value == pbmPlacementHub.HubId {
return ds
@ -155,7 +155,7 @@ func getDatastoreFromPlacementHub(datastore []*Datastore, pbmPlacementHub pbmtyp
}
// getDsMorNameMap returns map of ds Mor and Datastore Object Name
func getDsMorNameMap(ctx context.Context, datastores []*Datastore) map[string]string {
func getDsMorNameMap(ctx context.Context, datastores []*DatastoreInfo) map[string]string {
dsMorNameMap := make(map[string]string)
for _, ds := range datastores {
dsObjectName, err := ds.ObjectName(ctx)

View File

@ -25,6 +25,8 @@ import (
"github.com/golang/glog"
"github.com/vmware/govmomi/find"
"github.com/vmware/govmomi/object"
"github.com/vmware/govmomi/vim25/mo"
"github.com/vmware/govmomi/vim25/soap"
"github.com/vmware/govmomi/vim25/types"
)
@ -121,10 +123,10 @@ func getSCSIControllers(vmDevices object.VirtualDeviceList) []*types.VirtualCont
return scsiControllers
}
// RemoveClusterFromVDiskPath removes the cluster or folder path from the vDiskPath
// RemoveStorageClusterORFolderNameFromVDiskPath removes the cluster or folder path from the vDiskPath
// for vDiskPath [DatastoreCluster/sharedVmfs-0] kubevols/e2e-vmdk-1234.vmdk, return value is [sharedVmfs-0] kubevols/e2e-vmdk-1234.vmdk
// for vDiskPath [sharedVmfs-0] kubevols/e2e-vmdk-1234.vmdk, return value remains same [sharedVmfs-0] kubevols/e2e-vmdk-1234.vmdk
func RemoveClusterFromVDiskPath(vDiskPath string) string {
func RemoveStorageClusterORFolderNameFromVDiskPath(vDiskPath string) string {
datastore := regexp.MustCompile("\\[(.*?)\\]").FindStringSubmatch(vDiskPath)[1]
if filepath.Base(datastore) != datastore {
vDiskPath = strings.Replace(vDiskPath, datastore, filepath.Base(datastore), 1)
@ -172,3 +174,40 @@ func IsValidUUID(uuid string) bool {
r := regexp.MustCompile("^[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}$")
return r.MatchString(uuid)
}
// IsManagedObjectNotFoundError returns true if error is of type ManagedObjectNotFound
func IsManagedObjectNotFoundError(err error) bool {
isManagedObjectNotFoundError := false
if soap.IsSoapFault(err) {
_, isManagedObjectNotFoundError = soap.ToSoapFault(err).VimFault().(types.ManagedObjectNotFound)
}
return isManagedObjectNotFoundError
}
// VerifyVolumePathsForVM verifies if the volume paths (volPaths) are attached to VM.
func VerifyVolumePathsForVM(vmMo mo.VirtualMachine, volPaths []string, nodeName string, nodeVolumeMap map[string]map[string]bool) {
// Verify if the volume paths are present on the VM backing virtual disk devices
vmDevices := object.VirtualDeviceList(vmMo.Config.Hardware.Device)
VerifyVolumePathsForVMDevices(vmDevices, volPaths, nodeName, nodeVolumeMap)
}
// VerifyVolumePathsForVMDevices verifies if the volume paths (volPaths) are attached to VM.
func VerifyVolumePathsForVMDevices(vmDevices object.VirtualDeviceList, volPaths []string, nodeName string, nodeVolumeMap map[string]map[string]bool) {
volPathsMap := make(map[string]bool)
for _, volPath := range volPaths {
volPathsMap[volPath] = true
}
// Verify if the volume paths are present on the VM backing virtual disk devices
for _, device := range vmDevices {
if vmDevices.TypeName(device) == "VirtualDisk" {
virtualDevice := device.GetVirtualDevice()
if backing, ok := virtualDevice.Backing.(*types.VirtualDiskFlatVer2BackingInfo); ok {
if volPathsMap[backing.FileName] {
setNodeVolumeMap(nodeVolumeMap, backing.FileName, nodeName, true)
}
}
}
}
}

View File

@ -23,6 +23,7 @@ import (
"github.com/golang/glog"
"github.com/vmware/govmomi/object"
"github.com/vmware/govmomi/property"
"github.com/vmware/govmomi/vim25/mo"
"github.com/vmware/govmomi/vim25/types"
)
@ -63,7 +64,7 @@ func (vm *VirtualMachine) AttachDisk(ctx context.Context, vmDiskPath string, vol
return "", fmt.Errorf("Not a valid SCSI Controller Type. Valid options are %q", SCSIControllerTypeValidOptions())
}
vmDiskPathCopy := vmDiskPath
vmDiskPath = RemoveClusterFromVDiskPath(vmDiskPath)
vmDiskPath = RemoveStorageClusterORFolderNameFromVDiskPath(vmDiskPath)
attached, err := vm.IsDiskAttached(ctx, vmDiskPath)
if err != nil {
glog.Errorf("Error occurred while checking if disk is attached on VM: %q. vmDiskPath: %q, err: %+v", vm.InventoryPath, vmDiskPath, err)
@ -75,6 +76,20 @@ func (vm *VirtualMachine) AttachDisk(ctx context.Context, vmDiskPath string, vol
return diskUUID, nil
}
if volumeOptions.StoragePolicyName != "" {
pbmClient, err := NewPbmClient(ctx, vm.Client())
if err != nil {
glog.Errorf("Error occurred while creating new pbmClient. err: %+v", err)
return "", err
}
volumeOptions.StoragePolicyID, err = pbmClient.ProfileIDByName(ctx, volumeOptions.StoragePolicyName)
if err != nil {
glog.Errorf("Failed to get Profile ID by name: %s. err: %+v", volumeOptions.StoragePolicyName, err)
return "", err
}
}
dsObj, err := vm.Datacenter.GetDatastoreByPath(ctx, vmDiskPathCopy)
if err != nil {
glog.Errorf("Failed to get datastore from vmDiskPath: %q. err: %+v", vmDiskPath, err)
@ -139,7 +154,7 @@ func (vm *VirtualMachine) AttachDisk(ctx context.Context, vmDiskPath string, vol
// DetachDisk detaches the disk specified by vmDiskPath
func (vm *VirtualMachine) DetachDisk(ctx context.Context, vmDiskPath string) error {
vmDiskPath = RemoveClusterFromVDiskPath(vmDiskPath)
vmDiskPath = RemoveStorageClusterORFolderNameFromVDiskPath(vmDiskPath)
device, err := vm.getVirtualDeviceByPath(ctx, vmDiskPath)
if err != nil {
glog.Errorf("Disk ID not found for VM: %q with diskPath: %q", vm.InventoryPath, vmDiskPath)
@ -186,7 +201,7 @@ func (vm *VirtualMachine) IsActive(ctx context.Context) (bool, error) {
}
// GetAllAccessibleDatastores gets the list of accessible Datastores for the given Virtual Machine
func (vm *VirtualMachine) GetAllAccessibleDatastores(ctx context.Context) ([]*Datastore, error) {
func (vm *VirtualMachine) GetAllAccessibleDatastores(ctx context.Context) ([]*DatastoreInfo, error) {
host, err := vm.HostSystem(ctx)
if err != nil {
glog.Errorf("Failed to get host system for VM: %q. err: %+v", vm.InventoryPath, err)
@ -199,9 +214,28 @@ func (vm *VirtualMachine) GetAllAccessibleDatastores(ctx context.Context) ([]*Da
glog.Errorf("Failed to retrieve datastores for host: %+v. err: %+v", host, err)
return nil, err
}
var dsObjList []*Datastore
var dsRefList []types.ManagedObjectReference
for _, dsRef := range hostSystemMo.Datastore {
dsObjList = append(dsObjList, &Datastore{object.NewDatastore(vm.Client(), dsRef), vm.Datacenter})
dsRefList = append(dsRefList, dsRef)
}
var dsMoList []mo.Datastore
pc := property.DefaultCollector(vm.Client())
properties := []string{DatastoreInfoProperty}
err = pc.Retrieve(ctx, dsRefList, properties, &dsMoList)
if err != nil {
glog.Errorf("Failed to get Datastore managed objects from datastore objects."+
" dsObjList: %+v, properties: %+v, err: %v", dsRefList, properties, err)
return nil, err
}
glog.V(9).Infof("Result dsMoList: %+v", dsMoList)
var dsObjList []*DatastoreInfo
for _, dsMo := range dsMoList {
dsObjList = append(dsObjList,
&DatastoreInfo{
&Datastore{object.NewDatastore(vm.Client(), dsMo.Reference()),
vm.Datacenter},
dsMo.Info.GetDatastoreInfo()})
}
return dsObjList, nil
}

File diff suppressed because it is too large Load Diff

View File

@ -39,7 +39,7 @@ func configFromEnv() (cfg VSphereConfig, ok bool) {
cfg.Global.Password = os.Getenv("VSPHERE_PASSWORD")
cfg.Global.Datacenter = os.Getenv("VSPHERE_DATACENTER")
cfg.Network.PublicNetwork = os.Getenv("VSPHERE_PUBLIC_NETWORK")
cfg.Global.Datastore = os.Getenv("VSPHERE_DATASTORE")
cfg.Global.DefaultDatastore = os.Getenv("VSPHERE_DATASTORE")
cfg.Disk.SCSIControllerType = os.Getenv("VSPHERE_SCSICONTROLLER_TYPE")
cfg.Global.WorkingDir = os.Getenv("VSPHERE_WORKING_DIR")
cfg.Global.VMName = os.Getenv("VSPHERE_VM_NAME")
@ -103,7 +103,7 @@ func TestNewVSphere(t *testing.T) {
t.Skipf("No config found in environment")
}
_, err := newVSphere(cfg)
_, err := newControllerNode(cfg)
if err != nil {
t.Fatalf("Failed to construct/authenticate vSphere: %s", err)
}
@ -116,7 +116,7 @@ func TestVSphereLogin(t *testing.T) {
}
// Create vSphere configuration object
vs, err := newVSphere(cfg)
vs, err := newControllerNode(cfg)
if err != nil {
t.Fatalf("Failed to construct/authenticate vSphere: %s", err)
}
@ -126,11 +126,16 @@ func TestVSphereLogin(t *testing.T) {
defer cancel()
// Create vSphere client
err = vs.conn.Connect(ctx)
var vcInstance *VSphereInstance
if vcInstance, ok = vs.vsphereInstanceMap[cfg.Global.VCenterIP]; !ok {
t.Fatalf("Couldn't get vSphere instance: %s", cfg.Global.VCenterIP)
}
err = vcInstance.conn.Connect(ctx)
if err != nil {
t.Errorf("Failed to connect to vSphere: %s", err)
}
defer vs.conn.GoVmomiClient.Logout(ctx)
defer vcInstance.conn.GoVmomiClient.Logout(ctx)
}
func TestZones(t *testing.T) {
@ -154,7 +159,7 @@ func TestInstances(t *testing.T) {
t.Skipf("No config found in environment")
}
vs, err := newVSphere(cfg)
vs, err := newControllerNode(cfg)
if err != nil {
t.Fatalf("Failed to construct/authenticate vSphere: %s", err)
}
@ -213,7 +218,7 @@ func TestVolumes(t *testing.T) {
t.Skipf("No config found in environment")
}
vs, err := newVSphere(cfg)
vs, err := newControllerNode(cfg)
if err != nil {
t.Fatalf("Failed to construct/authenticate vSphere: %s", err)
}

View File

@ -28,14 +28,16 @@ import (
"github.com/golang/glog"
"github.com/vmware/govmomi"
"github.com/vmware/govmomi/object"
"github.com/vmware/govmomi/vim25"
"github.com/vmware/govmomi/vim25/mo"
"fmt"
"github.com/vmware/govmomi/vim25/mo"
"k8s.io/api/core/v1"
k8stypes "k8s.io/apimachinery/pkg/types"
"k8s.io/kubernetes/pkg/cloudprovider/providers/vsphere/vclib"
"k8s.io/kubernetes/pkg/cloudprovider/providers/vsphere/vclib/diskmanagers"
"path/filepath"
)
const (
@ -55,10 +57,28 @@ func GetVSphere() (*VSphere, error) {
return nil, err
}
vSphereConn.GoVmomiClient = client
vsphereIns := &VSphereInstance{
conn: vSphereConn,
cfg: &VirtualCenterConfig{
User: cfg.Global.User,
Password: cfg.Global.Password,
VCenterPort: cfg.Global.VCenterPort,
Datacenters: cfg.Global.Datacenters,
RoundTripperCount: cfg.Global.RoundTripperCount,
},
}
vsphereInsMap := make(map[string]*VSphereInstance)
vsphereInsMap[cfg.Global.VCenterIP] = vsphereIns
// TODO: Initialize nodeManager and set it in VSphere.
vs := &VSphere{
conn: vSphereConn,
cfg: cfg,
localInstanceID: "",
vsphereInstanceMap: vsphereInsMap,
hostName: "",
cfg: cfg,
nodeManager: &NodeManager{
vsphereInstanceMap: vsphereInsMap,
nodeInfoMap: make(map[string]*NodeInfo),
registeredNodes: make(map[string]*v1.Node),
},
}
runtime.SetFinalizer(vs, logout)
return vs, nil
@ -70,14 +90,18 @@ func getVSphereConfig() *VSphereConfig {
cfg.Global.VCenterPort = os.Getenv("VSPHERE_VCENTER_PORT")
cfg.Global.User = os.Getenv("VSPHERE_USER")
cfg.Global.Password = os.Getenv("VSPHERE_PASSWORD")
cfg.Global.Datacenter = os.Getenv("VSPHERE_DATACENTER")
cfg.Global.Datastore = os.Getenv("VSPHERE_DATASTORE")
cfg.Global.Datacenters = os.Getenv("VSPHERE_DATACENTER")
cfg.Global.DefaultDatastore = os.Getenv("VSPHERE_DATASTORE")
cfg.Global.WorkingDir = os.Getenv("VSPHERE_WORKING_DIR")
cfg.Global.VMName = os.Getenv("VSPHERE_VM_NAME")
cfg.Global.InsecureFlag = false
if strings.ToLower(os.Getenv("VSPHERE_INSECURE")) == "true" {
cfg.Global.InsecureFlag = true
}
cfg.Workspace.VCenterIP = cfg.Global.VCenterIP
cfg.Workspace.Datacenter = cfg.Global.Datacenters
cfg.Workspace.DefaultDatastore = cfg.Global.DefaultDatastore
cfg.Workspace.Folder = cfg.Global.WorkingDir
return &cfg
}
@ -127,49 +151,83 @@ func getvmUUID() (string, error) {
return uuid, nil
}
// Get all datastores accessible for the virtual machine object.
func getSharedDatastoresInK8SCluster(ctx context.Context, folder *vclib.Folder) ([]*vclib.Datastore, error) {
vmList, err := folder.GetVirtualMachines(ctx)
// Returns the accessible datastores for the given node VM.
func getAccessibleDatastores(ctx context.Context, nodeVmDetail *NodeDetails, nodeManager *NodeManager) ([]*vclib.DatastoreInfo, error) {
accessibleDatastores, err := nodeVmDetail.vm.GetAllAccessibleDatastores(ctx)
if err != nil {
glog.Errorf("Failed to get virtual machines in the kubernetes cluster: %s, err: %+v", folder.InventoryPath, err)
return nil, err
// Check whether the node VM was not found, which indicates that the node info in the node manager is stale.
// If so, rediscover the node and retry.
if vclib.IsManagedObjectNotFoundError(err) {
glog.V(4).Infof("error %q ManagedObjectNotFound for node %q. Rediscovering...", err, nodeVmDetail.NodeName)
err = nodeManager.RediscoverNode(convertToK8sType(nodeVmDetail.NodeName))
if err == nil {
glog.V(4).Infof("Discovered node %s successfully", nodeVmDetail.NodeName)
nodeInfo, err := nodeManager.GetNodeInfo(convertToK8sType(nodeVmDetail.NodeName))
if err != nil {
glog.V(4).Infof("error %q getting node info for node %+v", err, nodeVmDetail)
return nil, err
}
accessibleDatastores, err = nodeInfo.vm.GetAllAccessibleDatastores(ctx)
if err != nil {
glog.V(4).Infof("error %q getting accessible datastores for node %+v", err, nodeVmDetail)
return nil, err
}
} else {
glog.V(4).Infof("error %q rediscovering node %+v", err, nodeVmDetail)
return nil, err
}
} else {
glog.V(4).Infof("error %q getting accessible datastores for node %+v", err, nodeVmDetail)
return nil, err
}
}
if vmList == nil || len(vmList) == 0 {
glog.Errorf("No virtual machines found in the kubernetes cluster: %s", folder.InventoryPath)
return nil, fmt.Errorf("No virtual machines found in the kubernetes cluster: %s", folder.InventoryPath)
return accessibleDatastores, nil
}
// Returns the datastores that are accessible from all the node VMs in the Kubernetes cluster.
func getSharedDatastoresInK8SCluster(ctx context.Context, dc *vclib.Datacenter, nodeManager *NodeManager) ([]*vclib.DatastoreInfo, error) {
nodeVmDetails := nodeManager.GetNodeDetails()
if nodeVmDetails == nil || len(nodeVmDetails) == 0 {
msg := fmt.Sprintf("Kubernetes node details are empty. nodeVmDetails: %+v", nodeVmDetails)
glog.Error(msg)
return nil, fmt.Errorf(msg)
}
index := 0
var sharedDatastores []*vclib.Datastore
for _, vm := range vmList {
vmName, err := vm.ObjectName(ctx)
var sharedDatastores []*vclib.DatastoreInfo
for index, nodeVmDetail := range nodeVmDetails {
glog.V(9).Infof("Getting accessible datastores for node %s", nodeVmDetail.NodeName)
accessibleDatastores, err := getAccessibleDatastores(ctx, &nodeVmDetail, nodeManager)
if err != nil {
return nil, err
}
if !strings.HasPrefix(vmName, DummyVMPrefixName) {
accessibleDatastores, err := vm.GetAllAccessibleDatastores(ctx)
if err != nil {
return nil, err
if index == 0 {
sharedDatastores = accessibleDatastores
} else {
sharedDatastores = intersect(sharedDatastores, accessibleDatastores)
if len(sharedDatastores) == 0 {
return nil, fmt.Errorf("No shared datastores found in the Kubernetes cluster for nodeVmDetails: %+v", nodeVmDetails)
}
if index == 0 {
sharedDatastores = accessibleDatastores
} else {
sharedDatastores = intersect(sharedDatastores, accessibleDatastores)
if len(sharedDatastores) == 0 {
return nil, fmt.Errorf("No shared datastores found in the Kubernetes cluster: %s", folder.InventoryPath)
}
}
index++
}
}
glog.V(9).Infof("sharedDatastores : %+v", sharedDatastores)
sharedDatastores, err := getDatastoresForEndpointVC(ctx, dc, sharedDatastores)
if err != nil {
glog.Errorf("Failed to get shared datastores from endpoint VC. err: %+v", err)
return nil, err
}
glog.V(9).Infof("sharedDatastores at endpoint VC: %+v", sharedDatastores)
return sharedDatastores, nil
}
func intersect(list1 []*vclib.Datastore, list2 []*vclib.Datastore) []*vclib.Datastore {
var sharedDs []*vclib.Datastore
func intersect(list1 []*vclib.DatastoreInfo, list2 []*vclib.DatastoreInfo) []*vclib.DatastoreInfo {
glog.V(9).Infof("list1: %+v", list1)
glog.V(9).Infof("list2: %+v", list2)
var sharedDs []*vclib.DatastoreInfo
for _, val1 := range list1 {
// Check if val1 is found in list2
for _, val2 := range list2 {
if val1.Reference().Value == val2.Reference().Value {
// Intersection is performed based on the datastoreUrl as this uniquely identifies the datastore.
if val1.Info.Url == val2.Info.Url {
sharedDs = append(sharedDs, val1)
break
}
@ -178,46 +236,42 @@ func intersect(list1 []*vclib.Datastore, list2 []*vclib.Datastore) []*vclib.Data
return sharedDs
}
// Get the datastores accessible for the virtual machine object.
func getAllAccessibleDatastores(ctx context.Context, client *vim25.Client, vmMo mo.VirtualMachine) ([]string, error) {
host := vmMo.Summary.Runtime.Host
if host == nil {
return nil, errors.New("VM doesn't have a HostSystem")
}
var hostSystemMo mo.HostSystem
s := object.NewSearchIndex(client)
err := s.Properties(ctx, host.Reference(), []string{DatastoreProperty}, &hostSystemMo)
if err != nil {
return nil, err
}
var dsRefValues []string
for _, dsRef := range hostSystemMo.Datastore {
dsRefValues = append(dsRefValues, dsRef.Value)
}
return dsRefValues, nil
}
// getMostFreeDatastore gets the best fit compatible datastore by free space.
func getMostFreeDatastoreName(ctx context.Context, client *vim25.Client, dsObjList []*vclib.Datastore) (string, error) {
dsMoList, err := dsObjList[0].Datacenter.GetDatastoreMoList(ctx, dsObjList, []string{DatastoreInfoProperty})
if err != nil {
return "", err
}
func getMostFreeDatastoreName(ctx context.Context, client *vim25.Client, dsInfoList []*vclib.DatastoreInfo) (string, error) {
var curMax int64
curMax = -1
var index int
for i, dsMo := range dsMoList {
dsFreeSpace := dsMo.Info.GetDatastoreInfo().FreeSpace
for i, dsInfo := range dsInfoList {
dsFreeSpace := dsInfo.Info.GetDatastoreInfo().FreeSpace
if dsFreeSpace > curMax {
curMax = dsFreeSpace
index = i
}
}
return dsMoList[index].Info.GetDatastoreInfo().Name, nil
return dsInfoList[index].Info.GetDatastoreInfo().Name, nil
}
func getPbmCompatibleDatastore(ctx context.Context, client *vim25.Client, storagePolicyName string, folder *vclib.Folder) (string, error) {
pbmClient, err := vclib.NewPbmClient(ctx, client)
// Returns the datastores in the given datacenter by performing lookup based on datastore URL.
func getDatastoresForEndpointVC(ctx context.Context, dc *vclib.Datacenter, sharedDsInfos []*vclib.DatastoreInfo) ([]*vclib.DatastoreInfo, error) {
var datastores []*vclib.DatastoreInfo
allDsInfoMap, err := dc.GetAllDatastores(ctx)
if err != nil {
return nil, err
}
for _, sharedDsInfo := range sharedDsInfos {
dsInfo, ok := allDsInfoMap[sharedDsInfo.Info.Url]
if ok {
datastores = append(datastores, dsInfo)
} else {
glog.V(4).Infof("Warning: Shared datastore with URL %s does not exist in endpoint VC", sharedDsInfo.Info.Url)
}
}
glog.V(9).Infof("Datastore from endpoint VC: %+v", datastores)
return datastores, nil
}
func getPbmCompatibleDatastore(ctx context.Context, dc *vclib.Datacenter, storagePolicyName string, nodeManager *NodeManager) (string, error) {
pbmClient, err := vclib.NewPbmClient(ctx, dc.Client())
if err != nil {
return "", err
}
@ -226,35 +280,40 @@ func getPbmCompatibleDatastore(ctx context.Context, client *vim25.Client, storag
glog.Errorf("Failed to get Profile ID by name: %s. err: %+v", storagePolicyName, err)
return "", err
}
sharedDsList, err := getSharedDatastoresInK8SCluster(ctx, folder)
sharedDs, err := getSharedDatastoresInK8SCluster(ctx, dc, nodeManager)
if err != nil {
glog.Errorf("Failed to get shared datastores from kubernetes cluster: %s. err: %+v", folder.InventoryPath, err)
glog.Errorf("Failed to get shared datastores. err: %+v", err)
return "", err
}
compatibleDatastores, _, err := pbmClient.GetCompatibleDatastores(ctx, storagePolicyID, sharedDsList)
if len(sharedDs) == 0 {
msg := "No shared datastores found in the endpoint virtual center"
glog.Errorf(msg)
return "", errors.New(msg)
}
compatibleDatastores, _, err := pbmClient.GetCompatibleDatastores(ctx, dc, storagePolicyID, sharedDs)
if err != nil {
glog.Errorf("Failed to get compatible datastores from datastores : %+v with storagePolicy: %s. err: %+v", sharedDsList, storagePolicyID, err)
glog.Errorf("Failed to get compatible datastores from datastores : %+v with storagePolicy: %s. err: %+v",
sharedDs, storagePolicyID, err)
return "", err
}
datastore, err := getMostFreeDatastoreName(ctx, client, compatibleDatastores)
glog.V(9).Infof("compatibleDatastores : %+v", compatibleDatastores)
datastore, err := getMostFreeDatastoreName(ctx, dc.Client(), compatibleDatastores)
if err != nil {
glog.Errorf("Failed to get most free datastore from compatible datastores: %+v. err: %+v", compatibleDatastores, err)
return "", err
}
glog.V(4).Infof("Most free datastore : %+s", datastore)
return datastore, err
}
func (vs *VSphere) setVMOptions(ctx context.Context, dc *vclib.Datacenter) (*vclib.VMOptions, error) {
func (vs *VSphere) setVMOptions(ctx context.Context, dc *vclib.Datacenter, resourcePoolPath string) (*vclib.VMOptions, error) {
var vmOptions vclib.VMOptions
vm, err := dc.GetVMByPath(ctx, vs.cfg.Global.WorkingDir+"/"+vs.localInstanceID)
resourcePool, err := dc.GetResourcePool(ctx, resourcePoolPath)
if err != nil {
return nil, err
}
resourcePool, err := vm.GetResourcePool(ctx)
if err != nil {
return nil, err
}
folder, err := dc.GetFolderByPath(ctx, vs.cfg.Global.WorkingDir)
glog.V(9).Infof("Resource pool path %s, resourcePool %+v", resourcePoolPath, resourcePool)
folder, err := dc.GetFolderByPath(ctx, vs.cfg.Workspace.Folder)
if err != nil {
return nil, err
}
@ -270,28 +329,27 @@ func (vs *VSphere) cleanUpDummyVMs(dummyVMPrefix string) {
defer cancel()
for {
time.Sleep(CleanUpDummyVMRoutineInterval * time.Minute)
// Ensure client is logged in and session is valid
err := vs.conn.Connect(ctx)
vsi, err := vs.getVSphereInstanceForServer(vs.cfg.Workspace.VCenterIP, ctx)
if err != nil {
glog.V(4).Infof("Failed to connect to VC with err: %+v. Retrying again...", err)
glog.V(4).Infof("Failed to get VSphere instance with err: %+v. Retrying again...", err)
continue
}
dc, err := vclib.GetDatacenter(ctx, vs.conn, vs.cfg.Global.Datacenter)
dc, err := vclib.GetDatacenter(ctx, vsi.conn, vs.cfg.Workspace.Datacenter)
if err != nil {
glog.V(4).Infof("Failed to get the datacenter: %s from VC. err: %+v", vs.cfg.Global.Datacenter, err)
glog.V(4).Infof("Failed to get the datacenter: %s from VC. err: %+v", vs.cfg.Workspace.Datacenter, err)
continue
}
// Get the folder reference for global working directory where the dummy VM needs to be created.
vmFolder, err := dc.GetFolderByPath(ctx, vs.cfg.Global.WorkingDir)
vmFolder, err := dc.GetFolderByPath(ctx, vs.cfg.Workspace.Folder)
if err != nil {
glog.V(4).Infof("Unable to get the kubernetes folder: %q reference. err: %+v", vs.cfg.Global.WorkingDir, err)
glog.V(4).Infof("Unable to get the kubernetes folder: %q reference. err: %+v", vs.cfg.Workspace.Folder, err)
continue
}
// A write lock is acquired to make sure the cleanUp routine doesn't delete any VM's created by ongoing PVC requests.
defer cleanUpDummyVMLock.Lock()
err = diskmanagers.CleanUpDummyVMs(ctx, vmFolder, dc)
if err != nil {
glog.V(4).Infof("Unable to clean up dummy VM's in the kubernetes cluster: %q. err: %+v", vs.cfg.Global.WorkingDir, err)
glog.V(4).Infof("Unable to clean up dummy VM's in the kubernetes cluster: %q. err: %+v", vs.cfg.Workspace.Folder, err)
}
}
}
@ -353,3 +411,118 @@ func setdatastoreFolderIDMap(
}
folderNameIDMap[folderName] = folderID
}
func convertVolPathToDevicePath(ctx context.Context, dc *vclib.Datacenter, volPath string) (string, error) {
volPath = vclib.RemoveStorageClusterORFolderNameFromVDiskPath(volPath)
// Get the canonical volume path for volPath.
canonicalVolumePath, err := getcanonicalVolumePath(ctx, dc, volPath)
if err != nil {
glog.Errorf("Failed to get canonical vsphere volume path for volume: %s. err: %+v", volPath, err)
return "", err
}
// Check if the volume path contains .vmdk extension. If not, add the extension and update the nodeVolumes Map
if len(canonicalVolumePath) > 0 && filepath.Ext(canonicalVolumePath) != ".vmdk" {
canonicalVolumePath += ".vmdk"
}
return canonicalVolumePath, nil
}
// convertVolPathsToDevicePaths removes the cluster or folder path from each entry in volPaths and converts it to the canonical path
func (vs *VSphere) convertVolPathsToDevicePaths(ctx context.Context, nodeVolumes map[k8stypes.NodeName][]string) (map[k8stypes.NodeName][]string, error) {
vmVolumes := make(map[k8stypes.NodeName][]string)
for nodeName, volPaths := range nodeVolumes {
nodeInfo, err := vs.nodeManager.GetNodeInfo(nodeName)
if err != nil {
return nil, err
}
_, err = vs.getVSphereInstanceForServer(nodeInfo.vcServer, ctx)
if err != nil {
return nil, err
}
for i, volPath := range volPaths {
deviceVolPath, err := convertVolPathToDevicePath(ctx, nodeInfo.dataCenter, volPath)
if err != nil {
glog.Errorf("Failed to convert vsphere volume path %s to device path for volume %s. err: %+v", volPath, deviceVolPath, err)
return nil, err
}
volPaths[i] = deviceVolPath
}
vmVolumes[nodeName] = volPaths
}
return vmVolumes, nil
}
// checkDiskAttached verifies that the given volumes are attached to VMs in the same vCenter and Datacenter.
// It returns any nodes for which the VM could not be found in that vCenter and Datacenter.
func (vs *VSphere) checkDiskAttached(ctx context.Context, nodes []k8stypes.NodeName, nodeVolumes map[k8stypes.NodeName][]string, attached map[string]map[string]bool, retry bool) ([]k8stypes.NodeName, error) {
var nodesToRetry []k8stypes.NodeName
var vmList []*vclib.VirtualMachine
var nodeInfo NodeInfo
var err error
for _, nodeName := range nodes {
nodeInfo, err = vs.nodeManager.GetNodeInfo(nodeName)
if err != nil {
return nodesToRetry, err
}
vmList = append(vmList, nodeInfo.vm)
}
// Making sure session is valid
_, err = vs.getVSphereInstanceForServer(nodeInfo.vcServer, ctx)
if err != nil {
return nodesToRetry, err
}
// If any of the nodes is not present, the property collector query will fail for the entire operation
vmMoList, err := nodeInfo.dataCenter.GetVMMoList(ctx, vmList, []string{"config.hardware.device", "name", "config.uuid"})
if err != nil {
if vclib.IsManagedObjectNotFoundError(err) && !retry {
glog.V(4).Infof("checkDiskAttached: ManagedObjectNotFound for property collector query for nodes: %+v vms: %+v", nodes, vmList)
// Property Collector Query failed
// VerifyVolumePaths per VM
for _, nodeName := range nodes {
nodeInfo, err := vs.nodeManager.GetNodeInfo(nodeName)
if err != nil {
return nodesToRetry, err
}
devices, err := nodeInfo.vm.VirtualMachine.Device(ctx)
if err != nil {
if vclib.IsManagedObjectNotFoundError(err) {
glog.V(4).Infof("checkDiskAttached: ManagedObjectNotFound for Kubernetes node: %s with vSphere Virtual Machine reference: %v", nodeName, nodeInfo.vm)
nodesToRetry = append(nodesToRetry, nodeName)
continue
}
return nodesToRetry, err
}
glog.V(4).Infof("Verifying Volume Paths by devices for node %s and VM %s", nodeName, nodeInfo.vm)
vclib.VerifyVolumePathsForVMDevices(devices, nodeVolumes[nodeName], convertToString(nodeName), attached)
}
}
return nodesToRetry, err
}
vmMoMap := make(map[string]mo.VirtualMachine)
for _, vmMo := range vmMoList {
if vmMo.Config == nil {
glog.Errorf("Config is not available for VM: %q", vmMo.Name)
continue
}
glog.V(9).Infof("vmMoMap vmname: %q vmuuid: %s", vmMo.Name, strings.ToLower(vmMo.Config.Uuid))
vmMoMap[strings.ToLower(vmMo.Config.Uuid)] = vmMo
}
glog.V(9).Infof("vmMoMap: %+v", vmMoMap)
for _, nodeName := range nodes {
node, err := vs.nodeManager.GetNode(nodeName)
if err != nil {
return nodesToRetry, err
}
glog.V(9).Infof("Verifying volume for nodeName: %q with nodeuuid: %s", nodeName, node.Status.NodeInfo.SystemUUID)
vclib.VerifyVolumePathsForVM(vmMoMap[strings.ToLower(node.Status.NodeInfo.SystemUUID)], nodeVolumes[nodeName], convertToString(nodeName), attached)
}
return nodesToRetry, nil
}
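The new intersect helper above keys on Info.Url because a datastore URL uniquely identifies the datastore, whereas managed object references differ between vCenters. A minimal standalone sketch of that idea follows, with a simplified dsInfo struct standing in for vclib.DatastoreInfo (the real helper uses nested loops; a map is used here only for brevity):
package main
import "fmt"
// dsInfo is a simplified stand-in for vclib.DatastoreInfo; only the URL matters
// for the intersection, since the URL uniquely identifies a datastore.
type dsInfo struct {
	Name string
	URL  string
}
// intersectByURL keeps the entries of list1 whose URL also appears in list2,
// mirroring what the intersect() helper in this change computes.
func intersectByURL(list1, list2 []dsInfo) []dsInfo {
	seen := make(map[string]bool, len(list2))
	for _, d := range list2 {
		seen[d.URL] = true
	}
	var shared []dsInfo
	for _, d := range list1 {
		if seen[d.URL] {
			shared = append(shared, d)
		}
	}
	return shared
}
func main() {
	a := []dsInfo{{"ds1", "ds:///vmfs/volumes/111/"}, {"ds2", "ds:///vmfs/volumes/222/"}}
	b := []dsInfo{{"ds2", "ds:///vmfs/volumes/222/"}, {"ds3", "ds:///vmfs/volumes/333/"}}
	fmt.Println(intersectByURL(a, b)) // [{ds2 ds:///vmfs/volumes/222/}]
}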

View File

@ -34,6 +34,7 @@ filegroup(
name = "all-srcs",
srcs = [
":package-srcs",
"//pkg/kubelet/apis/kubeletconfig/fuzzer:all-srcs",
"//pkg/kubelet/apis/kubeletconfig/scheme:all-srcs",
"//pkg/kubelet/apis/kubeletconfig/v1alpha1:all-srcs",
"//pkg/kubelet/apis/kubeletconfig/validation:all-srcs",

View File

@ -0,0 +1,32 @@
load("@io_bazel_rules_go//go:def.bzl", "go_library")
go_library(
name = "go_default_library",
srcs = ["fuzzer.go"],
importpath = "k8s.io/kubernetes/pkg/kubelet/apis/kubeletconfig/fuzzer",
visibility = ["//visibility:public"],
deps = [
"//pkg/kubelet/apis/kubeletconfig:go_default_library",
"//pkg/kubelet/apis/kubeletconfig/v1alpha1:go_default_library",
"//pkg/kubelet/qos:go_default_library",
"//pkg/kubelet/types:go_default_library",
"//pkg/master/ports:go_default_library",
"//vendor/github.com/google/gofuzz:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/runtime/serializer:go_default_library",
],
)
filegroup(
name = "package-srcs",
srcs = glob(["**"]),
tags = ["automanaged"],
visibility = ["//visibility:private"],
)
filegroup(
name = "all-srcs",
srcs = [":package-srcs"],
tags = ["automanaged"],
visibility = ["//visibility:public"],
)

View File

@ -0,0 +1,100 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package fuzzer
import (
"time"
"github.com/google/gofuzz"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
runtimeserializer "k8s.io/apimachinery/pkg/runtime/serializer"
"k8s.io/kubernetes/pkg/kubelet/apis/kubeletconfig"
"k8s.io/kubernetes/pkg/kubelet/apis/kubeletconfig/v1alpha1"
"k8s.io/kubernetes/pkg/kubelet/qos"
kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
"k8s.io/kubernetes/pkg/master/ports"
)
// Funcs returns the fuzzer functions for the kubeletconfig apis.
func Funcs(codecs runtimeserializer.CodecFactory) []interface{} {
return []interface{}{
// provide non-empty values for fields with defaults, so the defaulter doesn't change values during round-trip
func(obj *kubeletconfig.KubeletConfiguration, c fuzz.Continue) {
c.FuzzNoCustom(obj)
obj.ConfigTrialDuration = &metav1.Duration{Duration: 10 * time.Minute}
obj.Authentication.Anonymous.Enabled = true
obj.Authentication.Webhook.Enabled = false
obj.Authentication.Webhook.CacheTTL = metav1.Duration{Duration: 2 * time.Minute}
obj.Authorization.Mode = kubeletconfig.KubeletAuthorizationModeAlwaysAllow
obj.Authorization.Webhook.CacheAuthorizedTTL = metav1.Duration{Duration: 5 * time.Minute}
obj.Authorization.Webhook.CacheUnauthorizedTTL = metav1.Duration{Duration: 30 * time.Second}
obj.Address = "0.0.0.0"
obj.CAdvisorPort = 4194
obj.VolumeStatsAggPeriod = metav1.Duration{Duration: time.Minute}
obj.RuntimeRequestTimeout = metav1.Duration{Duration: 2 * time.Minute}
obj.CPUCFSQuota = true
obj.EventBurst = 10
obj.EventRecordQPS = 5
obj.EnableControllerAttachDetach = true
obj.EnableDebuggingHandlers = true
obj.EnableServer = true
obj.FileCheckFrequency = metav1.Duration{Duration: 20 * time.Second}
obj.HealthzBindAddress = "127.0.0.1"
obj.HealthzPort = 10248
obj.HostNetworkSources = []string{kubetypes.AllSource}
obj.HostPIDSources = []string{kubetypes.AllSource}
obj.HostIPCSources = []string{kubetypes.AllSource}
obj.HTTPCheckFrequency = metav1.Duration{Duration: 20 * time.Second}
obj.ImageMinimumGCAge = metav1.Duration{Duration: 2 * time.Minute}
obj.ImageGCHighThresholdPercent = 85
obj.ImageGCLowThresholdPercent = 80
obj.MaxOpenFiles = 1000000
obj.MaxPods = 110
obj.NodeStatusUpdateFrequency = metav1.Duration{Duration: 10 * time.Second}
obj.CPUManagerPolicy = "none"
obj.CPUManagerReconcilePeriod = obj.NodeStatusUpdateFrequency
obj.OOMScoreAdj = int32(qos.KubeletOOMScoreAdj)
obj.Port = ports.KubeletPort
obj.ReadOnlyPort = ports.KubeletReadOnlyPort
obj.RegistryBurst = 10
obj.RegistryPullQPS = 5
obj.ResolverConfig = kubetypes.ResolvConfDefault
obj.SerializeImagePulls = true
obj.StreamingConnectionIdleTimeout = metav1.Duration{Duration: 4 * time.Hour}
obj.SyncFrequency = metav1.Duration{Duration: 1 * time.Minute}
obj.ContentType = "application/vnd.kubernetes.protobuf"
obj.KubeAPIQPS = 5
obj.KubeAPIBurst = 10
obj.HairpinMode = v1alpha1.PromiscuousBridge
obj.EvictionHard = map[string]string{
"memory.available": "100Mi",
"nodefs.available": "10%",
"nodefs.inodesFree": "5%",
"imagefs.available": "15%",
}
obj.EvictionPressureTransitionPeriod = metav1.Duration{Duration: 5 * time.Minute}
obj.MakeIPTablesUtilChains = true
obj.IPTablesMasqueradeBit = v1alpha1.DefaultIPTablesMasqueradeBit
obj.IPTablesDropBit = v1alpha1.DefaultIPTablesDropBit
obj.CgroupsPerQOS = true
obj.CgroupDriver = "cgroupfs"
obj.EnforceNodeAllocatable = v1alpha1.DefaultNodeAllocatableEnforcement
obj.ManifestURLHeader = make(map[string][]string)
},
}
}
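The non-empty values set above exist so the defaulter cannot alter a fuzzed object during decode, which would make the round-trip comparison fail. A toy illustration of that interaction, assuming a made-up config type and defaulter rather than the real kubeletconfig API:
package main
import "fmt"
// config and setDefaults are hypothetical; they only mimic how a defaulter
// fills in empty fields the way SetDefaults_KubeletConfiguration does.
type config struct{ Address string }
func setDefaults(c *config) {
	if c.Address == "" {
		c.Address = "0.0.0.0"
	}
}
func main() {
	// A fuzzer that leaves a defaulted field empty breaks the round-trip:
	fuzzed := config{Address: ""}
	decoded := fuzzed
	setDefaults(&decoded)
	fmt.Println(fuzzed == decoded) // false
	// Pinning the field to a concrete value, as Funcs does, keeps it stable:
	pinned := config{Address: "0.0.0.0"}
	decoded = pinned
	setDefaults(&decoded)
	fmt.Println(pinned == decoded) // true
}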

View File

@ -25,7 +25,6 @@ func KubeletConfigurationPathRefs(kc *KubeletConfiguration) []*string {
paths = append(paths, &kc.Authentication.X509.ClientCAFile)
paths = append(paths, &kc.TLSCertFile)
paths = append(paths, &kc.TLSPrivateKeyFile)
paths = append(paths, &kc.SeccompProfileRoot)
paths = append(paths, &kc.ResolverConfig)
return paths
}

View File

@ -132,7 +132,6 @@ var (
"Authentication.X509.ClientCAFile",
"TLSCertFile",
"TLSPrivateKeyFile",
"SeccompProfileRoot",
"ResolverConfig",
)

View File

@ -1,4 +1,4 @@
load("@io_bazel_rules_go//go:def.bzl", "go_library")
load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
go_library(
name = "go_default_library",
@ -26,3 +26,14 @@ filegroup(
tags = ["automanaged"],
visibility = ["//visibility:public"],
)
go_test(
name = "go_default_test",
srcs = ["scheme_test.go"],
importpath = "k8s.io/kubernetes/pkg/kubelet/apis/kubeletconfig/scheme",
library = ":go_default_library",
deps = [
"//pkg/kubelet/apis/kubeletconfig/fuzzer:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/api/testing/roundtrip:go_default_library",
],
)

View File

@ -0,0 +1,32 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package scheme
import (
"testing"
"k8s.io/apimachinery/pkg/api/testing/roundtrip"
"k8s.io/kubernetes/pkg/kubelet/apis/kubeletconfig/fuzzer"
)
func TestRoundTripTypes(t *testing.T) {
scheme, _, err := NewSchemeAndCodecs()
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
roundtrip.RoundTripTestForScheme(t, scheme, fuzzer.Funcs)
}

View File

@ -89,8 +89,6 @@ type KubeletConfiguration struct {
Authentication KubeletAuthentication
// authorization specifies how requests to the Kubelet's server are authorized
Authorization KubeletAuthorization
// seccompProfileRoot is the directory path for seccomp profiles.
SeccompProfileRoot string
// allowPrivileged enables containers to request privileged mode.
// Defaults to false.
AllowPrivileged bool

View File

@ -17,7 +17,6 @@ limitations under the License.
package v1alpha1
import (
"path/filepath"
"time"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@ -37,14 +36,14 @@ const (
// More details here: https://github.com/kubernetes/kubernetes/issues/50986
AutoDetectCloudProvider = "auto-detect"
defaultIPTablesMasqueradeBit = 14
defaultIPTablesDropBit = 15
DefaultIPTablesMasqueradeBit = 14
DefaultIPTablesDropBit = 15
)
var (
zeroDuration = metav1.Duration{}
// Refer to [Node Allocatable](https://git.k8s.io/community/contributors/design-proposals/node/node-allocatable.md) doc for more information.
defaultNodeAllocatableEnforcement = []string{"pods"}
DefaultNodeAllocatableEnforcement = []string{"pods"}
)
func addDefaultingFuncs(scheme *kruntime.Scheme) error {
@ -177,9 +176,6 @@ func SetDefaults_KubeletConfiguration(obj *KubeletConfiguration) {
if obj.SerializeImagePulls == nil {
obj.SerializeImagePulls = boolVar(true)
}
if obj.SeccompProfileRoot == "" {
obj.SeccompProfileRoot = filepath.Join(DefaultRootDir, "seccomp")
}
if obj.StreamingConnectionIdleTimeout == zeroDuration {
obj.StreamingConnectionIdleTimeout = metav1.Duration{Duration: 4 * time.Hour}
}
@ -214,11 +210,11 @@ func SetDefaults_KubeletConfiguration(obj *KubeletConfiguration) {
obj.MakeIPTablesUtilChains = boolVar(true)
}
if obj.IPTablesMasqueradeBit == nil {
temp := int32(defaultIPTablesMasqueradeBit)
temp := int32(DefaultIPTablesMasqueradeBit)
obj.IPTablesMasqueradeBit = &temp
}
if obj.IPTablesDropBit == nil {
temp := int32(defaultIPTablesDropBit)
temp := int32(DefaultIPTablesDropBit)
obj.IPTablesDropBit = &temp
}
if obj.CgroupsPerQOS == nil {
@ -229,7 +225,7 @@ func SetDefaults_KubeletConfiguration(obj *KubeletConfiguration) {
obj.CgroupDriver = "cgroupfs"
}
if obj.EnforceNodeAllocatable == nil {
obj.EnforceNodeAllocatable = defaultNodeAllocatableEnforcement
obj.EnforceNodeAllocatable = DefaultNodeAllocatableEnforcement
}
}

View File

@ -89,8 +89,6 @@ type KubeletConfiguration struct {
Authentication KubeletAuthentication `json:"authentication"`
// authorization specifies how requests to the Kubelet's server are authorized
Authorization KubeletAuthorization `json:"authorization"`
// seccompProfileRoot is the directory path for seccomp profiles.
SeccompProfileRoot string `json:"seccompProfileRoot"`
// allowPrivileged enables containers to request privileged mode.
// Defaults to false.
AllowPrivileged *bool `json:"allowPrivileged"`

View File

@ -163,7 +163,6 @@ func autoConvert_v1alpha1_KubeletConfiguration_To_kubeletconfig_KubeletConfigura
if err := Convert_v1alpha1_KubeletAuthorization_To_kubeletconfig_KubeletAuthorization(&in.Authorization, &out.Authorization, s); err != nil {
return err
}
out.SeccompProfileRoot = in.SeccompProfileRoot
if err := v1.Convert_Pointer_bool_To_bool(&in.AllowPrivileged, &out.AllowPrivileged, s); err != nil {
return err
}
@ -289,7 +288,6 @@ func autoConvert_kubeletconfig_KubeletConfiguration_To_v1alpha1_KubeletConfigura
if err := Convert_kubeletconfig_KubeletAuthorization_To_v1alpha1_KubeletAuthorization(&in.Authorization, &out.Authorization, s); err != nil {
return err
}
out.SeccompProfileRoot = in.SeccompProfileRoot
if err := v1.Convert_bool_To_Pointer_bool(&in.AllowPrivileged, &out.AllowPrivileged, s); err != nil {
return err
}

View File

@ -31,7 +31,7 @@ type cadvisorUnsupported struct {
var _ Interface = new(cadvisorUnsupported)
func New(address string, port uint, imageFsInfoProvider ImageFsInfoProvider, rootPath string) (Interface, error) {
func New(address string, port uint, imageFsInfoProvider ImageFsInfoProvider, rootPath string, usingLegacyStats bool) (Interface, error) {
return &cadvisorUnsupported{}, nil
}

View File

@ -32,7 +32,7 @@ type cadvisorClient struct {
var _ Interface = new(cadvisorClient)
// New creates a cAdvisor and exports its API on the specified port if port > 0.
func New(address string, port uint, imageFsInfoProvider ImageFsInfoProvider, rootPath string) (Interface, error) {
func New(address string, port uint, imageFsInfoProvider ImageFsInfoProvider, rootPath string, usingLegacyStats bool) (Interface, error) {
client, err := winstats.NewPerfCounterClient()
return &cadvisorClient{winStatsClient: client}, err
}

View File

@ -128,7 +128,7 @@ type containerManagerImpl struct {
// Interface for QoS cgroup management
qosContainerManager QOSContainerManager
// Interface for exporting and allocating devices reported by device plugins.
devicePluginHandler deviceplugin.Handler
devicePluginManager deviceplugin.Manager
// Interface for CPU affinity management.
cpuManager cpumanager.Manager
}
@ -274,11 +274,11 @@ func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.I
}
}
glog.Infof("Creating device plugin handler: %t", devicePluginEnabled)
glog.Infof("Creating device plugin manager: %t", devicePluginEnabled)
if devicePluginEnabled {
cm.devicePluginHandler, err = deviceplugin.NewHandlerImpl(updateDeviceCapacityFunc)
cm.devicePluginManager, err = deviceplugin.NewManagerImpl(updateDeviceCapacityFunc)
} else {
cm.devicePluginHandler, err = deviceplugin.NewHandlerStub()
cm.devicePluginManager, err = deviceplugin.NewManagerStub()
}
if err != nil {
return nil, err
@ -597,7 +597,7 @@ func (cm *containerManagerImpl) Start(node *v1.Node,
}, time.Second, stopChan)
// Starts device plugin manager.
if err := cm.devicePluginHandler.Start(deviceplugin.ActivePodsFunc(activePods)); err != nil {
if err := cm.devicePluginManager.Start(deviceplugin.ActivePodsFunc(activePods)); err != nil {
return err
}
return nil
@ -622,7 +622,7 @@ func (cm *containerManagerImpl) GetResources(pod *v1.Pod, container *v1.Containe
opts := &kubecontainer.RunContainerOptions{}
// Allocate should already be called during predicateAdmitHandler.Admit(),
// just try to fetch device runtime information from cached state here
devOpts := cm.devicePluginHandler.GetDeviceRunContainerOptions(pod, container)
devOpts := cm.devicePluginManager.GetDeviceRunContainerOptions(pod, container)
if devOpts == nil {
return opts, nil
}
@ -633,7 +633,7 @@ func (cm *containerManagerImpl) GetResources(pod *v1.Pod, container *v1.Containe
}
func (cm *containerManagerImpl) UpdatePluginResources(node *schedulercache.NodeInfo, attrs *lifecycle.PodAdmitAttributes) error {
return cm.devicePluginHandler.Allocate(node, attrs)
return cm.devicePluginManager.Allocate(node, attrs)
}
func (cm *containerManagerImpl) SystemCgroupsLimit() v1.ResourceList {

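The container manager changes above swap deviceplugin.Handler for deviceplugin.Manager but keep the same wiring: construct the real manager when the feature is enabled, a stub otherwise, then start it with an active-pods callback. A simplified standalone sketch of that enable-or-stub pattern, with hypothetical manager/realManager/stubManager types in place of the real deviceplugin package:
package main
import "fmt"
// manager is a hypothetical stand-in for the deviceplugin.Manager interface.
type manager interface {
	Start(activePods func() []string) error
}
type realManager struct{}
func (m *realManager) Start(activePods func() []string) error {
	fmt.Println("device plugin manager started; active pods:", activePods())
	return nil
}
type stubManager struct{}
func (m *stubManager) Start(activePods func() []string) error { return nil }
// newManager mirrors the enable-or-stub choice made in NewContainerManager.
func newManager(devicePluginEnabled bool) manager {
	if devicePluginEnabled {
		return &realManager{}
	}
	return &stubManager{}
}
func main() {
	activePods := func() []string { return []string{"pod-a", "pod-b"} }
	m := newManager(true)
	if err := m.Start(activePods); err != nil {
		fmt.Println("start failed:", err)
	}
}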
View File

@ -9,11 +9,10 @@ load(
go_library(
name = "go_default_library",
srcs = [
"device_plugin_handler.go",
"device_plugin_handler_stub.go",
"device_plugin_stub.go",
"endpoint.go",
"manager.go",
"manager_stub.go",
"pod_devices.go",
"types.go",
],
@ -49,7 +48,6 @@ filegroup(
go_test(
name = "go_default_test",
srcs = [
"device_plugin_handler_test.go",
"endpoint_test.go",
"manager_test.go",
],

View File

@ -1,365 +0,0 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package deviceplugin
import (
"encoding/json"
"fmt"
"io/ioutil"
"os"
"sync"
"github.com/golang/glog"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/util/sets"
pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1alpha"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
)
// ActivePodsFunc is a function that returns a list of pods to reconcile.
type ActivePodsFunc func() []*v1.Pod
// Handler defines the functions used to manage and access device plugin resources.
type Handler interface {
// Start starts device plugin registration service.
Start(activePods ActivePodsFunc) error
// Devices returns all of registered devices keyed by resourceName.
Devices() map[string][]pluginapi.Device
// Allocate scans through containers in the pod spec
// If it finds the container requires device plugin resource, it:
// 1. Checks whether it already has this information in its cached state.
// 2. If not, it calls Allocate and populate its cached state afterwards.
// 3. If there is no cached state and Allocate fails, it returns an error.
// 4. Otherwise, it updates allocatableResource in nodeInfo if necessary,
// to make sure it is at least equal to the pod's requested capacity for
// any registered device plugin resource
Allocate(node *schedulercache.NodeInfo, attrs *lifecycle.PodAdmitAttributes) error
// GetDeviceRunContainerOptions checks whether we have cached containerDevices
// for the passed-in <pod, container> and returns its DeviceRunContainerOptions
// for the found one. An empty struct is returned in case no cached state is found.
GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Container) *DeviceRunContainerOptions
}
// HandlerImpl implements the actual functionality to manage device plugin resources.
type HandlerImpl struct {
// TODO: consider to change this to RWMutex.
sync.Mutex
// devicePluginManager is an implementation of deviceplugin.Manager interface.
devicePluginManager Manager
// activePods is a method for listing active pods on the node
// so the amount of pluginResources requested by existing pods
// could be counted when updating allocated devices
activePods ActivePodsFunc
// devicePluginManagerMonitorCallback is used for updating devices' states in one time call.
// e.g. a new device is advertised, two old devices are deleted and a running device fails.
devicePluginManagerMonitorCallback MonitorCallback
// allDevices contains all of registered resourceNames and their exported device IDs.
allDevices map[string]sets.String
// allocatedDevices contains allocated deviceIds, keyed by resourceName.
allocatedDevices map[string]sets.String
// podDevices contains pod to allocated device mapping.
podDevices podDevices
}
// NewHandlerImpl creates a HandlerImpl to manage device plugin resources.
// updateCapacityFunc is called to update ContainerManager capacity when
// device capacity changes.
func NewHandlerImpl(updateCapacityFunc func(v1.ResourceList)) (*HandlerImpl, error) {
glog.V(2).Infof("Creating Device Plugin Handler")
handler := &HandlerImpl{
allDevices: make(map[string]sets.String),
allocatedDevices: make(map[string]sets.String),
podDevices: make(podDevices),
}
deviceManagerMonitorCallback := func(resourceName string, added, updated, deleted []pluginapi.Device) {
var capacity = v1.ResourceList{}
kept := append(updated, added...)
if _, ok := handler.allDevices[resourceName]; !ok {
handler.allDevices[resourceName] = sets.NewString()
}
// For now, Handler only keeps track of healthy devices.
// We can revisit this later when the need comes to track unhealthy devices here.
for _, dev := range kept {
if dev.Health == pluginapi.Healthy {
handler.allDevices[resourceName].Insert(dev.ID)
} else {
handler.allDevices[resourceName].Delete(dev.ID)
}
}
for _, dev := range deleted {
handler.allDevices[resourceName].Delete(dev.ID)
}
capacity[v1.ResourceName(resourceName)] = *resource.NewQuantity(int64(handler.allDevices[resourceName].Len()), resource.DecimalSI)
updateCapacityFunc(capacity)
}
mgr, err := NewManagerImpl(pluginapi.KubeletSocket, deviceManagerMonitorCallback)
if err != nil {
return nil, fmt.Errorf("Failed to initialize device plugin manager: %+v", err)
}
handler.devicePluginManager = mgr
handler.devicePluginManagerMonitorCallback = deviceManagerMonitorCallback
return handler, nil
}
// Start initializes podDevices and allocatedDevices information from checkpoint-ed state
// and starts device plugin registration service.
func (h *HandlerImpl) Start(activePods ActivePodsFunc) error {
h.activePods = activePods
// Loads in allocatedDevices information from disk.
err := h.readCheckpoint()
if err != nil {
glog.Warningf("Continue after failing to read checkpoint file. Device allocation info may NOT be up-to-date. Err: %v", err)
}
return h.devicePluginManager.Start()
}
// Devices returns all of registered devices keyed by resourceName.
func (h *HandlerImpl) Devices() map[string][]pluginapi.Device {
return h.devicePluginManager.Devices()
}
// Returns list of device Ids we need to allocate with Allocate rpc call.
// Returns empty list in case we don't need to issue the Allocate rpc call.
func (h *HandlerImpl) devicesToAllocate(podUID, contName, resource string, required int) (sets.String, error) {
h.Lock()
defer h.Unlock()
needed := required
// Gets list of devices that have already been allocated.
// This can happen if a container restarts for example.
devices := h.podDevices.containerDevices(podUID, contName, resource)
if devices != nil {
glog.V(3).Infof("Found pre-allocated devices for resource %s container %q in Pod %q: %v", resource, contName, podUID, devices.List())
needed = needed - devices.Len()
// A pod's resource is not expected to change once admitted by the API server,
// so just fail loudly here. We can revisit this part if this no longer holds.
if needed != 0 {
return nil, fmt.Errorf("pod %v container %v changed request for resource %v from %v to %v", podUID, contName, resource, devices.Len(), required)
}
}
if needed == 0 {
// No change, no work.
return nil, nil
}
devices = sets.NewString()
// Needs to allocate additional devices.
if h.allocatedDevices[resource] == nil {
h.allocatedDevices[resource] = sets.NewString()
}
// Gets Devices in use.
devicesInUse := h.allocatedDevices[resource]
// Gets a list of available devices.
available := h.allDevices[resource].Difference(devicesInUse)
if int(available.Len()) < needed {
return nil, fmt.Errorf("requested number of devices unavailable for %s. Requested: %d, Available: %d", resource, needed, available.Len())
}
allocated := available.UnsortedList()[:needed]
// Updates h.allocatedDevices with allocated devices to prevent them
// from being allocated to other pods/containers, given that we are
// not holding lock during the rpc call.
for _, device := range allocated {
h.allocatedDevices[resource].Insert(device)
devices.Insert(device)
}
return devices, nil
}
// allocateContainerResources attempts to allocate all of required device
// plugin resources for the input container, issues an Allocate rpc request
// for each new device resource requirement, processes their AllocateResponses,
// and updates the cached containerDevices on success.
func (h *HandlerImpl) allocateContainerResources(pod *v1.Pod, container *v1.Container) error {
podUID := string(pod.UID)
contName := container.Name
allocatedDevicesUpdated := false
for k, v := range container.Resources.Limits {
resource := string(k)
needed := int(v.Value())
glog.V(3).Infof("needs %d %s", needed, resource)
if _, registeredResource := h.allDevices[resource]; !registeredResource {
continue
}
// Updates allocatedDevices to garbage collect any stranded resources
// before doing the device plugin allocation.
if !allocatedDevicesUpdated {
h.updateAllocatedDevices(h.activePods())
allocatedDevicesUpdated = true
}
allocDevices, err := h.devicesToAllocate(podUID, contName, resource, needed)
if err != nil {
return err
}
if allocDevices == nil || len(allocDevices) <= 0 {
continue
}
// devicePluginManager.Allocate involves RPC calls to device plugin, which
// could be heavy-weight. Therefore we want to perform this operation outside
// mutex lock. Note that if the Allocate call fails, we may leave container resources
// partially allocated for the failed container. We rely on updateAllocatedDevices()
// to garbage collect these resources later. Another side effect is that if
// we have X resource A and Y resource B in total, and two containers, container1
// and container2 both require X resource A and Y resource B. Both allocation
// requests may fail if we serve them in mixed order.
// TODO: may revisit this part later if we see inefficient resource allocation
// in real use as the result of this. Should also consider parallelizing device
// plugin Allocate grpc calls if it becomes common that a container may require
// resources from multiple device plugins.
resp, err := h.devicePluginManager.Allocate(resource, allocDevices.UnsortedList())
if err != nil {
// In case of allocation failure, we want to restore h.allocatedDevices
// to the actual allocated state from h.podDevices.
h.Lock()
h.allocatedDevices = h.podDevices.devices()
h.Unlock()
return err
}
// Update internal cached podDevices state.
h.Lock()
h.podDevices.insert(podUID, contName, resource, allocDevices, resp)
h.Unlock()
}
// Checkpoints device to container allocation information.
return h.writeCheckpoint()
}
// Allocate attempts to allocate all of required device plugin resources,
// and update Allocatable resources in nodeInfo if necessary
func (h *HandlerImpl) Allocate(node *schedulercache.NodeInfo, attrs *lifecycle.PodAdmitAttributes) error {
pod := attrs.Pod
// TODO: Reuse devices between init containers and regular containers.
for _, container := range pod.Spec.InitContainers {
if err := h.allocateContainerResources(pod, &container); err != nil {
return err
}
}
for _, container := range pod.Spec.Containers {
if err := h.allocateContainerResources(pod, &container); err != nil {
return err
}
}
// quick return if no pluginResources requested
if _, podRequireDevicePluginResource := h.podDevices[string(pod.UID)]; !podRequireDevicePluginResource {
return nil
}
h.sanitizeNodeAllocatable(node)
return nil
}
// sanitizeNodeAllocatable scans through allocatedDevices in DevicePluginHandler
// and if necessary, updates allocatableResource in nodeInfo to at least equal to
// the allocated capacity. This allows pods that have already been scheduled on
// the node to pass GeneralPredicates admission checking even upon device plugin failure.
func (h *HandlerImpl) sanitizeNodeAllocatable(node *schedulercache.NodeInfo) {
var newAllocatableResource *schedulercache.Resource
allocatableResource := node.AllocatableResource()
if allocatableResource.ScalarResources == nil {
allocatableResource.ScalarResources = make(map[v1.ResourceName]int64)
}
for resource, devices := range h.allocatedDevices {
needed := devices.Len()
quant, ok := allocatableResource.ScalarResources[v1.ResourceName(resource)]
if ok && int(quant) >= needed {
continue
}
// Needs to update nodeInfo.AllocatableResource to make sure
// NodeInfo.allocatableResource at least equal to the capacity already allocated.
if newAllocatableResource == nil {
newAllocatableResource = allocatableResource.Clone()
}
newAllocatableResource.ScalarResources[v1.ResourceName(resource)] = int64(needed)
}
if newAllocatableResource != nil {
node.SetAllocatableResource(newAllocatableResource)
}
}
// GetDeviceRunContainerOptions checks whether we have cached containerDevices
// for the passed-in <pod, container> and returns its DeviceRunContainerOptions
// for the found one. An empty struct is returned in case no cached state is found.
func (h *HandlerImpl) GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Container) *DeviceRunContainerOptions {
h.Lock()
defer h.Unlock()
return h.podDevices.deviceRunContainerOptions(string(pod.UID), container.Name)
}
// updateAllocatedDevices gets a list of active pods and then frees any Devices that are bound to
// terminated pods. Returns error on failure.
func (h *HandlerImpl) updateAllocatedDevices(activePods []*v1.Pod) {
h.Lock()
defer h.Unlock()
activePodUids := sets.NewString()
for _, pod := range activePods {
activePodUids.Insert(string(pod.UID))
}
allocatedPodUids := h.podDevices.pods()
podsToBeRemoved := allocatedPodUids.Difference(activePodUids)
if len(podsToBeRemoved) <= 0 {
return
}
glog.V(5).Infof("pods to be removed: %v", podsToBeRemoved.List())
h.podDevices.delete(podsToBeRemoved.List())
// Regenerated allocatedDevices after we update pod allocation information.
h.allocatedDevices = h.podDevices.devices()
}
// Checkpoints device to container allocation information to disk.
func (h *HandlerImpl) writeCheckpoint() error {
h.Lock()
data := h.podDevices.toCheckpointData()
h.Unlock()
dataJSON, err := json.Marshal(data)
if err != nil {
return err
}
filepath := h.devicePluginManager.CheckpointFile()
return ioutil.WriteFile(filepath, dataJSON, 0644)
}
// Reads device to container allocation information from disk, and populates
// h.allocatedDevices accordingly.
func (h *HandlerImpl) readCheckpoint() error {
filepath := h.devicePluginManager.CheckpointFile()
content, err := ioutil.ReadFile(filepath)
if err != nil && !os.IsNotExist(err) {
return fmt.Errorf("failed to read checkpoint file %q: %v", filepath, err)
}
glog.V(2).Infof("Read checkpoint file %s\n", filepath)
var data checkpointData
if err := json.Unmarshal(content, &data); err != nil {
return fmt.Errorf("failed to unmarshal checkpoint data: %v", err)
}
h.Lock()
defer h.Unlock()
h.podDevices.fromCheckpointData(data)
h.allocatedDevices = h.podDevices.devices()
return nil
}
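The checkpoint logic being removed here (writeCheckpoint/readCheckpoint) follows a simple pattern: marshal the pod-to-device allocations to JSON, write them to a well-known file, and read them back later. The sketch below is a standalone, simplified version of that pattern (hypothetical path, plain map instead of podDevices), not the kubelet's actual checkpoint handling:
package main
import (
	"encoding/json"
	"fmt"
	"os"
)
// checkpoint is a simplified stand-in for the checkpoint data:
// pod UID -> container name -> allocated device IDs.
type checkpoint map[string]map[string][]string
func writeCheckpoint(path string, data checkpoint) error {
	raw, err := json.Marshal(data)
	if err != nil {
		return err
	}
	return os.WriteFile(path, raw, 0644)
}
func readCheckpoint(path string) (checkpoint, error) {
	raw, err := os.ReadFile(path)
	if err != nil {
		if os.IsNotExist(err) {
			// No checkpoint yet; start with empty state (simplified here).
			return checkpoint{}, nil
		}
		return nil, fmt.Errorf("failed to read checkpoint file %q: %v", path, err)
	}
	var data checkpoint
	if err := json.Unmarshal(raw, &data); err != nil {
		return nil, fmt.Errorf("failed to unmarshal checkpoint data: %v", err)
	}
	return data, nil
}
func main() {
	path := "/tmp/device-plugin-checkpoint-example" // hypothetical path
	state := checkpoint{"pod1": {"con1": {"dev1", "dev2"}}}
	if err := writeCheckpoint(path, state); err != nil {
		fmt.Println("write failed:", err)
		return
	}
	restored, err := readCheckpoint(path)
	if err != nil {
		fmt.Println("read failed:", err)
		return
	}
	fmt.Println(restored["pod1"]["con1"]) // [dev1 dev2]
}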

View File

@ -1,414 +0,0 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package deviceplugin
import (
"flag"
"fmt"
"reflect"
"testing"
"github.com/stretchr/testify/assert"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/uuid"
pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1alpha"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
)
func TestUpdateCapacity(t *testing.T) {
var expected = v1.ResourceList{}
as := assert.New(t)
verifyCapacityFunc := func(updates v1.ResourceList) {
as.Equal(expected, updates)
}
testHandler, err := NewHandlerImpl(verifyCapacityFunc)
as.NotNil(testHandler)
as.Nil(err)
devs := []pluginapi.Device{
{ID: "Device1", Health: pluginapi.Healthy},
{ID: "Device2", Health: pluginapi.Healthy},
{ID: "Device3", Health: pluginapi.Unhealthy},
}
resourceName := "resource1"
// Adds three devices for resource1, two healthy and one unhealthy.
// Expects capacity for resource1 to be 2.
expected[v1.ResourceName(resourceName)] = *resource.NewQuantity(int64(2), resource.DecimalSI)
testHandler.devicePluginManagerMonitorCallback(resourceName, devs, []pluginapi.Device{}, []pluginapi.Device{})
// Deletes an unhealthy device should NOT change capacity.
testHandler.devicePluginManagerMonitorCallback(resourceName, []pluginapi.Device{}, []pluginapi.Device{}, []pluginapi.Device{devs[2]})
// Updates a healthy device to unhealthy should reduce capacity by 1.
expected[v1.ResourceName(resourceName)] = *resource.NewQuantity(int64(1), resource.DecimalSI)
// Deletes a healthy device should reduce capacity by 1.
expected[v1.ResourceName(resourceName)] = *resource.NewQuantity(int64(0), resource.DecimalSI)
// Tests adding another resource.
delete(expected, v1.ResourceName(resourceName))
resourceName2 := "resource2"
expected[v1.ResourceName(resourceName2)] = *resource.NewQuantity(int64(2), resource.DecimalSI)
testHandler.devicePluginManagerMonitorCallback(resourceName2, devs, []pluginapi.Device{}, []pluginapi.Device{})
}
type stringPairType struct {
value1 string
value2 string
}
// DevicePluginManager stub to test device Allocation behavior.
type DevicePluginManagerTestStub struct {
// All data structs are keyed by resourceName+DevId
devRuntimeDevices map[string][]stringPairType
devRuntimeMounts map[string][]stringPairType
devRuntimeEnvs map[string][]stringPairType
}
func NewDevicePluginManagerTestStub() (*DevicePluginManagerTestStub, error) {
return &DevicePluginManagerTestStub{
devRuntimeDevices: make(map[string][]stringPairType),
devRuntimeMounts: make(map[string][]stringPairType),
devRuntimeEnvs: make(map[string][]stringPairType),
}, nil
}
func (m *DevicePluginManagerTestStub) Start() error {
return nil
}
func (m *DevicePluginManagerTestStub) Devices() map[string][]pluginapi.Device {
return make(map[string][]pluginapi.Device)
}
func (m *DevicePluginManagerTestStub) Allocate(resourceName string, devIds []string) (*pluginapi.AllocateResponse, error) {
resp := new(pluginapi.AllocateResponse)
resp.Envs = make(map[string]string)
for _, id := range devIds {
key := resourceName + id
fmt.Printf("Alloc device %v for resource %v\n", id, resourceName)
for _, dev := range m.devRuntimeDevices[key] {
fmt.Printf("Add dev %v %v\n", dev.value1, dev.value2)
resp.Devices = append(resp.Devices, &pluginapi.DeviceSpec{
ContainerPath: dev.value1,
HostPath: dev.value2,
Permissions: "mrw",
})
}
for _, mount := range m.devRuntimeMounts[key] {
fmt.Printf("Add mount %v %v\n", mount.value1, mount.value2)
resp.Mounts = append(resp.Mounts, &pluginapi.Mount{
ContainerPath: mount.value1,
HostPath: mount.value2,
ReadOnly: true,
})
}
for _, env := range m.devRuntimeEnvs[key] {
fmt.Printf("Add env %v %v\n", env.value1, env.value2)
resp.Envs[env.value1] = env.value2
}
}
return resp, nil
}
func (m *DevicePluginManagerTestStub) Stop() error {
return nil
}
func (m *DevicePluginManagerTestStub) CheckpointFile() string {
return "/tmp/device-plugin-checkpoint"
}
func constructDevices(devices []string) sets.String {
ret := sets.NewString()
for _, dev := range devices {
ret.Insert(dev)
}
return ret
}
func constructAllocResp(devices, mounts, envs map[string]string) *pluginapi.AllocateResponse {
resp := &pluginapi.AllocateResponse{}
for k, v := range devices {
resp.Devices = append(resp.Devices, &pluginapi.DeviceSpec{
HostPath: k,
ContainerPath: v,
Permissions: "mrw",
})
}
for k, v := range mounts {
resp.Mounts = append(resp.Mounts, &pluginapi.Mount{
ContainerPath: k,
HostPath: v,
ReadOnly: true,
})
}
resp.Envs = make(map[string]string)
for k, v := range envs {
resp.Envs[k] = v
}
return resp
}
func TestCheckpoint(t *testing.T) {
resourceName1 := "domain1.com/resource1"
resourceName2 := "domain2.com/resource2"
m, err := NewDevicePluginManagerTestStub()
as := assert.New(t)
as.Nil(err)
testHandler := &HandlerImpl{
devicePluginManager: m,
allDevices: make(map[string]sets.String),
allocatedDevices: make(map[string]sets.String),
podDevices: make(podDevices),
}
testHandler.podDevices.insert("pod1", "con1", resourceName1,
constructDevices([]string{"dev1", "dev2"}),
constructAllocResp(map[string]string{"/dev/r1dev1": "/dev/r1dev1", "/dev/r1dev2": "/dev/r1dev2"},
map[string]string{"/home/r1lib1": "/usr/r1lib1"}, map[string]string{}))
testHandler.podDevices.insert("pod1", "con1", resourceName2,
constructDevices([]string{"dev1", "dev2"}),
constructAllocResp(map[string]string{"/dev/r2dev1": "/dev/r2dev1", "/dev/r2dev2": "/dev/r2dev2"},
map[string]string{"/home/r2lib1": "/usr/r2lib1"},
map[string]string{"r2devices": "dev1 dev2"}))
testHandler.podDevices.insert("pod1", "con2", resourceName1,
constructDevices([]string{"dev3"}),
constructAllocResp(map[string]string{"/dev/r1dev3": "/dev/r1dev3"},
map[string]string{"/home/r1lib1": "/usr/r1lib1"}, map[string]string{}))
testHandler.podDevices.insert("pod2", "con1", resourceName1,
constructDevices([]string{"dev4"}),
constructAllocResp(map[string]string{"/dev/r1dev4": "/dev/r1dev4"},
map[string]string{"/home/r1lib1": "/usr/r1lib1"}, map[string]string{}))
expectedPodDevices := testHandler.podDevices
expectedAllocatedDevices := testHandler.podDevices.devices()
err = testHandler.writeCheckpoint()
as.Nil(err)
testHandler.podDevices = make(podDevices)
err = testHandler.readCheckpoint()
as.Nil(err)
as.Equal(len(expectedPodDevices), len(testHandler.podDevices))
for podUID, containerDevices := range expectedPodDevices {
for conName, resources := range containerDevices {
for resource := range resources {
as.True(reflect.DeepEqual(
expectedPodDevices.containerDevices(podUID, conName, resource),
testHandler.podDevices.containerDevices(podUID, conName, resource)))
opts1 := expectedPodDevices.deviceRunContainerOptions(podUID, conName)
opts2 := testHandler.podDevices.deviceRunContainerOptions(podUID, conName)
as.Equal(len(opts1.Envs), len(opts2.Envs))
as.Equal(len(opts1.Mounts), len(opts2.Mounts))
as.Equal(len(opts1.Devices), len(opts2.Devices))
}
}
}
as.True(reflect.DeepEqual(expectedAllocatedDevices, testHandler.allocatedDevices))
}
type activePodsStub struct {
activePods []*v1.Pod
}
func (a *activePodsStub) getActivePods() []*v1.Pod {
return a.activePods
}
func (a *activePodsStub) updateActivePods(newPods []*v1.Pod) {
a.activePods = newPods
}
func TestPodContainerDeviceAllocation(t *testing.T) {
flag.Set("alsologtostderr", fmt.Sprintf("%t", true))
var logLevel string
flag.StringVar(&logLevel, "logLevel", "4", "test")
flag.Lookup("v").Value.Set(logLevel)
resourceName1 := "domain1.com/resource1"
resourceQuantity1 := *resource.NewQuantity(int64(2), resource.DecimalSI)
devID1 := "dev1"
devID2 := "dev2"
resourceName2 := "domain2.com/resource2"
resourceQuantity2 := *resource.NewQuantity(int64(1), resource.DecimalSI)
devID3 := "dev3"
devID4 := "dev4"
m, err := NewDevicePluginManagerTestStub()
as := assert.New(t)
as.Nil(err)
monitorCallback := func(resourceName string, added, updated, deleted []pluginapi.Device) {}
podsStub := activePodsStub{
activePods: []*v1.Pod{},
}
cachedNode := &v1.Node{
Status: v1.NodeStatus{
Allocatable: v1.ResourceList{},
},
}
nodeInfo := &schedulercache.NodeInfo{}
nodeInfo.SetNode(cachedNode)
testHandler := &HandlerImpl{
devicePluginManager: m,
devicePluginManagerMonitorCallback: monitorCallback,
allDevices: make(map[string]sets.String),
allocatedDevices: make(map[string]sets.String),
podDevices: make(podDevices),
activePods: podsStub.getActivePods,
}
testHandler.allDevices[resourceName1] = sets.NewString()
testHandler.allDevices[resourceName1].Insert(devID1)
testHandler.allDevices[resourceName1].Insert(devID2)
testHandler.allDevices[resourceName2] = sets.NewString()
testHandler.allDevices[resourceName2].Insert(devID3)
testHandler.allDevices[resourceName2].Insert(devID4)
m.devRuntimeDevices[resourceName1+devID1] = append(m.devRuntimeDevices[resourceName1+devID1], stringPairType{"/dev/aaa", "/dev/aaa"})
m.devRuntimeDevices[resourceName1+devID1] = append(m.devRuntimeDevices[resourceName1+devID1], stringPairType{"/dev/bbb", "/dev/bbb"})
m.devRuntimeDevices[resourceName1+devID2] = append(m.devRuntimeDevices[resourceName1+devID2], stringPairType{"/dev/ccc", "/dev/ccc"})
m.devRuntimeMounts[resourceName1+devID1] = append(m.devRuntimeMounts[resourceName1+devID1], stringPairType{"/container_dir1/file1", "host_dir1/file1"})
m.devRuntimeMounts[resourceName1+devID2] = append(m.devRuntimeMounts[resourceName1+devID2], stringPairType{"/container_dir1/file2", "host_dir1/file2"})
m.devRuntimeEnvs[resourceName1+devID2] = append(m.devRuntimeEnvs[resourceName1+devID2], stringPairType{"key1", "val1"})
m.devRuntimeEnvs[resourceName2+devID3] = append(m.devRuntimeEnvs[resourceName2+devID3], stringPairType{"key2", "val2"})
m.devRuntimeEnvs[resourceName2+devID4] = append(m.devRuntimeEnvs[resourceName2+devID4], stringPairType{"key2", "val3"})
pod := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
UID: uuid.NewUUID(),
},
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Name: string(uuid.NewUUID()),
Resources: v1.ResourceRequirements{
Limits: v1.ResourceList{
v1.ResourceName(resourceName1): resourceQuantity1,
v1.ResourceName("cpu"): resourceQuantity1,
v1.ResourceName(resourceName2): resourceQuantity2,
},
},
},
},
},
}
podsStub.updateActivePods([]*v1.Pod{pod})
err = testHandler.Allocate(nodeInfo, &lifecycle.PodAdmitAttributes{Pod: pod})
as.Nil(err)
runContainerOpts := testHandler.GetDeviceRunContainerOptions(pod, &pod.Spec.Containers[0])
as.Equal(len(runContainerOpts.Devices), 3)
as.Equal(len(runContainerOpts.Mounts), 2)
as.Equal(len(runContainerOpts.Envs), 2)
// Requesting to create a pod without enough resources should fail.
as.Equal(2, testHandler.allocatedDevices[resourceName1].Len())
failPod := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
UID: uuid.NewUUID(),
},
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Name: string(uuid.NewUUID()),
Resources: v1.ResourceRequirements{
Limits: v1.ResourceList{
v1.ResourceName(resourceName1): resourceQuantity2,
},
},
},
},
},
}
err = testHandler.Allocate(nodeInfo, &lifecycle.PodAdmitAttributes{Pod: failPod})
as.NotNil(err)
runContainerOpts2 := testHandler.GetDeviceRunContainerOptions(failPod, &failPod.Spec.Containers[0])
as.Nil(runContainerOpts2)
// Requesting to create a new pod with a single resourceName2 should succeed.
newPod := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
UID: uuid.NewUUID(),
},
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Name: string(uuid.NewUUID()),
Resources: v1.ResourceRequirements{
Limits: v1.ResourceList{
v1.ResourceName(resourceName2): resourceQuantity2,
},
},
},
},
},
}
err = testHandler.Allocate(nodeInfo, &lifecycle.PodAdmitAttributes{Pod: newPod})
as.Nil(err)
runContainerOpts3 := testHandler.GetDeviceRunContainerOptions(newPod, &newPod.Spec.Containers[0])
as.Equal(1, len(runContainerOpts3.Envs))
}
func TestSanitizeNodeAllocatable(t *testing.T) {
resourceName1 := "domain1.com/resource1"
devID1 := "dev1"
resourceName2 := "domain2.com/resource2"
devID2 := "dev2"
m, err := NewDevicePluginManagerTestStub()
as := assert.New(t)
as.Nil(err)
monitorCallback := func(resourceName string, added, updated, deleted []pluginapi.Device) {}
testHandler := &HandlerImpl{
devicePluginManager: m,
devicePluginManagerMonitorCallback: monitorCallback,
allDevices: make(map[string]sets.String),
allocatedDevices: make(map[string]sets.String),
podDevices: make(podDevices),
}
// require one of resource1 and one of resource2
testHandler.allocatedDevices[resourceName1] = sets.NewString()
testHandler.allocatedDevices[resourceName1].Insert(devID1)
testHandler.allocatedDevices[resourceName2] = sets.NewString()
testHandler.allocatedDevices[resourceName2].Insert(devID2)
cachedNode := &v1.Node{
Status: v1.NodeStatus{
Allocatable: v1.ResourceList{
// has no resource1 and two of resource2
v1.ResourceName(resourceName2): *resource.NewQuantity(int64(2), resource.DecimalSI),
},
},
}
nodeInfo := &schedulercache.NodeInfo{}
nodeInfo.SetNode(cachedNode)
testHandler.sanitizeNodeAllocatable(nodeInfo)
allocatableScalarResources := nodeInfo.AllocatableResource().ScalarResources
// allocatable in nodeInfo is less than needed, should update
as.Equal(1, int(allocatableScalarResources[v1.ResourceName(resourceName1)]))
// allocatable in nodeInfo is more than needed, should skip updating
as.Equal(2, int(allocatableScalarResources[v1.ResourceName(resourceName2)]))
}

View File

@ -32,7 +32,15 @@ import (
// endpoint maps to a single registered device plugin. It is responsible
// for managing gRPC communications with the device plugin and caching
// device states reported by the device plugin.
type endpoint struct {
type endpoint interface {
run()
stop()
allocate(devs []string) (*pluginapi.AllocateResponse, error)
getDevices() []pluginapi.Device
callback(resourceName string, added, updated, deleted []pluginapi.Device)
}
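Extracting this narrow interface makes it straightforward to substitute a fake endpoint in unit tests; the manager tests later in this diff define a MockEndpoint in the same spirit. A minimal illustrative sketch (not part of the change, names are hypothetical):

// fakeEndpoint is a hypothetical no-op implementation of the endpoint interface, useful as a test double.
type fakeEndpoint struct{}

var _ endpoint = &fakeEndpoint{}

func (f *fakeEndpoint) run()  {}
func (f *fakeEndpoint) stop() {}
func (f *fakeEndpoint) getDevices() []pluginapi.Device { return nil }
func (f *fakeEndpoint) callback(resourceName string, added, updated, deleted []pluginapi.Device) {}
func (f *fakeEndpoint) allocate(devs []string) (*pluginapi.AllocateResponse, error) {
    return &pluginapi.AllocateResponse{}, nil
}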
type endpointImpl struct {
client pluginapi.DevicePluginClient
clientConn *grpc.ClientConn
@ -42,30 +50,34 @@ type endpoint struct {
devices map[string]pluginapi.Device
mutex sync.Mutex
callback MonitorCallback
cb monitorCallback
}
// newEndpointImpl creates a new endpoint for the given resourceName.
func newEndpoint(socketPath, resourceName string, devices map[string]pluginapi.Device, callback MonitorCallback) (*endpoint, error) {
func newEndpointImpl(socketPath, resourceName string, devices map[string]pluginapi.Device, callback monitorCallback) (*endpointImpl, error) {
client, c, err := dial(socketPath)
if err != nil {
glog.Errorf("Can't create new endpoint with path %s err %v", socketPath, err)
return nil, err
}
return &endpoint{
return &endpointImpl{
client: client,
clientConn: c,
socketPath: socketPath,
resourceName: resourceName,
devices: devices,
callback: callback,
devices: devices,
cb: callback,
}, nil
}
func (e *endpoint) getDevices() []pluginapi.Device {
func (e *endpointImpl) callback(resourceName string, added, updated, deleted []pluginapi.Device) {
e.cb(resourceName, added, updated, deleted)
}
func (e *endpointImpl) getDevices() []pluginapi.Device {
e.mutex.Lock()
defer e.mutex.Unlock()
var devs []pluginapi.Device
@ -81,11 +93,9 @@ func (e *endpoint) getDevices() []pluginapi.Device {
// blocks on receiving ListAndWatch gRPC stream updates. Each ListAndWatch
// stream update contains a new list of device states. listAndWatch compares the new
// device states with its cached states to get list of new, updated, and deleted devices.
// It then issues a callback to pass this information to the device_plugin_handler which
// It then issues a callback to pass this information to the device manager which
// will adjust the resource available information accordingly.
func (e *endpoint) run() {
glog.V(3).Infof("Starting ListAndWatch")
func (e *endpointImpl) run() {
stream, err := e.client.ListAndWatch(context.Background(), &pluginapi.Empty{})
if err != nil {
glog.Errorf(errListAndWatch, e.resourceName, err)
@ -162,13 +172,13 @@ func (e *endpoint) run() {
}
// allocate issues Allocate gRPC call to the device plugin.
func (e *endpoint) allocate(devs []string) (*pluginapi.AllocateResponse, error) {
func (e *endpointImpl) allocate(devs []string) (*pluginapi.AllocateResponse, error) {
return e.client.Allocate(context.Background(), &pluginapi.AllocateRequest{
DevicesIDs: devs,
})
}
func (e *endpoint) stop() {
func (e *endpointImpl) stop() {
e.clientConn.Close()
}

View File

@ -87,7 +87,7 @@ func TestRun(t *testing.T) {
}
func TestGetDevices(t *testing.T) {
e := endpoint{
e := endpointImpl{
devices: map[string]pluginapi.Device{
"ADeviceId": {ID: "ADeviceId", Health: pluginapi.Healthy},
},
@ -96,19 +96,19 @@ func TestGetDevices(t *testing.T) {
require.Len(t, devs, 1)
}
func esetup(t *testing.T, devs []*pluginapi.Device, socket, resourceName string, callback MonitorCallback) (*Stub, *endpoint) {
func esetup(t *testing.T, devs []*pluginapi.Device, socket, resourceName string, callback monitorCallback) (*Stub, *endpointImpl) {
p := NewDevicePluginStub(devs, socket)
err := p.Start()
require.NoError(t, err)
e, err := newEndpoint(socket, "mock", make(map[string]pluginapi.Device), func(n string, a, u, r []pluginapi.Device) {})
e, err := newEndpointImpl(socket, "mock", make(map[string]pluginapi.Device), func(n string, a, u, r []pluginapi.Device) {})
require.NoError(t, err)
return p, e
}
func ecleanup(t *testing.T, p *Stub, e *endpoint) {
func ecleanup(t *testing.T, p *Stub, e *endpointImpl) {
p.Stop()
e.stop()
}

View File

@ -17,7 +17,9 @@ limitations under the License.
package deviceplugin
import (
"encoding/json"
"fmt"
"io/ioutil"
"net"
"os"
"path/filepath"
@ -28,27 +30,58 @@ import (
"google.golang.org/grpc"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/util/sets"
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1alpha"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
)
// ActivePodsFunc is a function that returns a list of pods to reconcile.
type ActivePodsFunc func() []*v1.Pod
// monitorCallback is the function called when a device's health state changes,
// or new devices are reported, or old devices are deleted.
// Updated contains the most recent state of the Device.
type monitorCallback func(resourceName string, added, updated, deleted []pluginapi.Device)
// ManagerImpl is the structure in charge of managing Device Plugins.
type ManagerImpl struct {
socketname string
socketdir string
endpoints map[string]*endpoint // Key is ResourceName
endpoints map[string]endpoint // Key is ResourceName
mutex sync.Mutex
callback MonitorCallback
server *grpc.Server
// activePods is a method for listing active pods on the node
// so the amount of pluginResources requested by existing pods
// could be counted when updating allocated devices
activePods ActivePodsFunc
// callback is used for updating devices' states in a single call.
// e.g. a new device is advertised, two old devices are deleted and a running device fails.
callback monitorCallback
// allDevices contains all of registered resourceNames and their exported device IDs.
allDevices map[string]sets.String
// allocatedDevices contains allocated deviceIds, keyed by resourceName.
allocatedDevices map[string]sets.String
// podDevices contains pod to allocated device mapping.
podDevices podDevices
}
// NewManagerImpl creates a new manager on the socket `socketPath`.
// f is the callback that is called when a device becomes unhealthy.
// socketPath is present for testing purposes; in production this is pluginapi.KubeletSocket
func NewManagerImpl(socketPath string, f MonitorCallback) (*ManagerImpl, error) {
// NewManagerImpl creates a new manager. updateCapacityFunc is called to
// update ContainerManager capacity when device capacity changes.
func NewManagerImpl(updateCapacityFunc func(v1.ResourceList)) (*ManagerImpl, error) {
return newManagerImpl(updateCapacityFunc, pluginapi.KubeletSocket)
}
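As a hedged usage sketch (caller-side names here are illustrative, not taken from this change), the new constructor is wired with a function that receives capacity updates whenever device plugin capacity changes:

// Illustrative only: constructing the manager with a capacity-update callback.
updateCapacity := func(updates v1.ResourceList) {
    for name, quantity := range updates {
        // In the kubelet this would feed into the container manager's view of node capacity.
        glog.V(2).Infof("device plugin capacity update: %s=%s", name, quantity.String())
    }
}
dm, err := NewManagerImpl(updateCapacity)
if err != nil {
    glog.Errorf("failed to create device plugin manager: %v", err)
}
_ = dm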
func newManagerImpl(updateCapacityFunc func(v1.ResourceList), socketPath string) (*ManagerImpl, error) {
glog.V(2).Infof("Creating Device Plugin manager at %s", socketPath)
if socketPath == "" || !filepath.IsAbs(socketPath) {
@ -56,13 +89,42 @@ func NewManagerImpl(socketPath string, f MonitorCallback) (*ManagerImpl, error)
}
dir, file := filepath.Split(socketPath)
return &ManagerImpl{
endpoints: make(map[string]*endpoint),
manager := &ManagerImpl{
endpoints: make(map[string]endpoint),
socketname: file,
socketdir: dir,
allDevices: make(map[string]sets.String),
allocatedDevices: make(map[string]sets.String),
podDevices: make(podDevices),
}
socketname: file,
socketdir: dir,
callback: f,
}, nil
manager.callback = func(resourceName string, added, updated, deleted []pluginapi.Device) {
var capacity = v1.ResourceList{}
kept := append(updated, added...)
manager.mutex.Lock()
defer manager.mutex.Unlock()
if _, ok := manager.allDevices[resourceName]; !ok {
manager.allDevices[resourceName] = sets.NewString()
}
// For now, Manager only keeps track of healthy devices.
// We can revisit this later when the need comes to track unhealthy devices here.
for _, dev := range kept {
if dev.Health == pluginapi.Healthy {
manager.allDevices[resourceName].Insert(dev.ID)
} else {
manager.allDevices[resourceName].Delete(dev.ID)
}
}
for _, dev := range deleted {
manager.allDevices[resourceName].Delete(dev.ID)
}
capacity[v1.ResourceName(resourceName)] = *resource.NewQuantity(int64(manager.allDevices[resourceName].Len()), resource.DecimalSI)
updateCapacityFunc(capacity)
}
return manager, nil
}
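To make the capacity bookkeeping in the callback concrete: with a hypothetical resource name and device IDs, registering two healthy devices and one unhealthy device reports a capacity of 2, and deleting one of the healthy devices afterwards drops it to 1 (TestUpdateCapacity later in this diff drives the callback in exactly this way).

// Illustrative only; mirrors how the tests invoke the callback directly.
devs := []pluginapi.Device{
    {ID: "dev-0", Health: pluginapi.Healthy},
    {ID: "dev-1", Health: pluginapi.Healthy},
    {ID: "dev-2", Health: pluginapi.Unhealthy},
}
manager.callback("vendor.example/gpu", devs, []pluginapi.Device{}, []pluginapi.Device{})
// -> updateCapacityFunc is called with {"vendor.example/gpu": 2}
manager.callback("vendor.example/gpu", []pluginapi.Device{}, []pluginapi.Device{}, []pluginapi.Device{devs[1]})
// -> updateCapacityFunc is called with {"vendor.example/gpu": 1}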
func (m *ManagerImpl) removeContents(dir string) error {
@ -77,7 +139,7 @@ func (m *ManagerImpl) removeContents(dir string) error {
}
for _, name := range names {
filePath := filepath.Join(dir, name)
if filePath == m.CheckpointFile() {
if filePath == m.checkpointFile() {
continue
}
stat, err := os.Stat(filePath)
@ -101,15 +163,25 @@ const (
kubeletDevicePluginCheckpoint = "kubelet_internal_checkpoint"
)
// CheckpointFile returns device plugin checkpoint file path.
func (m *ManagerImpl) CheckpointFile() string {
// checkpointFile returns device plugin checkpoint file path.
func (m *ManagerImpl) checkpointFile() string {
return filepath.Join(m.socketdir, kubeletDevicePluginCheckpoint)
}
// Start starts the Device Plugin Manager
func (m *ManagerImpl) Start() error {
// Start starts the Device Plugin Manager and starts initialization of
// podDevices and allocatedDevices information from checkpoint-ed state and
// starts device plugin registration service.
func (m *ManagerImpl) Start(activePods ActivePodsFunc) error {
glog.V(2).Infof("Starting Device Plugin manager")
m.activePods = activePods
// Loads in allocatedDevices information from disk.
err := m.readCheckpoint()
if err != nil {
glog.Warningf("Continue after failing to read checkpoint file. Device allocation info may NOT be up-to-date. Err: %v", err)
}
socketPath := filepath.Join(m.socketdir, m.socketname)
os.MkdirAll(m.socketdir, 0755)
@ -130,6 +202,8 @@ func (m *ManagerImpl) Start() error {
pluginapi.RegisterRegistrationServer(m.server, m)
go m.server.Serve(s)
glog.V(2).Infof("Serving device plugin registration server on %q", socketPath)
return nil
}
@ -150,22 +224,27 @@ func (m *ManagerImpl) Devices() map[string][]pluginapi.Device {
// Allocate is the call that you can use to allocate a set of devices
// from the registered device plugins.
func (m *ManagerImpl) Allocate(resourceName string, devs []string) (*pluginapi.AllocateResponse, error) {
if len(devs) == 0 {
return nil, nil
func (m *ManagerImpl) Allocate(node *schedulercache.NodeInfo, attrs *lifecycle.PodAdmitAttributes) error {
pod := attrs.Pod
// TODO: Reuse devices between init containers and regular containers.
for _, container := range pod.Spec.InitContainers {
if err := m.allocateContainerResources(pod, &container); err != nil {
return err
}
}
for _, container := range pod.Spec.Containers {
if err := m.allocateContainerResources(pod, &container); err != nil {
return err
}
}
glog.V(3).Infof("Recieved allocation request for devices %v for device plugin %s",
devs, resourceName)
m.mutex.Lock()
e, ok := m.endpoints[resourceName]
m.mutex.Unlock()
if !ok {
return nil, fmt.Errorf("Unknown Device Plugin %s", resourceName)
// quick return if no pluginResources requested
if _, podRequireDevicePluginResource := m.podDevices[string(pod.UID)]; !podRequireDevicePluginResource {
return nil
}
return e.allocate(devs)
m.sanitizeNodeAllocatable(node)
return nil
}
// Register registers a device plugin.
@ -211,12 +290,16 @@ func (m *ManagerImpl) addEndpoint(r *pluginapi.RegisterRequest) {
if ok && old != nil {
// Pass devices of previous endpoint into re-registered one,
// to avoid potential orphaned devices upon re-registration
existingDevs = old.devices
devices := make(map[string]pluginapi.Device)
for _, device := range old.getDevices() {
devices[device.ID] = device
}
existingDevs = devices
}
m.mutex.Unlock()
socketPath := filepath.Join(m.socketdir, r.Endpoint)
e, err := newEndpoint(socketPath, r.ResourceName, existingDevs, m.callback)
e, err := newEndpointImpl(socketPath, r.ResourceName, existingDevs, m.callback)
if err != nil {
glog.Errorf("Failed to dial device plugin with request %v: %v", r, err)
return
@ -259,3 +342,212 @@ func (m *ManagerImpl) addEndpoint(r *pluginapi.RegisterRequest) {
m.mutex.Unlock()
}()
}
// Checkpoints device to container allocation information to disk.
func (m *ManagerImpl) writeCheckpoint() error {
m.mutex.Lock()
data := m.podDevices.toCheckpointData()
m.mutex.Unlock()
dataJSON, err := json.Marshal(data)
if err != nil {
return err
}
filepath := m.checkpointFile()
return ioutil.WriteFile(filepath, dataJSON, 0644)
}
// Reads device to container allocation information from disk, and populates
// m.allocatedDevices accordingly.
func (m *ManagerImpl) readCheckpoint() error {
filepath := m.checkpointFile()
content, err := ioutil.ReadFile(filepath)
if err != nil && !os.IsNotExist(err) {
return fmt.Errorf("failed to read checkpoint file %q: %v", filepath, err)
}
glog.V(2).Infof("Read checkpoint file %s\n", filepath)
var data checkpointData
if err := json.Unmarshal(content, &data); err != nil {
return fmt.Errorf("failed to unmarshal checkpoint data: %v", err)
}
m.mutex.Lock()
defer m.mutex.Unlock()
m.podDevices.fromCheckpointData(data)
m.allocatedDevices = m.podDevices.devices()
return nil
}
// updateAllocatedDevices gets a list of active pods and then frees any devices that are bound to
// terminated pods.
func (m *ManagerImpl) updateAllocatedDevices(activePods []*v1.Pod) {
m.mutex.Lock()
defer m.mutex.Unlock()
activePodUids := sets.NewString()
for _, pod := range activePods {
activePodUids.Insert(string(pod.UID))
}
allocatedPodUids := m.podDevices.pods()
podsToBeRemoved := allocatedPodUids.Difference(activePodUids)
if len(podsToBeRemoved) <= 0 {
return
}
glog.V(5).Infof("pods to be removed: %v", podsToBeRemoved.List())
m.podDevices.delete(podsToBeRemoved.List())
// Regenerate allocatedDevices after we update pod allocation information.
m.allocatedDevices = m.podDevices.devices()
}
// Returns list of device Ids we need to allocate with Allocate rpc call.
// Returns empty list in case we don't need to issue the Allocate rpc call.
func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, required int) (sets.String, error) {
m.mutex.Lock()
defer m.mutex.Unlock()
needed := required
// Gets list of devices that have already been allocated.
// This can happen if a container restarts for example.
devices := m.podDevices.containerDevices(podUID, contName, resource)
if devices != nil {
glog.V(3).Infof("Found pre-allocated devices for resource %s container %q in Pod %q: %v", resource, contName, podUID, devices.List())
needed = needed - devices.Len()
// A pod's resource is not expected to change once admitted by the API server,
// so just fail loudly here. We can revisit this part if this no longer holds.
if needed != 0 {
return nil, fmt.Errorf("pod %v container %v changed request for resource %v from %v to %v", podUID, contName, resource, devices.Len(), required)
}
}
if needed == 0 {
// No change, no work.
return nil, nil
}
devices = sets.NewString()
// Needs to allocate additional devices.
if m.allocatedDevices[resource] == nil {
m.allocatedDevices[resource] = sets.NewString()
}
// Gets Devices in use.
devicesInUse := m.allocatedDevices[resource]
// Gets a list of available devices.
available := m.allDevices[resource].Difference(devicesInUse)
if int(available.Len()) < needed {
return nil, fmt.Errorf("requested number of devices unavailable for %s. Requested: %d, Available: %d", resource, needed, available.Len())
}
allocated := available.UnsortedList()[:needed]
// Updates m.allocatedDevices with allocated devices to prevent them
// from being allocated to other pods/containers, given that we are
// not holding lock during the rpc call.
for _, device := range allocated {
m.allocatedDevices[resource].Insert(device)
devices.Insert(device)
}
return devices, nil
}
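A worked example of the selection logic above, with hypothetical device IDs:

// Illustrative only:
//   allDevices["vendor.example/gpu"]       = {"d1", "d2", "d3"}
//   allocatedDevices["vendor.example/gpu"] = {"d1"}
// A container with no prior allocation that requires 2 devices gets two of {"d2", "d3"},
// and both are recorded in allocatedDevices before the Allocate RPC is issued.
// Requiring 3 instead fails with:
//   "requested number of devices unavailable for vendor.example/gpu. Requested: 3, Available: 2"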
// allocateContainerResources attempts to allocate all of the required device
// plugin resources for the input container, issues an Allocate rpc request
// for each new device resource requirement, processes their AllocateResponses,
// and updates the cached containerDevices on success.
func (m *ManagerImpl) allocateContainerResources(pod *v1.Pod, container *v1.Container) error {
podUID := string(pod.UID)
contName := container.Name
allocatedDevicesUpdated := false
for k, v := range container.Resources.Limits {
resource := string(k)
needed := int(v.Value())
glog.V(3).Infof("needs %d %s", needed, resource)
if _, registeredResource := m.allDevices[resource]; !registeredResource {
continue
}
// Updates allocatedDevices to garbage collect any stranded resources
// before doing the device plugin allocation.
if !allocatedDevicesUpdated {
m.updateAllocatedDevices(m.activePods())
allocatedDevicesUpdated = true
}
allocDevices, err := m.devicesToAllocate(podUID, contName, resource, needed)
if err != nil {
return err
}
if allocDevices == nil || len(allocDevices) <= 0 {
continue
}
// devicePluginManager.Allocate involves RPC calls to device plugin, which
// could be heavy-weight. Therefore we want to perform this operation outside
// mutex lock. Note if Allocate call fails, we may leave container resources
// partially allocated for the failed container. We rely on updateAllocatedDevices()
// to garbage collect these resources later. Another side effect is that if
// we have X resource A and Y resource B in total, and two containers, container1
// and container2 both require X resource A and Y resource B. Both allocation
// requests may fail if we serve them in mixed order.
// TODO: may revisit this part later if we see inefficient resource allocation
// in real use as the result of this. Should also consider parallelizing device
// plugin Allocate gRPC calls if it becomes common that a container may require
// resources from multiple device plugins.
m.mutex.Lock()
e, ok := m.endpoints[resource]
m.mutex.Unlock()
if !ok {
m.mutex.Lock()
m.allocatedDevices = m.podDevices.devices()
m.mutex.Unlock()
return fmt.Errorf("Unknown Device Plugin %s", resource)
}
devs := allocDevices.UnsortedList()
glog.V(3).Infof("Making allocation request for devices %v for device plugin %s", devs, resource)
resp, err := e.allocate(devs)
if err != nil {
// In case of allocation failure, we want to restore m.allocatedDevices
// to the actual allocated state from m.podDevices.
m.mutex.Lock()
m.allocatedDevices = m.podDevices.devices()
m.mutex.Unlock()
return err
}
// Update internal cached podDevices state.
m.mutex.Lock()
m.podDevices.insert(podUID, contName, resource, allocDevices, resp)
m.mutex.Unlock()
}
// Checkpoints device to container allocation information.
return m.writeCheckpoint()
}
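For illustration, a container limit such as the one below (the resource name matches the ones used in the tests; the wiring around it is hypothetical) results in a single Allocate RPC carrying two device IDs, while the "cpu" entry is skipped because it is not a registered device plugin resource:

// Illustrative only.
limits := v1.ResourceList{
    v1.ResourceName("domain1.com/resource1"): *resource.NewQuantity(2, resource.DecimalSI),
    v1.ResourceName("cpu"):                   *resource.NewQuantity(2, resource.DecimalSI),
}
_ = limits // allocateContainerResources iterates these limits per container.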
// GetDeviceRunContainerOptions checks whether we have cached containerDevices
// for the passed-in <pod, container> and returns its DeviceRunContainerOptions
// for the found one. An empty struct is returned in case no cached state is found.
func (m *ManagerImpl) GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Container) *DeviceRunContainerOptions {
m.mutex.Lock()
defer m.mutex.Unlock()
return m.podDevices.deviceRunContainerOptions(string(pod.UID), container.Name)
}
// sanitizeNodeAllocatable scans through allocatedDevices in the device manager
// and, if necessary, updates allocatableResource in nodeInfo to be at least equal to
// the allocated capacity. This allows pods that have already been scheduled on
// the node to pass GeneralPredicates admission checking even upon device plugin failure.
func (m *ManagerImpl) sanitizeNodeAllocatable(node *schedulercache.NodeInfo) {
var newAllocatableResource *schedulercache.Resource
allocatableResource := node.AllocatableResource()
if allocatableResource.ScalarResources == nil {
allocatableResource.ScalarResources = make(map[v1.ResourceName]int64)
}
for resource, devices := range m.allocatedDevices {
needed := devices.Len()
quant, ok := allocatableResource.ScalarResources[v1.ResourceName(resource)]
if ok && int(quant) >= needed {
continue
}
// Needs to update nodeInfo.AllocatableResource to make sure
// NodeInfo.allocatableResource is at least equal to the capacity already allocated.
if newAllocatableResource == nil {
newAllocatableResource = allocatableResource.Clone()
}
newAllocatableResource.ScalarResources[v1.ResourceName(resource)] = int64(needed)
}
if newAllocatableResource != nil {
node.SetAllocatableResource(newAllocatableResource)
}
}
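To illustrate with hypothetical numbers (TestSanitizeNodeAllocatable further below exercises the same scenario):

// Illustrative only:
//   allocatedDevices = { "domain1.com/resource1": 1, "domain2.com/resource2": 1 }
//   node allocatable = { "domain2.com/resource2": 2 }
// After sanitizeNodeAllocatable:
//   "domain1.com/resource1" allocatable -> 1 (raised to cover what is already allocated)
//   "domain2.com/resource2" allocatable -> 2 (left unchanged; already >= allocated)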

View File

@ -23,30 +23,35 @@ import (
"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
)
// HandlerStub provides a simple stub implementation for Handler.
type HandlerStub struct{}
// ManagerStub provides a simple stub implementation for the Device Manager.
type ManagerStub struct{}
// NewHandlerStub creates a HandlerStub.
func NewHandlerStub() (*HandlerStub, error) {
return &HandlerStub{}, nil
// NewManagerStub creates a ManagerStub.
func NewManagerStub() (*ManagerStub, error) {
return &ManagerStub{}, nil
}
// Start simply returns nil.
func (h *HandlerStub) Start(activePods ActivePodsFunc) error {
func (h *ManagerStub) Start(activePods ActivePodsFunc) error {
return nil
}
// Stop simply returns nil.
func (h *ManagerStub) Stop() error {
return nil
}
// Devices returns an empty map.
func (h *HandlerStub) Devices() map[string][]pluginapi.Device {
func (h *ManagerStub) Devices() map[string][]pluginapi.Device {
return make(map[string][]pluginapi.Device)
}
// Allocate simply returns nil.
func (h *HandlerStub) Allocate(node *schedulercache.NodeInfo, attrs *lifecycle.PodAdmitAttributes) error {
func (h *ManagerStub) Allocate(node *schedulercache.NodeInfo, attrs *lifecycle.PodAdmitAttributes) error {
return nil
}
// GetDeviceRunContainerOptions simply returns nil.
func (h *HandlerStub) GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Container) *DeviceRunContainerOptions {
func (h *ManagerStub) GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Container) *DeviceRunContainerOptions {
return nil
}

View File

@ -17,13 +17,23 @@ limitations under the License.
package deviceplugin
import (
"flag"
"fmt"
"reflect"
"sync/atomic"
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/uuid"
pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1alpha"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
)
const (
@ -33,10 +43,8 @@ const (
)
func TestNewManagerImpl(t *testing.T) {
_, err := NewManagerImpl("", func(n string, a, u, r []pluginapi.Device) {})
require.Error(t, err)
_, err = NewManagerImpl(socketName, func(n string, a, u, r []pluginapi.Device) {})
verifyCapacityFunc := func(updates v1.ResourceList) {}
_, err := newManagerImpl(verifyCapacityFunc, socketName)
require.NoError(t, err)
}
@ -72,6 +80,7 @@ func TestDevicePluginReRegistration(t *testing.T) {
m, p1 := setup(t, devs, callback)
p1.Register(socketName, testResourceName)
// Wait for the first callback to be issued.
<-callbackChan
// Wait till the endpoint is added to the manager.
for i := 0; i < 20; i++ {
@ -113,10 +122,17 @@ func TestDevicePluginReRegistration(t *testing.T) {
}
func setup(t *testing.T, devs []*pluginapi.Device, callback MonitorCallback) (Manager, *Stub) {
m, err := NewManagerImpl(socketName, callback)
func setup(t *testing.T, devs []*pluginapi.Device, callback monitorCallback) (Manager, *Stub) {
updateCapacity := func(v1.ResourceList) {}
m, err := newManagerImpl(updateCapacity, socketName)
require.NoError(t, err)
err = m.Start()
m.callback = callback
activePods := func() []*v1.Pod {
return []*v1.Pod{}
}
err = m.Start(activePods)
require.NoError(t, err)
p := NewDevicePluginStub(devs, pluginSocketName)
@ -130,3 +146,387 @@ func cleanup(t *testing.T, m Manager, p *Stub) {
p.Stop()
m.Stop()
}
func TestUpdateCapacity(t *testing.T) {
var expected = v1.ResourceList{}
as := assert.New(t)
verifyCapacityFunc := func(updates v1.ResourceList) {
as.Equal(expected, updates)
}
testManager, err := newManagerImpl(verifyCapacityFunc, socketName)
as.NotNil(testManager)
as.Nil(err)
devs := []pluginapi.Device{
{ID: "Device1", Health: pluginapi.Healthy},
{ID: "Device2", Health: pluginapi.Healthy},
{ID: "Device3", Health: pluginapi.Unhealthy},
}
resourceName := "resource1"
// Adds three devices for resource1, two healthy and one unhealthy.
// Expects capacity for resource1 to be 2.
expected[v1.ResourceName(resourceName)] = *resource.NewQuantity(int64(2), resource.DecimalSI)
testManager.callback(resourceName, devs, []pluginapi.Device{}, []pluginapi.Device{})
// Deleting an unhealthy device should NOT change capacity.
testManager.callback(resourceName, []pluginapi.Device{}, []pluginapi.Device{}, []pluginapi.Device{devs[2]})
// Updating a healthy device to unhealthy should reduce capacity by 1.
expected[v1.ResourceName(resourceName)] = *resource.NewQuantity(int64(1), resource.DecimalSI)
// Deleting a healthy device should reduce capacity by 1.
expected[v1.ResourceName(resourceName)] = *resource.NewQuantity(int64(0), resource.DecimalSI)
// Tests adding another resource.
delete(expected, v1.ResourceName(resourceName))
resourceName2 := "resource2"
expected[v1.ResourceName(resourceName2)] = *resource.NewQuantity(int64(2), resource.DecimalSI)
testManager.callback(resourceName2, devs, []pluginapi.Device{}, []pluginapi.Device{})
}
type stringPairType struct {
value1 string
value2 string
}
func constructDevices(devices []string) sets.String {
ret := sets.NewString()
for _, dev := range devices {
ret.Insert(dev)
}
return ret
}
func constructAllocResp(devices, mounts, envs map[string]string) *pluginapi.AllocateResponse {
resp := &pluginapi.AllocateResponse{}
for k, v := range devices {
resp.Devices = append(resp.Devices, &pluginapi.DeviceSpec{
HostPath: k,
ContainerPath: v,
Permissions: "mrw",
})
}
for k, v := range mounts {
resp.Mounts = append(resp.Mounts, &pluginapi.Mount{
ContainerPath: k,
HostPath: v,
ReadOnly: true,
})
}
resp.Envs = make(map[string]string)
for k, v := range envs {
resp.Envs[k] = v
}
return resp
}
func TestCheckpoint(t *testing.T) {
resourceName1 := "domain1.com/resource1"
resourceName2 := "domain2.com/resource2"
testManager := &ManagerImpl{
allDevices: make(map[string]sets.String),
allocatedDevices: make(map[string]sets.String),
podDevices: make(podDevices),
}
testManager.podDevices.insert("pod1", "con1", resourceName1,
constructDevices([]string{"dev1", "dev2"}),
constructAllocResp(map[string]string{"/dev/r1dev1": "/dev/r1dev1", "/dev/r1dev2": "/dev/r1dev2"},
map[string]string{"/home/r1lib1": "/usr/r1lib1"}, map[string]string{}))
testManager.podDevices.insert("pod1", "con1", resourceName2,
constructDevices([]string{"dev1", "dev2"}),
constructAllocResp(map[string]string{"/dev/r2dev1": "/dev/r2dev1", "/dev/r2dev2": "/dev/r2dev2"},
map[string]string{"/home/r2lib1": "/usr/r2lib1"},
map[string]string{"r2devices": "dev1 dev2"}))
testManager.podDevices.insert("pod1", "con2", resourceName1,
constructDevices([]string{"dev3"}),
constructAllocResp(map[string]string{"/dev/r1dev3": "/dev/r1dev3"},
map[string]string{"/home/r1lib1": "/usr/r1lib1"}, map[string]string{}))
testManager.podDevices.insert("pod2", "con1", resourceName1,
constructDevices([]string{"dev4"}),
constructAllocResp(map[string]string{"/dev/r1dev4": "/dev/r1dev4"},
map[string]string{"/home/r1lib1": "/usr/r1lib1"}, map[string]string{}))
expectedPodDevices := testManager.podDevices
expectedAllocatedDevices := testManager.podDevices.devices()
err := testManager.writeCheckpoint()
as := assert.New(t)
as.Nil(err)
testManager.podDevices = make(podDevices)
err = testManager.readCheckpoint()
as.Nil(err)
as.Equal(len(expectedPodDevices), len(testManager.podDevices))
for podUID, containerDevices := range expectedPodDevices {
for conName, resources := range containerDevices {
for resource := range resources {
as.True(reflect.DeepEqual(
expectedPodDevices.containerDevices(podUID, conName, resource),
testManager.podDevices.containerDevices(podUID, conName, resource)))
opts1 := expectedPodDevices.deviceRunContainerOptions(podUID, conName)
opts2 := testManager.podDevices.deviceRunContainerOptions(podUID, conName)
as.Equal(len(opts1.Envs), len(opts2.Envs))
as.Equal(len(opts1.Mounts), len(opts2.Mounts))
as.Equal(len(opts1.Devices), len(opts2.Devices))
}
}
}
as.True(reflect.DeepEqual(expectedAllocatedDevices, testManager.allocatedDevices))
}
type activePodsStub struct {
activePods []*v1.Pod
}
func (a *activePodsStub) getActivePods() []*v1.Pod {
return a.activePods
}
func (a *activePodsStub) updateActivePods(newPods []*v1.Pod) {
a.activePods = newPods
}
type MockEndpoint struct {
allocateFunc func(devs []string) (*pluginapi.AllocateResponse, error)
}
func (m *MockEndpoint) stop() {}
func (m *MockEndpoint) run() {}
func (m *MockEndpoint) getDevices() []pluginapi.Device {
return []pluginapi.Device{}
}
func (m *MockEndpoint) callback(resourceName string, added, updated, deleted []pluginapi.Device) {}
func (m *MockEndpoint) allocate(devs []string) (*pluginapi.AllocateResponse, error) {
if m.allocateFunc != nil {
return m.allocateFunc(devs)
}
return nil, nil
}
func TestPodContainerDeviceAllocation(t *testing.T) {
flag.Set("alsologtostderr", fmt.Sprintf("%t", true))
var logLevel string
flag.StringVar(&logLevel, "logLevel", "4", "test")
flag.Lookup("v").Value.Set(logLevel)
resourceName1 := "domain1.com/resource1"
resourceQuantity1 := *resource.NewQuantity(int64(2), resource.DecimalSI)
devID1 := "dev1"
devID2 := "dev2"
resourceName2 := "domain2.com/resource2"
resourceQuantity2 := *resource.NewQuantity(int64(1), resource.DecimalSI)
devID3 := "dev3"
devID4 := "dev4"
as := require.New(t)
monitorCallback := func(resourceName string, added, updated, deleted []pluginapi.Device) {}
podsStub := activePodsStub{
activePods: []*v1.Pod{},
}
cachedNode := &v1.Node{
Status: v1.NodeStatus{
Allocatable: v1.ResourceList{},
},
}
nodeInfo := &schedulercache.NodeInfo{}
nodeInfo.SetNode(cachedNode)
testManager := &ManagerImpl{
callback: monitorCallback,
allDevices: make(map[string]sets.String),
allocatedDevices: make(map[string]sets.String),
endpoints: make(map[string]endpoint),
podDevices: make(podDevices),
activePods: podsStub.getActivePods,
}
testManager.allDevices[resourceName1] = sets.NewString()
testManager.allDevices[resourceName1].Insert(devID1)
testManager.allDevices[resourceName1].Insert(devID2)
testManager.allDevices[resourceName2] = sets.NewString()
testManager.allDevices[resourceName2].Insert(devID3)
testManager.allDevices[resourceName2].Insert(devID4)
testManager.endpoints[resourceName1] = &MockEndpoint{
allocateFunc: func(devs []string) (*pluginapi.AllocateResponse, error) {
resp := new(pluginapi.AllocateResponse)
resp.Envs = make(map[string]string)
for _, dev := range devs {
switch dev {
case "dev1":
resp.Devices = append(resp.Devices, &pluginapi.DeviceSpec{
ContainerPath: "/dev/aaa",
HostPath: "/dev/aaa",
Permissions: "mrw",
})
resp.Devices = append(resp.Devices, &pluginapi.DeviceSpec{
ContainerPath: "/dev/bbb",
HostPath: "/dev/bbb",
Permissions: "mrw",
})
resp.Mounts = append(resp.Mounts, &pluginapi.Mount{
ContainerPath: "/container_dir1/file1",
HostPath: "host_dir1/file1",
ReadOnly: true,
})
case "dev2":
resp.Devices = append(resp.Devices, &pluginapi.DeviceSpec{
ContainerPath: "/dev/ccc",
HostPath: "/dev/ccc",
Permissions: "mrw",
})
resp.Mounts = append(resp.Mounts, &pluginapi.Mount{
ContainerPath: "/container_dir1/file2",
HostPath: "host_dir1/file2",
ReadOnly: true,
})
resp.Envs["key1"] = "val1"
}
}
return resp, nil
},
}
testManager.endpoints[resourceName2] = &MockEndpoint{
allocateFunc: func(devs []string) (*pluginapi.AllocateResponse, error) {
resp := new(pluginapi.AllocateResponse)
resp.Envs = make(map[string]string)
for _, dev := range devs {
switch dev {
case "dev3":
resp.Envs["key2"] = "val2"
case "dev4":
resp.Envs["key2"] = "val3"
}
}
return resp, nil
},
}
pod := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
UID: uuid.NewUUID(),
},
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Name: string(uuid.NewUUID()),
Resources: v1.ResourceRequirements{
Limits: v1.ResourceList{
v1.ResourceName(resourceName1): resourceQuantity1,
v1.ResourceName("cpu"): resourceQuantity1,
v1.ResourceName(resourceName2): resourceQuantity2,
},
},
},
},
},
}
podsStub.updateActivePods([]*v1.Pod{pod})
err := testManager.Allocate(nodeInfo, &lifecycle.PodAdmitAttributes{Pod: pod})
as.Nil(err)
runContainerOpts := testManager.GetDeviceRunContainerOptions(pod, &pod.Spec.Containers[0])
as.NotNil(runContainerOpts)
as.Equal(len(runContainerOpts.Devices), 3)
as.Equal(len(runContainerOpts.Mounts), 2)
as.Equal(len(runContainerOpts.Envs), 2)
// Requesting to create a pod without enough resources should fail.
as.Equal(2, testManager.allocatedDevices[resourceName1].Len())
failPod := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
UID: uuid.NewUUID(),
},
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Name: string(uuid.NewUUID()),
Resources: v1.ResourceRequirements{
Limits: v1.ResourceList{
v1.ResourceName(resourceName1): resourceQuantity2,
},
},
},
},
},
}
err = testManager.Allocate(nodeInfo, &lifecycle.PodAdmitAttributes{Pod: failPod})
as.NotNil(err)
runContainerOpts2 := testManager.GetDeviceRunContainerOptions(failPod, &failPod.Spec.Containers[0])
as.Nil(runContainerOpts2)
// Requesting to create a new pod with a single resourceName2 should succeed.
newPod := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
UID: uuid.NewUUID(),
},
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Name: string(uuid.NewUUID()),
Resources: v1.ResourceRequirements{
Limits: v1.ResourceList{
v1.ResourceName(resourceName2): resourceQuantity2,
},
},
},
},
},
}
err = testManager.Allocate(nodeInfo, &lifecycle.PodAdmitAttributes{Pod: newPod})
as.Nil(err)
runContainerOpts3 := testManager.GetDeviceRunContainerOptions(newPod, &newPod.Spec.Containers[0])
as.Equal(1, len(runContainerOpts3.Envs))
}
func TestSanitizeNodeAllocatable(t *testing.T) {
resourceName1 := "domain1.com/resource1"
devID1 := "dev1"
resourceName2 := "domain2.com/resource2"
devID2 := "dev2"
as := assert.New(t)
monitorCallback := func(resourceName string, added, updated, deleted []pluginapi.Device) {}
testManager := &ManagerImpl{
callback: monitorCallback,
allDevices: make(map[string]sets.String),
allocatedDevices: make(map[string]sets.String),
podDevices: make(podDevices),
}
// require one of resource1 and one of resource2
testManager.allocatedDevices[resourceName1] = sets.NewString()
testManager.allocatedDevices[resourceName1].Insert(devID1)
testManager.allocatedDevices[resourceName2] = sets.NewString()
testManager.allocatedDevices[resourceName2].Insert(devID2)
cachedNode := &v1.Node{
Status: v1.NodeStatus{
Allocatable: v1.ResourceList{
// has no resource1 and two of resource2
v1.ResourceName(resourceName2): *resource.NewQuantity(int64(2), resource.DecimalSI),
},
},
}
nodeInfo := &schedulercache.NodeInfo{}
nodeInfo.SetNode(cachedNode)
testManager.sanitizeNodeAllocatable(nodeInfo)
allocatableScalarResources := nodeInfo.AllocatableResource().ScalarResources
// allocatable in nodeInfo is less than needed, should update
as.Equal(1, int(allocatableScalarResources[v1.ResourceName(resourceName1)]))
// allocatable in nodeInfo is more than needed, should skip updating
as.Equal(2, int(allocatableScalarResources[v1.ResourceName(resourceName2)]))
}

View File

@ -116,6 +116,11 @@ func (pdev podDevices) toCheckpointData() checkpointData {
for conName, resources := range containerDevices {
for resource, devices := range resources {
devIds := devices.deviceIds.UnsortedList()
if devices.allocResp == nil {
glog.Errorf("Can't marshal allocResp for %v %v %v: allocation response is missing", podUID, conName, resource)
continue
}
allocResp, err := devices.allocResp.Marshal()
if err != nil {
glog.Errorf("Can't marshal allocResp for %v %v %v: %v", podUID, conName, resource, err)

View File

@ -17,34 +17,40 @@ limitations under the License.
package deviceplugin
import (
"k8s.io/api/core/v1"
pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1alpha"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/plugin/pkg/scheduler/schedulercache"
)
// MonitorCallback is the function called when a device's health state changes,
// or new devices are reported, or old devices are deleted.
// Updated contains the most recent state of the Device.
type MonitorCallback func(resourceName string, added, updated, deleted []pluginapi.Device)
// Manager manages all the Device Plugins running on a node.
type Manager interface {
// Start starts the gRPC Registration service.
Start() error
// Start starts device plugin registration service.
Start(activePods ActivePodsFunc) error
// Devices is the map of devices that have registered themselves
// against the manager.
// The map key is the ResourceName of the device plugins.
Devices() map[string][]pluginapi.Device
// Allocate takes resourceName and list of device Ids, and calls the
// gRPC Allocate on the device plugin matching the resourceName.
Allocate(string, []string) (*pluginapi.AllocateResponse, error)
// Allocate configures and assigns devices to pods. The pods are provided
// through the pod admission attributes in the attrs argument. From the
// requested device resources, Allocate will communicate with the owning
// device plugin to allow setup procedures to take place, and for the
// device plugin to provide runtime settings to use the device (environment
// variables, mount points and device files). The node object is provided
// for the device manager to update the node capacity to reflect the
// currently available devices.
Allocate(node *schedulercache.NodeInfo, attrs *lifecycle.PodAdmitAttributes) error
// Stop stops the manager.
Stop() error
// Returns checkpoint file path.
CheckpointFile() string
// GetDeviceRunContainerOptions checks whether we have cached containerDevices
// for the passed-in <pod, container> and returns its DeviceRunContainerOptions
// for the found one. An empty struct is returned in case no cached state is found.
GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Container) *DeviceRunContainerOptions
}
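A hedged sketch of how a caller might drive this interface end to end; the function below and its error handling are illustrative only and not part of this change:

// Illustrative only: lifecycle of a device plugin Manager from the kubelet's point of view.
func runDeviceManager(dm Manager, activePods ActivePodsFunc, node *schedulercache.NodeInfo, attrs *lifecycle.PodAdmitAttributes) error {
    if err := dm.Start(activePods); err != nil {
        return err
    }
    // Admission-time allocation for the pod carried in attrs.
    if err := dm.Allocate(node, attrs); err != nil {
        return err
    }
    // At container start, fetch the runtime options produced by the device plugins.
    pod := attrs.Pod
    for i := range pod.Spec.Containers {
        opts := dm.GetDeviceRunContainerOptions(pod, &pod.Spec.Containers[i])
        _ = opts // devices, mounts and env vars to pass to the container runtime
    }
    return dm.Stop()
}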
// DeviceRunContainerOptions contains the combined container runtime settings to consume its allocated devices.

View File

@ -217,7 +217,8 @@ type Builder func(kubeCfg *kubeletconfiginternal.KubeletConfiguration,
registerSchedulable bool,
nonMasqueradeCIDR string,
keepTerminatedPodVolumes bool,
nodeLabels map[string]string) (Bootstrap, error)
nodeLabels map[string]string,
seccompProfileRoot string) (Bootstrap, error)
// Dependencies is a bin for things we might consider "injected dependencies" -- objects constructed
// at runtime that are necessary for running the Kubelet. This is a temporary solution for grouping
@ -343,7 +344,8 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration,
registerSchedulable bool,
nonMasqueradeCIDR string,
keepTerminatedPodVolumes bool,
nodeLabels map[string]string) (*Kubelet, error) {
nodeLabels map[string]string,
seccompProfileRoot string) (*Kubelet, error) {
if rootDirectory == "" {
return nil, fmt.Errorf("invalid root directory %q", rootDirectory)
}
@ -657,7 +659,7 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration,
runtime, err := kuberuntime.NewKubeGenericRuntimeManager(
kubecontainer.FilterEventRecorder(kubeDeps.Recorder),
klet.livenessManager,
kubeCfg.SeccompProfileRoot,
seccompProfileRoot,
containerRefManager,
machineInfo,
klet,

View File

@ -1136,6 +1136,9 @@ func describePersistentVolume(pv *api.PersistentVolume, events *api.EventList) (
}
w.Write(LEVEL_0, "Reclaim Policy:\t%v\n", pv.Spec.PersistentVolumeReclaimPolicy)
w.Write(LEVEL_0, "Access Modes:\t%s\n", helper.GetAccessModesAsString(pv.Spec.AccessModes))
if pv.Spec.VolumeMode != nil {
w.Write(LEVEL_0, "VolumeMode:\t%v\n", *pv.Spec.VolumeMode)
}
storage := pv.Spec.Capacity[api.ResourceStorage]
w.Write(LEVEL_0, "Capacity:\t%s\n", storage.String())
w.Write(LEVEL_0, "Message:\t%s\n", pv.Status.Message)
@ -1235,6 +1238,9 @@ func describePersistentVolumeClaim(pvc *api.PersistentVolumeClaim, events *api.E
}
w.Write(LEVEL_0, "Capacity:\t%s\n", capacity)
w.Write(LEVEL_0, "Access Modes:\t%s\n", accessModes)
if pvc.Spec.VolumeMode != nil {
w.Write(LEVEL_0, "VolumeMode:\t%v\n", *pvc.Spec.VolumeMode)
}
if events != nil {
DescribeEvents(events, w)
}
@ -1365,6 +1371,7 @@ func describeContainerProbe(container api.Container, w PrefixWriter) {
}
func describeContainerVolumes(container api.Container, w PrefixWriter) {
// Show volumeMounts
none := ""
if len(container.VolumeMounts) == 0 {
none = "\t<none>"
@ -1383,6 +1390,14 @@ func describeContainerVolumes(container api.Container, w PrefixWriter) {
}
w.Write(LEVEL_3, "%s from %s (%s)\n", mount.MountPath, mount.Name, strings.Join(flags, ","))
}
// Show volumeDevices if exists
if len(container.VolumeDevices) > 0 {
w.Write(LEVEL_2, "Devices:%s\n", none)
sort.Sort(SortableVolumeDevices(container.VolumeDevices))
for _, device := range container.VolumeDevices {
w.Write(LEVEL_3, "%s from %s\n", device.DevicePath, device.Name)
}
}
}
func describeContainerEnvVars(container api.Container, resolverFn EnvVarResolverFunc, w PrefixWriter) {
@ -3803,6 +3818,20 @@ func (list SortableVolumeMounts) Less(i, j int) bool {
return list[i].MountPath < list[j].MountPath
}
type SortableVolumeDevices []api.VolumeDevice
func (list SortableVolumeDevices) Len() int {
return len(list)
}
func (list SortableVolumeDevices) Swap(i, j int) {
list[i], list[j] = list[j], list[i]
}
func (list SortableVolumeDevices) Less(i, j int) bool {
return list[i].DevicePath < list[j].DevicePath
}
// TODO: get rid of this and plumb the caller correctly
func versionedExtensionsClientV1beta1(internalClient clientset.Interface) clientextensionsv1beta1.ExtensionsV1beta1Interface {
if internalClient == nil {

View File

@ -634,6 +634,50 @@ func TestDescribeContainers(t *testing.T) {
},
expectedElements: []string{"cpu", "1k", "memory", "4G", "storage", "20G"},
},
// volumeMounts read/write
{
container: api.Container{
Name: "test",
Image: "image",
VolumeMounts: []api.VolumeMount{
{
Name: "mounted-volume",
MountPath: "/opt/",
},
},
},
expectedElements: []string{"mounted-volume", "/opt/", "(rw)"},
},
// volumeMounts readonly
{
container: api.Container{
Name: "test",
Image: "image",
VolumeMounts: []api.VolumeMount{
{
Name: "mounted-volume",
MountPath: "/opt/",
ReadOnly: true,
},
},
},
expectedElements: []string{"Mounts", "mounted-volume", "/opt/", "(ro)"},
},
// volumeDevices
{
container: api.Container{
Name: "test",
Image: "image",
VolumeDevices: []api.VolumeDevice{
{
Name: "volume-device",
DevicePath: "/dev/xvda",
},
},
},
expectedElements: []string{"Devices", "volume-device", "/dev/xvda"},
},
}
for i, testCase := range testCases {
@ -815,99 +859,237 @@ func TestGetPodsTotalRequests(t *testing.T) {
}
func TestPersistentVolumeDescriber(t *testing.T) {
tests := map[string]*api.PersistentVolume{
"hostpath": {
ObjectMeta: metav1.ObjectMeta{Name: "bar"},
Spec: api.PersistentVolumeSpec{
PersistentVolumeSource: api.PersistentVolumeSource{
HostPath: &api.HostPathVolumeSource{Type: new(api.HostPathType)},
block := api.PersistentVolumeBlock
file := api.PersistentVolumeFilesystem
testCases := []struct {
plugin string
pv *api.PersistentVolume
expectedElements []string
unexpectedElements []string
}{
{
plugin: "hostpath",
pv: &api.PersistentVolume{
ObjectMeta: metav1.ObjectMeta{Name: "bar"},
Spec: api.PersistentVolumeSpec{
PersistentVolumeSource: api.PersistentVolumeSource{
HostPath: &api.HostPathVolumeSource{Type: new(api.HostPathType)},
},
},
},
unexpectedElements: []string{"VolumeMode", "Filesystem"},
},
"gce": {
ObjectMeta: metav1.ObjectMeta{Name: "bar"},
Spec: api.PersistentVolumeSpec{
PersistentVolumeSource: api.PersistentVolumeSource{
GCEPersistentDisk: &api.GCEPersistentDiskVolumeSource{},
{
plugin: "gce",
pv: &api.PersistentVolume{
ObjectMeta: metav1.ObjectMeta{Name: "bar"},
Spec: api.PersistentVolumeSpec{
PersistentVolumeSource: api.PersistentVolumeSource{
GCEPersistentDisk: &api.GCEPersistentDiskVolumeSource{},
},
VolumeMode: &file,
},
},
expectedElements: []string{"VolumeMode", "Filesystem"},
},
"ebs": {
ObjectMeta: metav1.ObjectMeta{Name: "bar"},
Spec: api.PersistentVolumeSpec{
PersistentVolumeSource: api.PersistentVolumeSource{
AWSElasticBlockStore: &api.AWSElasticBlockStoreVolumeSource{},
{
plugin: "ebs",
pv: &api.PersistentVolume{
ObjectMeta: metav1.ObjectMeta{Name: "bar"},
Spec: api.PersistentVolumeSpec{
PersistentVolumeSource: api.PersistentVolumeSource{
AWSElasticBlockStore: &api.AWSElasticBlockStoreVolumeSource{},
},
},
},
unexpectedElements: []string{"VolumeMode", "Filesystem"},
},
"nfs": {
ObjectMeta: metav1.ObjectMeta{Name: "bar"},
Spec: api.PersistentVolumeSpec{
PersistentVolumeSource: api.PersistentVolumeSource{
NFS: &api.NFSVolumeSource{},
{
plugin: "nfs",
pv: &api.PersistentVolume{
ObjectMeta: metav1.ObjectMeta{Name: "bar"},
Spec: api.PersistentVolumeSpec{
PersistentVolumeSource: api.PersistentVolumeSource{
NFS: &api.NFSVolumeSource{},
},
},
},
unexpectedElements: []string{"VolumeMode", "Filesystem"},
},
"iscsi": {
ObjectMeta: metav1.ObjectMeta{Name: "bar"},
Spec: api.PersistentVolumeSpec{
PersistentVolumeSource: api.PersistentVolumeSource{
ISCSI: &api.ISCSIPersistentVolumeSource{},
{
plugin: "iscsi",
pv: &api.PersistentVolume{
ObjectMeta: metav1.ObjectMeta{Name: "bar"},
Spec: api.PersistentVolumeSpec{
PersistentVolumeSource: api.PersistentVolumeSource{
ISCSI: &api.ISCSIPersistentVolumeSource{},
},
VolumeMode: &block,
},
},
expectedElements: []string{"VolumeMode", "Block"},
},
"gluster": {
ObjectMeta: metav1.ObjectMeta{Name: "bar"},
Spec: api.PersistentVolumeSpec{
PersistentVolumeSource: api.PersistentVolumeSource{
Glusterfs: &api.GlusterfsVolumeSource{},
{
plugin: "gluster",
pv: &api.PersistentVolume{
ObjectMeta: metav1.ObjectMeta{Name: "bar"},
Spec: api.PersistentVolumeSpec{
PersistentVolumeSource: api.PersistentVolumeSource{
Glusterfs: &api.GlusterfsVolumeSource{},
},
},
},
unexpectedElements: []string{"VolumeMode", "Filesystem"},
},
"rbd": {
ObjectMeta: metav1.ObjectMeta{Name: "bar"},
Spec: api.PersistentVolumeSpec{
PersistentVolumeSource: api.PersistentVolumeSource{
RBD: &api.RBDPersistentVolumeSource{},
{
plugin: "rbd",
pv: &api.PersistentVolume{
ObjectMeta: metav1.ObjectMeta{Name: "bar"},
Spec: api.PersistentVolumeSpec{
PersistentVolumeSource: api.PersistentVolumeSource{
RBD: &api.RBDPersistentVolumeSource{},
},
},
},
unexpectedElements: []string{"VolumeMode", "Filesystem"},
},
"quobyte": {
ObjectMeta: metav1.ObjectMeta{Name: "bar"},
Spec: api.PersistentVolumeSpec{
PersistentVolumeSource: api.PersistentVolumeSource{
Quobyte: &api.QuobyteVolumeSource{},
{
plugin: "quobyte",
pv: &api.PersistentVolume{
ObjectMeta: metav1.ObjectMeta{Name: "bar"},
Spec: api.PersistentVolumeSpec{
PersistentVolumeSource: api.PersistentVolumeSource{
Quobyte: &api.QuobyteVolumeSource{},
},
},
},
unexpectedElements: []string{"VolumeMode", "Filesystem"},
},
"cinder": {
ObjectMeta: metav1.ObjectMeta{Name: "bar"},
Spec: api.PersistentVolumeSpec{
PersistentVolumeSource: api.PersistentVolumeSource{
Cinder: &api.CinderVolumeSource{},
{
plugin: "cinder",
pv: &api.PersistentVolume{
ObjectMeta: metav1.ObjectMeta{Name: "bar"},
Spec: api.PersistentVolumeSpec{
PersistentVolumeSource: api.PersistentVolumeSource{
Cinder: &api.CinderVolumeSource{},
},
},
},
unexpectedElements: []string{"VolumeMode", "Filesystem"},
},
"fc": {
ObjectMeta: metav1.ObjectMeta{Name: "bar"},
Spec: api.PersistentVolumeSpec{
PersistentVolumeSource: api.PersistentVolumeSource{
FC: &api.FCVolumeSource{},
{
plugin: "fc",
pv: &api.PersistentVolume{
ObjectMeta: metav1.ObjectMeta{Name: "bar"},
Spec: api.PersistentVolumeSpec{
PersistentVolumeSource: api.PersistentVolumeSource{
FC: &api.FCVolumeSource{},
},
VolumeMode: &block,
},
},
expectedElements: []string{"VolumeMode", "Block"},
},
}
for name, pv := range tests {
fake := fake.NewSimpleClientset(pv)
for _, test := range testCases {
fake := fake.NewSimpleClientset(test.pv)
c := PersistentVolumeDescriber{fake}
str, err := c.Describe("foo", "bar", printers.DescriberSettings{ShowEvents: true})
if err != nil {
t.Errorf("Unexpected error for test %s: %v", name, err)
t.Errorf("Unexpected error for test %s: %v", test.plugin, err)
}
if str == "" {
t.Errorf("Unexpected empty string for test %s. Expected PV Describer output", name)
t.Errorf("Unexpected empty string for test %s. Expected PV Describer output", test.plugin)
}
for _, expected := range test.expectedElements {
if !strings.Contains(str, expected) {
t.Errorf("expected to find %q in output: %q", expected, str)
}
}
for _, unexpected := range test.unexpectedElements {
if strings.Contains(str, unexpected) {
t.Errorf("unexpected to find %q in output: %q", unexpected, str)
}
}
}
}
func TestPersistentVolumeClaimDescriber(t *testing.T) {
block := api.PersistentVolumeBlock
file := api.PersistentVolumeFilesystem
goldClassName := "gold"
testCases := []struct {
name string
pvc *api.PersistentVolumeClaim
expectedElements []string
unexpectedElements []string
}{
{
name: "default",
pvc: &api.PersistentVolumeClaim{
ObjectMeta: metav1.ObjectMeta{Namespace: "foo", Name: "bar"},
Spec: api.PersistentVolumeClaimSpec{
VolumeName: "volume1",
StorageClassName: &goldClassName,
},
Status: api.PersistentVolumeClaimStatus{
Phase: api.ClaimBound,
},
},
unexpectedElements: []string{"VolumeMode", "Filesystem"},
},
{
name: "filesystem",
pvc: &api.PersistentVolumeClaim{
ObjectMeta: metav1.ObjectMeta{Namespace: "foo", Name: "bar"},
Spec: api.PersistentVolumeClaimSpec{
VolumeName: "volume2",
StorageClassName: &goldClassName,
VolumeMode: &file,
},
Status: api.PersistentVolumeClaimStatus{
Phase: api.ClaimBound,
},
},
expectedElements: []string{"VolumeMode", "Filesystem"},
},
{
name: "block",
pvc: &api.PersistentVolumeClaim{
ObjectMeta: metav1.ObjectMeta{Namespace: "foo", Name: "bar"},
Spec: api.PersistentVolumeClaimSpec{
VolumeName: "volume3",
StorageClassName: &goldClassName,
VolumeMode: &block,
},
Status: api.PersistentVolumeClaimStatus{
Phase: api.ClaimBound,
},
},
expectedElements: []string{"VolumeMode", "Block"},
},
}
for _, test := range testCases {
fake := fake.NewSimpleClientset(test.pvc)
c := PersistentVolumeClaimDescriber{fake}
str, err := c.Describe("foo", "bar", printers.DescriberSettings{ShowEvents: true})
if err != nil {
t.Errorf("Unexpected error for test %s: %v", test.name, err)
}
if str == "" {
t.Errorf("Unexpected empty string for test %s. Expected PVC Describer output", test.name)
}
for _, expected := range test.expectedElements {
if !strings.Contains(str, expected) {
t.Errorf("expected to find %q in output: %q", expected, str)
}
}
for _, unexpected := range test.unexpectedElements {
if strings.Contains(str, unexpected) {
t.Errorf("unexpected to find %q in output: %q", unexpected, str)
}
}
}
}

View File

@ -76,7 +76,7 @@ func (attacher *vsphereVMDKAttacher) Attach(spec *volume.Spec, nodeName types.No
// vsphereCloud.AttachDisk checks if disk is already attached to host and
// succeeds in that case, so no need to do that separately.
diskUUID, err := attacher.vsphereVolumes.AttachDisk(volumeSource.VolumePath, volumeSource.StoragePolicyID, nodeName)
diskUUID, err := attacher.vsphereVolumes.AttachDisk(volumeSource.VolumePath, volumeSource.StoragePolicyName, nodeName)
if err != nil {
glog.Errorf("Error attaching volume %q to node %q: %+v", volumeSource.VolumePath, nodeName, err)
return "", err

View File

@ -19,6 +19,7 @@ filegroup(
"//plugin/pkg/admission/deny:all-srcs",
"//plugin/pkg/admission/eventratelimit:all-srcs",
"//plugin/pkg/admission/exec:all-srcs",
"//plugin/pkg/admission/extendedresourcetoleration:all-srcs",
"//plugin/pkg/admission/gc:all-srcs",
"//plugin/pkg/admission/imagepolicy:all-srcs",
"//plugin/pkg/admission/initialresources:all-srcs",

View File

@ -0,0 +1,42 @@
load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
go_library(
name = "go_default_library",
srcs = ["admission.go"],
importpath = "k8s.io/kubernetes/plugin/pkg/admission/extendedresourcetoleration",
visibility = ["//visibility:public"],
deps = [
"//pkg/apis/core:go_default_library",
"//pkg/apis/core/helper:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/api/errors:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/sets:go_default_library",
"//vendor/k8s.io/apiserver/pkg/admission:go_default_library",
],
)
go_test(
name = "go_default_test",
srcs = ["admission_test.go"],
importpath = "k8s.io/kubernetes/plugin/pkg/admission/extendedresourcetoleration",
library = ":go_default_library",
deps = [
"//pkg/apis/core:go_default_library",
"//pkg/apis/core/helper:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library",
"//vendor/k8s.io/apiserver/pkg/admission:go_default_library",
],
)
filegroup(
name = "package-srcs",
srcs = glob(["**"]),
tags = ["automanaged"],
visibility = ["//visibility:private"],
)
filegroup(
name = "all-srcs",
srcs = [":package-srcs"],
tags = ["automanaged"],
visibility = ["//visibility:public"],
)

View File

@ -0,0 +1,94 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package extendedresourcetoleration
import (
"fmt"
"io"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apiserver/pkg/admission"
"k8s.io/kubernetes/pkg/apis/core"
"k8s.io/kubernetes/pkg/apis/core/helper"
)
// Register is called by the apiserver to register the plugin factory.
func Register(plugins *admission.Plugins) {
plugins.Register("ExtendedResourceToleration", func(config io.Reader) (admission.Interface, error) {
return newExtendedResourceToleration(), nil
})
}
// newExtendedResourceToleration creates a new instance of the ExtendedResourceToleration admission controller.
func newExtendedResourceToleration() *plugin {
return &plugin{
Handler: admission.NewHandler(admission.Create, admission.Update),
}
}
// Make sure we are implementing the interface.
var _ admission.MutationInterface = &plugin{}
type plugin struct {
*admission.Handler
}
// Admit updates the toleration of a pod based on the resources requested by it.
// If an extended resource of name "example.com/device" is requested, it adds
// a toleration with key "example.com/device", operator "Exists" and effect "NoSchedule".
// The rationale for this is described in:
// https://github.com/kubernetes/kubernetes/issues/55080
func (p *plugin) Admit(attributes admission.Attributes) error {
// Ignore all calls to subresources or resources other than pods.
if len(attributes.GetSubresource()) != 0 || attributes.GetResource().GroupResource() != core.Resource("pods") {
return nil
}
pod, ok := attributes.GetObject().(*core.Pod)
if !ok {
return errors.NewBadRequest(fmt.Sprintf("expected *core.Pod but got %T", attributes.GetObject()))
}
resources := sets.String{}
for _, container := range pod.Spec.Containers {
for resourceName := range container.Resources.Requests {
if helper.IsExtendedResourceName(resourceName) {
resources.Insert(string(resourceName))
}
}
}
for _, container := range pod.Spec.InitContainers {
for resourceName := range container.Resources.Requests {
if helper.IsExtendedResourceName(resourceName) {
resources.Insert(string(resourceName))
}
}
}
// Doing .List() so that we get a stable sorted list.
// This allows us to test adding tolerations for multiple extended resources.
for _, resource := range resources.List() {
helper.AddOrUpdateTolerationInPod(pod, &core.Toleration{
Key: resource,
Operator: core.TolerationOpExists,
Effect: core.TaintEffectNoSchedule,
})
}
return nil
}
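For illustration, a minimal sketch of what Admit does to a pod that requests a hypothetical extended resource; the resource name, namespace, and pod name are made up, and the snippet assumes the same imports the test file below uses (core, resource, admission).
// Illustrative only: a pod whose single container requests one unit of a
// hypothetical extended resource.
pod := &core.Pod{
    Spec: core.PodSpec{
        Containers: []core.Container{{
            Resources: core.ResourceRequirements{
                Requests: core.ResourceList{
                    core.ResourceName("example.com/device"): *resource.NewQuantity(1, resource.DecimalSI),
                },
            },
        }},
    },
}
attrs := admission.NewAttributesRecord(pod, nil, core.Kind("Pod").WithVersion("version"),
    "default", "example-pod", core.Resource("pods").WithVersion("version"), "", admission.Create, nil)
if err := newExtendedResourceToleration().Admit(attrs); err != nil {
    panic(err) // not expected for a well-formed pod
}
// pod.Spec.Tolerations now contains:
//   {Key: "example.com/device", Operator: Exists, Effect: NoSchedule}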

View File

@ -0,0 +1,382 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package extendedresourcetoleration
import (
"testing"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apiserver/pkg/admission"
"k8s.io/kubernetes/pkg/apis/core"
"k8s.io/kubernetes/pkg/apis/core/helper"
)
func TestAdmit(t *testing.T) {
plugin := newExtendedResourceToleration()
containerRequestingCPU := core.Container{
Resources: core.ResourceRequirements{
Requests: core.ResourceList{
core.ResourceCPU: *resource.NewQuantity(2, resource.DecimalSI),
},
},
}
containerRequestingMemory := core.Container{
Resources: core.ResourceRequirements{
Requests: core.ResourceList{
core.ResourceMemory: *resource.NewQuantity(2048, resource.DecimalSI),
},
},
}
extendedResource1 := "example.com/device-ek"
extendedResource2 := "example.com/device-do"
containerRequestingExtendedResource1 := core.Container{
Resources: core.ResourceRequirements{
Requests: core.ResourceList{
core.ResourceName(extendedResource1): *resource.NewQuantity(1, resource.DecimalSI),
},
},
}
containerRequestingExtendedResource2 := core.Container{
Resources: core.ResourceRequirements{
Requests: core.ResourceList{
core.ResourceName(extendedResource2): *resource.NewQuantity(2, resource.DecimalSI),
},
},
}
tests := []struct {
description string
requestedPod core.Pod
expectedPod core.Pod
}{
{
description: "empty pod without any extended resources, expect no change in tolerations",
requestedPod: core.Pod{
Spec: core.PodSpec{},
},
expectedPod: core.Pod{
Spec: core.PodSpec{},
},
},
{
description: "pod with container without any extended resources, expect no change in tolerations",
requestedPod: core.Pod{
Spec: core.PodSpec{
Containers: []core.Container{
containerRequestingCPU,
},
},
},
expectedPod: core.Pod{
Spec: core.PodSpec{
Containers: []core.Container{
containerRequestingCPU,
},
},
},
},
{
description: "pod with init container without any extended resources, expect no change in tolerations",
requestedPod: core.Pod{
Spec: core.PodSpec{
InitContainers: []core.Container{
containerRequestingMemory,
},
},
},
expectedPod: core.Pod{
Spec: core.PodSpec{
InitContainers: []core.Container{
containerRequestingMemory,
},
},
},
},
{
description: "pod with container with extended resource, expect toleration to be added",
requestedPod: core.Pod{
Spec: core.PodSpec{
Containers: []core.Container{
containerRequestingExtendedResource1,
},
},
},
expectedPod: core.Pod{
Spec: core.PodSpec{
Containers: []core.Container{
containerRequestingExtendedResource1,
},
Tolerations: []core.Toleration{
{
Key: extendedResource1,
Operator: core.TolerationOpExists,
Effect: core.TaintEffectNoSchedule,
},
},
},
},
},
{
description: "pod with init container with extended resource, expect toleration to be added",
requestedPod: core.Pod{
Spec: core.PodSpec{
InitContainers: []core.Container{
containerRequestingExtendedResource2,
},
},
},
expectedPod: core.Pod{
Spec: core.PodSpec{
InitContainers: []core.Container{
containerRequestingExtendedResource2,
},
Tolerations: []core.Toleration{
{
Key: extendedResource2,
Operator: core.TolerationOpExists,
Effect: core.TaintEffectNoSchedule,
},
},
},
},
},
{
description: "pod with existing tolerations and container with extended resource, expect existing tolerations to be preserved and new toleration to be added",
requestedPod: core.Pod{
Spec: core.PodSpec{
Containers: []core.Container{
containerRequestingCPU,
containerRequestingExtendedResource1,
},
Tolerations: []core.Toleration{
{
Key: "foo",
Operator: core.TolerationOpEqual,
Value: "bar",
Effect: core.TaintEffectNoSchedule,
},
},
},
},
expectedPod: core.Pod{
Spec: core.PodSpec{
Containers: []core.Container{
containerRequestingCPU,
containerRequestingExtendedResource1,
},
Tolerations: []core.Toleration{
{
Key: "foo",
Operator: core.TolerationOpEqual,
Value: "bar",
Effect: core.TaintEffectNoSchedule,
},
{
Key: extendedResource1,
Operator: core.TolerationOpExists,
Effect: core.TaintEffectNoSchedule,
},
},
},
},
},
{
description: "pod with multiple extended resources, expect multiple tolerations to be added",
requestedPod: core.Pod{
Spec: core.PodSpec{
Containers: []core.Container{
containerRequestingMemory,
containerRequestingExtendedResource1,
},
InitContainers: []core.Container{
containerRequestingCPU,
containerRequestingExtendedResource2,
},
},
},
expectedPod: core.Pod{
Spec: core.PodSpec{
Containers: []core.Container{
containerRequestingMemory,
containerRequestingExtendedResource1,
},
InitContainers: []core.Container{
containerRequestingCPU,
containerRequestingExtendedResource2,
},
Tolerations: []core.Toleration{
// Note the order, it's sorted by the Key
{
Key: extendedResource2,
Operator: core.TolerationOpExists,
Effect: core.TaintEffectNoSchedule,
},
{
Key: extendedResource1,
Operator: core.TolerationOpExists,
Effect: core.TaintEffectNoSchedule,
},
},
},
},
},
{
description: "pod with container requesting extended resource and existing correct toleration, expect no change in tolerations",
requestedPod: core.Pod{
Spec: core.PodSpec{
Containers: []core.Container{
containerRequestingCPU,
containerRequestingMemory,
containerRequestingExtendedResource1,
},
Tolerations: []core.Toleration{
{
Key: extendedResource1,
Operator: core.TolerationOpExists,
Effect: core.TaintEffectNoSchedule,
},
},
},
},
expectedPod: core.Pod{
Spec: core.PodSpec{
Containers: []core.Container{
containerRequestingCPU,
containerRequestingMemory,
containerRequestingExtendedResource1,
},
Tolerations: []core.Toleration{
{
Key: extendedResource1,
Operator: core.TolerationOpExists,
Effect: core.TaintEffectNoSchedule,
},
},
},
},
},
{
description: "pod with container requesting extended resource and existing toleration with the same key but different effect and value, expect existing tolerations to be preserved and new toleration to be added",
requestedPod: core.Pod{
Spec: core.PodSpec{
Containers: []core.Container{
containerRequestingCPU,
containerRequestingMemory,
containerRequestingExtendedResource1,
},
Tolerations: []core.Toleration{
{
Key: extendedResource1,
Operator: core.TolerationOpEqual,
Value: "foo",
Effect: core.TaintEffectNoExecute,
},
},
},
},
expectedPod: core.Pod{
Spec: core.PodSpec{
Containers: []core.Container{
containerRequestingCPU,
containerRequestingMemory,
containerRequestingExtendedResource1,
},
Tolerations: []core.Toleration{
{
Key: extendedResource1,
Operator: core.TolerationOpEqual,
Value: "foo",
Effect: core.TaintEffectNoExecute,
},
{
Key: extendedResource1,
Operator: core.TolerationOpExists,
Effect: core.TaintEffectNoSchedule,
},
},
},
},
},
{
description: "pod with wildcard toleration and container requesting extended resource, expect existing tolerations to be preserved and new toleration to be added",
requestedPod: core.Pod{
Spec: core.PodSpec{
Containers: []core.Container{
containerRequestingCPU,
containerRequestingMemory,
containerRequestingExtendedResource1,
},
Tolerations: []core.Toleration{
{
Operator: core.TolerationOpExists,
},
},
},
},
expectedPod: core.Pod{
Spec: core.PodSpec{
Containers: []core.Container{
containerRequestingCPU,
containerRequestingMemory,
containerRequestingExtendedResource1,
},
Tolerations: []core.Toleration{
{
Operator: core.TolerationOpExists,
},
{
Key: extendedResource1,
Operator: core.TolerationOpExists,
Effect: core.TaintEffectNoSchedule,
},
},
},
},
},
}
for i, test := range tests {
err := plugin.Admit(admission.NewAttributesRecord(&test.requestedPod, nil, core.Kind("Pod").WithVersion("version"), "foo", "name", core.Resource("pods").WithVersion("version"), "", "ignored", nil))
if err != nil {
t.Errorf("[%d: %s] unexpected error %v for pod %+v", i, test.description, err, test.requestedPod)
}
if !helper.Semantic.DeepEqual(test.expectedPod.Spec.Tolerations, test.requestedPod.Spec.Tolerations) {
t.Errorf("[%d: %s] expected %#v got %#v", i, test.description, test.expectedPod.Spec.Tolerations, test.requestedPod.Spec.Tolerations)
}
}
}
func TestHandles(t *testing.T) {
plugin := newExtendedResourceToleration()
tests := map[admission.Operation]bool{
admission.Create: true,
admission.Update: true,
admission.Delete: false,
admission.Connect: false,
}
for op, expected := range tests {
result := plugin.Handles(op)
if result != expected {
t.Errorf("Unexpected result for operation %s: %v\n", op, result)
}
}
}

View File

@ -169,7 +169,7 @@ func ClusterRoles() []rbac.ClusterRole {
ObjectMeta: metav1.ObjectMeta{Name: "system:basic-user"},
Rules: []rbac.PolicyRule{
// TODO add future selfsubjectrulesreview, project request APIs, project listing APIs
rbac.NewRule("create").Groups(authorizationGroup).Resources("selfsubjectaccessreviews").RuleOrDie(),
rbac.NewRule("create").Groups(authorizationGroup).Resources("selfsubjectaccessreviews", "selfsubjectrulesreviews").RuleOrDie(),
},
},

View File

@ -522,6 +522,7 @@ items:
- authorization.k8s.io
resources:
- selfsubjectaccessreviews
- selfsubjectrulesreviews
verbs:
- create
- apiVersion: rbac.authorization.k8s.io/v1
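Because system:basic-user can now create selfsubjectrulesreviews, any authenticated client may ask which actions it is allowed to perform in a namespace. A hedged client-go sketch (the package and function names are hypothetical; it assumes a client-go release of this era with the pre-context Create signature and SelfSubjectRulesReview available in authorization.k8s.io/v1):
package example // hypothetical package

import (
    "fmt"

    authorizationv1 "k8s.io/api/authorization/v1"
    "k8s.io/client-go/kubernetes"
)

// printMyRules asks the API server which verbs the current caller may use in
// the given namespace.
func printMyRules(client kubernetes.Interface, namespace string) error {
    review := &authorizationv1.SelfSubjectRulesReview{
        Spec: authorizationv1.SelfSubjectRulesReviewSpec{Namespace: namespace},
    }
    // Create fills in Status with the rules the caller holds in that namespace.
    result, err := client.AuthorizationV1().SelfSubjectRulesReviews().Create(review)
    if err != nil {
        return err
    }
    for _, rule := range result.Status.ResourceRules {
        fmt.Printf("verbs=%v apiGroups=%v resources=%v\n", rule.Verbs, rule.APIGroups, rule.Resources)
    }
    return nil
}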

View File

@ -67,7 +67,11 @@ func NewAdmissionOptions() *AdmissionOptions {
// AddFlags adds flags related to admission for a specific APIServer to the specified FlagSet
func (a *AdmissionOptions) AddFlags(fs *pflag.FlagSet) {
fs.StringSliceVar(&a.PluginNames, "admission-control", a.PluginNames, ""+
"Ordered list of plug-ins to do admission control of resources into cluster. "+
"Admission is divided into two phases. "+
"In the first phase, only mutating admission plugins run. "+
"In the second phase, only validating admission plugins run. "+
"The names in the below list may represent a validating plugin, a mutating plugin, or both. "+
"Within each phase, the plugins will run in the order in which they are passed to this flag. "+
"Comma-delimited list of: "+strings.Join(a.Plugins.Registered(), ", ")+".")
fs.StringVar(&a.ConfigFile, "admission-control-config-file", a.ConfigFile,

View File

@ -69,7 +69,6 @@ go_library(
"//vendor/k8s.io/apimachinery/pkg/util/wait:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/watch:go_default_library",
"//vendor/k8s.io/client-go/kubernetes:go_default_library",
"//vendor/k8s.io/client-go/kubernetes/typed/extensions/v1beta1:go_default_library",
"//vendor/k8s.io/client-go/tools/cache:go_default_library",
],
)

View File

@ -35,7 +35,6 @@ import (
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/apimachinery/pkg/watch"
clientset "k8s.io/client-go/kubernetes"
extensionsclient "k8s.io/client-go/kubernetes/typed/extensions/v1beta1"
extensionsinternal "k8s.io/kubernetes/pkg/apis/extensions"
"k8s.io/kubernetes/pkg/client/clientset_generated/internalclientset"
deploymentutil "k8s.io/kubernetes/pkg/controller/deployment/util"
@ -87,10 +86,6 @@ var _ = SIGDescribe("Deployment", func() {
It("deployment should support rollback", func() {
testRollbackDeployment(f)
})
It("scaled rollout deployment should not block on annotation check", func() {
testScaledRolloutDeployment(f)
})
It("iterative rollouts should eventually progress", func() {
testIterativeDeployments(f)
})
@ -621,159 +616,6 @@ func testRollbackDeployment(f *framework.Framework) {
Expect(err).NotTo(HaveOccurred())
}
func testScaledRolloutDeployment(f *framework.Framework) {
ns := f.Namespace.Name
c := f.ClientSet
podLabels := map[string]string{"name": NginxImageName}
replicas := int32(10)
// Create a nginx deployment.
deploymentName := "nginx"
d := framework.NewDeployment(deploymentName, replicas, podLabels, NginxImageName, NginxImage, extensions.RollingUpdateDeploymentStrategyType)
d.Spec.Strategy.RollingUpdate = new(extensions.RollingUpdateDeployment)
d.Spec.Strategy.RollingUpdate.MaxSurge = intOrStrP(3)
d.Spec.Strategy.RollingUpdate.MaxUnavailable = intOrStrP(2)
framework.Logf("Creating deployment %q", deploymentName)
deployment, err := c.ExtensionsV1beta1().Deployments(ns).Create(d)
Expect(err).NotTo(HaveOccurred())
framework.Logf("Waiting for observed generation %d", deployment.Generation)
Expect(framework.WaitForObservedDeployment(c, ns, deploymentName, deployment.Generation)).NotTo(HaveOccurred())
// Verify that the required pods have come up.
framework.Logf("Waiting for all required pods to come up")
err = framework.VerifyPodsRunning(f.ClientSet, ns, NginxImageName, false, *(deployment.Spec.Replicas))
Expect(err).NotTo(HaveOccurred(), "error in waiting for pods to come up: %v", err)
framework.Logf("Waiting for deployment %q to complete", deployment.Name)
Expect(framework.WaitForDeploymentComplete(c, deployment)).NotTo(HaveOccurred())
first, err := deploymentutil.GetNewReplicaSet(deployment, c.ExtensionsV1beta1())
Expect(err).NotTo(HaveOccurred())
// Update the deployment with a non-existent image so that the new replica set will be blocked.
framework.Logf("Updating deployment %q with a non-existent image", deploymentName)
deployment, err = framework.UpdateDeploymentWithRetries(c, ns, d.Name, func(update *extensions.Deployment) {
update.Spec.Template.Spec.Containers[0].Image = "nginx:404"
})
Expect(err).NotTo(HaveOccurred())
framework.Logf("Waiting for observed generation %d", deployment.Generation)
err = framework.WaitForObservedDeployment(c, ns, deploymentName, deployment.Generation)
Expect(err).NotTo(HaveOccurred())
deployment, err = c.ExtensionsV1beta1().Deployments(ns).Get(deploymentName, metav1.GetOptions{})
Expect(err).NotTo(HaveOccurred())
if deployment.Status.AvailableReplicas < deploymentutil.MinAvailable(deployment) {
Expect(fmt.Errorf("Observed %d available replicas, less than min required %d", deployment.Status.AvailableReplicas, deploymentutil.MinAvailable(deployment))).NotTo(HaveOccurred())
}
framework.Logf("Checking that the replica sets for %q are synced", deploymentName)
second, err := deploymentutil.GetNewReplicaSet(deployment, c.ExtensionsV1beta1())
Expect(err).NotTo(HaveOccurred())
first, err = c.ExtensionsV1beta1().ReplicaSets(first.Namespace).Get(first.Name, metav1.GetOptions{})
Expect(err).NotTo(HaveOccurred())
firstCond := replicaSetHasDesiredReplicas(c.ExtensionsV1beta1(), first)
err = wait.PollImmediate(10*time.Millisecond, 1*time.Minute, firstCond)
Expect(err).NotTo(HaveOccurred())
secondCond := replicaSetHasDesiredReplicas(c.ExtensionsV1beta1(), second)
err = wait.PollImmediate(10*time.Millisecond, 1*time.Minute, secondCond)
Expect(err).NotTo(HaveOccurred())
framework.Logf("Updating the size (up) and template at the same time for deployment %q", deploymentName)
newReplicas := int32(20)
deployment, err = framework.UpdateDeploymentWithRetries(c, ns, deployment.Name, func(update *extensions.Deployment) {
update.Spec.Replicas = &newReplicas
update.Spec.Template.Spec.Containers[0].Image = NautilusImage
})
Expect(err).NotTo(HaveOccurred())
err = framework.WaitForObservedDeployment(c, ns, deploymentName, deployment.Generation)
Expect(err).NotTo(HaveOccurred())
framework.Logf("Waiting for deployment status to sync (current available: %d, minimum available: %d)", deployment.Status.AvailableReplicas, deploymentutil.MinAvailable(deployment))
Expect(framework.WaitForDeploymentComplete(c, deployment)).NotTo(HaveOccurred())
oldRSs, _, rs, err := deploymentutil.GetAllReplicaSets(deployment, c.ExtensionsV1beta1())
Expect(err).NotTo(HaveOccurred())
for _, rs := range append(oldRSs, rs) {
framework.Logf("Ensuring replica set %q has the correct desiredReplicas annotation", rs.Name)
desired, ok := deploymentutil.GetDesiredReplicasAnnotation(rs)
if !ok || desired == *(deployment.Spec.Replicas) {
continue
}
err = fmt.Errorf("unexpected desiredReplicas annotation %d for replica set %q", desired, rs.Name)
Expect(err).NotTo(HaveOccurred())
}
// Update the deployment with a non-existent image so that the new replica set will be blocked.
framework.Logf("Updating deployment %q with a non-existent image", deploymentName)
deployment, err = framework.UpdateDeploymentWithRetries(c, ns, d.Name, func(update *extensions.Deployment) {
update.Spec.Template.Spec.Containers[0].Image = "nginx:404"
})
Expect(err).NotTo(HaveOccurred())
framework.Logf("Waiting for observed generation %d", deployment.Generation)
err = framework.WaitForObservedDeployment(c, ns, deploymentName, deployment.Generation)
Expect(err).NotTo(HaveOccurred())
deployment, err = c.ExtensionsV1beta1().Deployments(ns).Get(deploymentName, metav1.GetOptions{})
Expect(err).NotTo(HaveOccurred())
if deployment.Status.AvailableReplicas < deploymentutil.MinAvailable(deployment) {
Expect(fmt.Errorf("Observed %d available replicas, less than min required %d", deployment.Status.AvailableReplicas, deploymentutil.MinAvailable(deployment))).NotTo(HaveOccurred())
}
framework.Logf("Checking that the replica sets for %q are synced", deploymentName)
oldRs, err := c.ExtensionsV1beta1().ReplicaSets(rs.Namespace).Get(rs.Name, metav1.GetOptions{})
Expect(err).NotTo(HaveOccurred())
newRs, err := deploymentutil.GetNewReplicaSet(deployment, c.ExtensionsV1beta1())
Expect(err).NotTo(HaveOccurred())
oldCond := replicaSetHasDesiredReplicas(c.ExtensionsV1beta1(), oldRs)
err = wait.PollImmediate(10*time.Millisecond, 1*time.Minute, oldCond)
Expect(err).NotTo(HaveOccurred())
newCond := replicaSetHasDesiredReplicas(c.ExtensionsV1beta1(), newRs)
err = wait.PollImmediate(10*time.Millisecond, 1*time.Minute, newCond)
Expect(err).NotTo(HaveOccurred())
framework.Logf("Updating the size (down) and template at the same time for deployment %q", deploymentName)
newReplicas = int32(5)
deployment, err = framework.UpdateDeploymentWithRetries(c, ns, deployment.Name, func(update *extensions.Deployment) {
update.Spec.Replicas = &newReplicas
update.Spec.Template.Spec.Containers[0].Image = KittenImage
})
Expect(err).NotTo(HaveOccurred())
err = framework.WaitForObservedDeployment(c, ns, deploymentName, deployment.Generation)
Expect(err).NotTo(HaveOccurred())
framework.Logf("Waiting for deployment status to sync (current available: %d, minimum available: %d)", deployment.Status.AvailableReplicas, deploymentutil.MinAvailable(deployment))
Expect(framework.WaitForDeploymentComplete(c, deployment)).NotTo(HaveOccurred())
oldRSs, _, rs, err = deploymentutil.GetAllReplicaSets(deployment, c.ExtensionsV1beta1())
Expect(err).NotTo(HaveOccurred())
for _, rs := range append(oldRSs, rs) {
framework.Logf("Ensuring replica set %q has the correct desiredReplicas annotation", rs.Name)
desired, ok := deploymentutil.GetDesiredReplicasAnnotation(rs)
if !ok || desired == *(deployment.Spec.Replicas) {
continue
}
err = fmt.Errorf("unexpected desiredReplicas annotation %d for replica set %q", desired, rs.Name)
Expect(err).NotTo(HaveOccurred())
}
}
func randomScale(d *extensions.Deployment, i int) {
switch r := rand.Float32(); {
case r < 0.3:
@ -904,17 +746,6 @@ func testIterativeDeployments(f *framework.Framework) {
Expect(framework.WaitForDeploymentWithCondition(c, ns, deploymentName, deploymentutil.NewRSAvailableReason, extensions.DeploymentProgressing)).NotTo(HaveOccurred())
}
func replicaSetHasDesiredReplicas(rsClient extensionsclient.ReplicaSetsGetter, replicaSet *extensions.ReplicaSet) wait.ConditionFunc {
desiredGeneration := replicaSet.Generation
return func() (bool, error) {
rs, err := rsClient.ReplicaSets(replicaSet.Namespace).Get(replicaSet.Name, metav1.GetOptions{})
if err != nil {
return false, err
}
return rs.Status.ObservedGeneration >= desiredGeneration && rs.Status.Replicas == *(rs.Spec.Replicas), nil
}
}
func testDeploymentsControllerRef(f *framework.Framework) {
ns := f.Namespace.Name
c := f.ClientSet
@ -954,16 +785,6 @@ func testDeploymentsControllerRef(f *framework.Framework) {
Expect(err).NotTo(HaveOccurred())
}
func waitDeploymentReplicaSetsControllerRef(c clientset.Interface, ns string, uid types.UID, label map[string]string) func() (bool, error) {
return func() (bool, error) {
err := checkDeploymentReplicaSetsControllerRef(c, ns, uid, label)
if err != nil {
return false, nil
}
return true, nil
}
}
func checkDeploymentReplicaSetsControllerRef(c clientset.Interface, ns string, uid types.UID, label map[string]string) error {
rsList := listDeploymentReplicaSets(c, ns, label)
for _, rs := range rsList.Items {

View File

@ -21,6 +21,8 @@ import (
"fmt"
"path"
"path/filepath"
"strconv"
"strings"
"time"
. "github.com/onsi/ginkgo"
@ -54,8 +56,22 @@ const (
DirectoryLocalVolumeType LocalVolumeType = "dir"
// creates a tmpfs and mounts it
TmpfsLocalVolumeType LocalVolumeType = "tmpfs"
// tests based on local ssd at /mnt/disks/by-uuid/
GCELocalSSDVolumeType LocalVolumeType = "gce-localssd-scsi-fs"
)
var setupLocalVolumeMap = map[LocalVolumeType]func(*localTestConfig) *localTestVolume{
GCELocalSSDVolumeType: setupLocalVolumeGCELocalSSD,
TmpfsLocalVolumeType: setupLocalVolumeTmpfs,
DirectoryLocalVolumeType: setupLocalVolumeDirectory,
}
var cleanupLocalVolumeMap = map[LocalVolumeType]func(*localTestConfig, *localTestVolume){
GCELocalSSDVolumeType: cleanupLocalVolumeGCELocalSSD,
TmpfsLocalVolumeType: cleanupLocalVolumeTmpfs,
DirectoryLocalVolumeType: cleanupLocalVolumeDirectory,
}
type localTestVolume struct {
// Node that the volume is on
node *v1.Node
@ -199,105 +215,100 @@ var _ = SIGDescribe("PersistentVolumes-local [Feature:LocalPersistentVolumes] [S
})
})
LocalVolumeTypes := []LocalVolumeType{DirectoryLocalVolumeType, TmpfsLocalVolumeType}
Context("when two pods mount a local volume at the same time", func() {
It("should be able to write from pod1 and read from pod2", func() {
for _, testVolType := range LocalVolumeTypes {
var testVol *localTestVolume
By(fmt.Sprintf("local-volume-type: %s", testVolType))
testVol = setupLocalVolumePVCPV(config, testVolType)
twoPodsReadWriteTest(config, testVol)
cleanupLocalVolume(config, testVol)
}
})
})
Context("when two pods mount a local volume one after the other", func() {
It("should be able to write from pod1 and read from pod2", func() {
for _, testVolType := range LocalVolumeTypes {
var testVol *localTestVolume
By(fmt.Sprintf("local-volume-type: %s", testVolType))
testVol = setupLocalVolumePVCPV(config, testVolType)
twoPodsReadWriteSerialTest(config, testVol)
cleanupLocalVolume(config, testVol)
}
})
})
Context("when pod using local volume with non-existant path", func() {
ep := &eventPatterns{
reason: "FailedMount",
pattern: make([]string, 2)}
ep.pattern = append(ep.pattern, "MountVolume.SetUp failed")
ep.pattern = append(ep.pattern, "does not exist")
It("should not be able to mount", func() {
for _, testVolType := range LocalVolumeTypes {
By(fmt.Sprintf("local-volume-type: %s", testVolType))
testVol := &localTestVolume{
node: config.node0,
hostDir: "/non-existent/location/nowhere",
localVolumeType: testVolType,
LocalVolumeTypes := []LocalVolumeType{DirectoryLocalVolumeType, TmpfsLocalVolumeType, GCELocalSSDVolumeType}
for _, tempTestVolType := range LocalVolumeTypes {
// New variable required for ginkgo test closures
testVolType := tempTestVolType
ctxString := fmt.Sprintf("when using volume type %s", testVolType)
Context(ctxString, func() {
BeforeEach(func() {
if testVolType == GCELocalSSDVolumeType {
SkipUnlessLocalSSDExists("scsi", "fs", config.node0)
}
By("Creating local PVC and PV")
createLocalPVCPV(config, testVol)
pod, err := createLocalPod(config, testVol)
Expect(err).To(HaveOccurred())
checkPodEvents(config, pod.Name, ep)
}
})
})
Context("when pod's node is different from PV's NodeAffinity", func() {
BeforeEach(func() {
if len(config.nodes.Items) < 2 {
framework.Skipf("Runs only when number of nodes >= 2")
}
})
ep := &eventPatterns{
reason: "FailedScheduling",
pattern: make([]string, 2)}
ep.pattern = append(ep.pattern, "MatchNodeSelector")
ep.pattern = append(ep.pattern, "NoVolumeNodeConflict")
for _, testVolType := range LocalVolumeTypes {
It("should not be able to mount due to different NodeAffinity", func() {
testPodWithNodeName(config, testVolType, ep, config.nodes.Items[1].Name, makeLocalPodWithNodeAffinity)
})
It("should not be able to mount due to different NodeSelector", func() {
Context("when two pods mount a local volume at the same time", func() {
It("should be able to write from pod1 and read from pod2", func() {
var testVol *localTestVolume
testVol = setupLocalVolumePVCPV(config, testVolType)
twoPodsReadWriteTest(config, testVol)
cleanupLocalVolume(config, testVol)
})
})
Context("when two pods mount a local volume one after the other", func() {
It("should be able to write from pod1 and read from pod2", func() {
var testVol *localTestVolume
testVol = setupLocalVolumePVCPV(config, testVolType)
twoPodsReadWriteSerialTest(config, testVol)
cleanupLocalVolume(config, testVol)
})
})
Context("when pod using local volume with non-existant path", func() {
ep := &eventPatterns{
reason: "FailedMount",
pattern: make([]string, 2)}
ep.pattern = append(ep.pattern, "MountVolume.SetUp failed")
ep.pattern = append(ep.pattern, "does not exist")
It("should not be able to mount", func() {
testVol := &localTestVolume{
node: config.node0,
hostDir: "/non-existent/location/nowhere",
localVolumeType: testVolType,
}
By("Creating local PVC and PV")
createLocalPVCPV(config, testVol)
pod, err := createLocalPod(config, testVol)
Expect(err).To(HaveOccurred())
checkPodEvents(config, pod.Name, ep)
})
})
Context("when pod's node is different from PV's NodeAffinity", func() {
BeforeEach(func() {
if len(config.nodes.Items) < 2 {
framework.Skipf("Runs only when number of nodes >= 2")
}
})
ep := &eventPatterns{
reason: "FailedScheduling",
pattern: make([]string, 2)}
ep.pattern = append(ep.pattern, "MatchNodeSelector")
ep.pattern = append(ep.pattern, "NoVolumeNodeConflict")
It("should not be able to mount due to different NodeAffinity", func() {
testPodWithNodeName(config, testVolType, ep, config.nodes.Items[1].Name, makeLocalPodWithNodeAffinity)
})
It("should not be able to mount due to different NodeSelector", func() {
testPodWithNodeName(config, testVolType, ep, config.nodes.Items[1].Name, makeLocalPodWithNodeSelector)
})
})
Context("when pod's node is different from PV's NodeName", func() {
BeforeEach(func() {
if len(config.nodes.Items) < 2 {
framework.Skipf("Runs only when number of nodes >= 2")
}
})
ep := &eventPatterns{
reason: "FailedMount",
pattern: make([]string, 2)}
ep.pattern = append(ep.pattern, "NodeSelectorTerm")
ep.pattern = append(ep.pattern, "Storage node affinity check failed")
It("should not be able to mount due to different NodeName", func() {
testPodWithNodeName(config, testVolType, ep, config.nodes.Items[1].Name, makeLocalPodWithNodeName)
})
testPodWithNodeName(config, testVolType, ep, config.nodes.Items[1].Name, makeLocalPodWithNodeSelector)
})
}
})
Context("when pod's node is different from PV's NodeName", func() {
BeforeEach(func() {
if len(config.nodes.Items) < 2 {
framework.Skipf("Runs only when number of nodes >= 2")
}
})
ep := &eventPatterns{
reason: "FailedMount",
pattern: make([]string, 2)}
ep.pattern = append(ep.pattern, "NodeSelectorTerm")
ep.pattern = append(ep.pattern, "Storage node affinity check failed")
for _, testVolType := range LocalVolumeTypes {
It("should not be able to mount due to different NodeName", func() {
testPodWithNodeName(config, testVolType, ep, config.nodes.Items[1].Name, makeLocalPodWithNodeName)
})
}
})
}
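The testVolType := tempTestVolType copy above is what the "New variable required for ginkgo test closures" comment refers to: the It bodies are closures that run only after the range loop has finished, so without a per-iteration copy every closure would observe the last volume type. A minimal standalone sketch of the same gotcha, under the loop-variable semantics of Go releases contemporary with this change:
package main

import "fmt"

func main() {
    volTypes := []string{"dir", "tmpfs", "gce-localssd-scsi-fs"}

    var buggy, fixed []func()
    for _, tempT := range volTypes {
        // BUG: all three closures share the single loop variable tempT.
        buggy = append(buggy, func() { fmt.Println("buggy:", tempT) })

        t := tempT // per-iteration copy, like testVolType := tempTestVolType
        fixed = append(fixed, func() { fmt.Println("fixed:", t) })
    }
    for _, f := range buggy {
        f() // prints "gce-localssd-scsi-fs" three times
    }
    for _, f := range fixed {
        f() // prints each volume type once
    }
}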
Context("when using local volume provisioner", func() {
var volumePath string
@ -362,7 +373,6 @@ type makeLocalPodWith func(config *localTestConfig, volume *localTestVolume, nod
func testPodWithNodeName(config *localTestConfig, testVolType LocalVolumeType, ep *eventPatterns, nodeName string, makeLocalPodFunc makeLocalPodWith) {
var testVol *localTestVolume
By(fmt.Sprintf("local-volume-type: %s", testVolType))
testVol = setupLocalVolumePVCPV(config, testVolType)
pod := makeLocalPodFunc(config, testVol, nodeName)
@ -486,16 +496,7 @@ func podNodeName(config *localTestConfig, pod *v1.Pod) (string, error) {
return runtimePod.Spec.NodeName, runtimePodErr
}
// setupLocalVolume sets up a directory to use for a local PV
func setupLocalVolume(config *localTestConfig, localVolumeType LocalVolumeType) *localTestVolume {
testDirName := "local-volume-test-" + string(uuid.NewUUID())
hostDir := filepath.Join(hostBase, testDirName)
if localVolumeType == TmpfsLocalVolumeType {
createAndMountTmpfsLocalVolume(config, hostDir)
}
// populate volume with testFile containing testFileContent
func setupWriteTestFile(hostDir string, config *localTestConfig, localVolumeType LocalVolumeType) *localTestVolume {
writeCmd, _ := createWriteAndReadCmds(hostDir, testFile, testFileContent)
By(fmt.Sprintf("Creating local volume on node %q at path %q", config.node0.Name, hostDir))
err := framework.IssueSSHCommand(writeCmd, framework.TestContext.Provider, config.node0)
@ -507,7 +508,30 @@ func setupLocalVolume(config *localTestConfig, localVolumeType LocalVolumeType)
}
}
// Deletes the PVC/PV, and launches a pod with hostpath volume to remove the test directory
func setupLocalVolumeTmpfs(config *localTestConfig) *localTestVolume {
testDirName := "local-volume-test-" + string(uuid.NewUUID())
hostDir := filepath.Join(hostBase, testDirName)
createAndMountTmpfsLocalVolume(config, hostDir)
// populate volume with testFile containing testFileContent
return setupWriteTestFile(hostDir, config, TmpfsLocalVolumeType)
}
func setupLocalVolumeGCELocalSSD(config *localTestConfig) *localTestVolume {
res, err := framework.IssueSSHCommandWithResult("ls /mnt/disks/by-uuid/google-local-ssds-scsi-fs/", framework.TestContext.Provider, config.node0)
Expect(err).NotTo(HaveOccurred())
dirName := strings.Fields(res.Stdout)[0]
hostDir := "/mnt/disks/by-uuid/google-local-ssds-scsi-fs/" + dirName
// populate volume with testFile containing testFileContent
return setupWriteTestFile(hostDir, config, GCELocalSSDVolumeType)
}
func setupLocalVolumeDirectory(config *localTestConfig) *localTestVolume {
testDirName := "local-volume-test-" + string(uuid.NewUUID())
hostDir := filepath.Join(hostBase, testDirName)
// populate volume with testFile containing testFileContent
return setupWriteTestFile(hostDir, config, DirectoryLocalVolumeType)
}
func cleanupLocalVolume(config *localTestConfig, volume *localTestVolume) {
if volume == nil {
return
@ -519,10 +543,30 @@ func cleanupLocalVolume(config *localTestConfig, volume *localTestVolume) {
framework.Failf("Failed to delete PV and/or PVC: %v", utilerrors.NewAggregate(errs))
}
if volume.localVolumeType == TmpfsLocalVolumeType {
unmountTmpfsLocalVolume(config, volume.hostDir)
}
cleanup := cleanupLocalVolumeMap[volume.localVolumeType]
cleanup(config, volume)
}
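The dispatch through cleanupLocalVolumeMap assumes every volume type under test has a registered cleanup function; setupLocalVolumePVCPV later guards the analogous setup lookup with a comma-ok check. A tiny, purely illustrative sketch of the same guard applied here:
cleanup, ok := cleanupLocalVolumeMap[volume.localVolumeType]
if !ok {
    framework.Failf("no cleanup function registered for volume type %q", volume.localVolumeType)
}
cleanup(config, volume)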
// Removes the test file from the GCE local SSD mount over SSH
func cleanupLocalVolumeGCELocalSSD(config *localTestConfig, volume *localTestVolume) {
By("Removing the test directory")
removeCmd := fmt.Sprintf("rm %s", volume.hostDir+"/"+testFile)
err := framework.IssueSSHCommand(removeCmd, framework.TestContext.Provider, config.node0)
Expect(err).NotTo(HaveOccurred())
}
// Unmounts the tmpfs and removes the test directory over SSH
func cleanupLocalVolumeTmpfs(config *localTestConfig, volume *localTestVolume) {
unmountTmpfsLocalVolume(config, volume.hostDir)
By("Removing the test directory")
removeCmd := fmt.Sprintf("rm -r %s", volume.hostDir)
err := framework.IssueSSHCommand(removeCmd, framework.TestContext.Provider, config.node0)
Expect(err).NotTo(HaveOccurred())
}
// Removes the test directory over SSH
func cleanupLocalVolumeDirectory(config *localTestConfig, volume *localTestVolume) {
By("Removing the test directory")
removeCmd := fmt.Sprintf("rm -r %s", volume.hostDir)
err := framework.IssueSSHCommand(removeCmd, framework.TestContext.Provider, config.node0)
@ -703,7 +747,9 @@ func podRWCmdExec(pod *v1.Pod, cmd string) string {
// and create local PVC and PV
func setupLocalVolumePVCPV(config *localTestConfig, localVolumeType LocalVolumeType) *localTestVolume {
By("Initializing test volume")
testVol := setupLocalVolume(config, localVolumeType)
setupLocalVolume, ok := setupLocalVolumeMap[localVolumeType]
Expect(ok).To(BeTrue())
testVol := setupLocalVolume(config)
By("Creating local PVC and PV")
createLocalPVCPV(config, testVol)
@ -921,3 +967,16 @@ func findLocalPersistentVolume(c clientset.Interface, volumePath string) (*v1.Pe
}
return nil, fmt.Errorf("Unable to find local persistent volume with path %v", volumePath)
}
// SkipUnlessLocalSSDExists takes in an ssdInterface (scsi/nvme) and a filesystemType (fs/block)
// and skips if a disk of that type does not exist on the node
func SkipUnlessLocalSSDExists(ssdInterface, filesystemType string, node *v1.Node) {
ssdCmd := fmt.Sprintf("ls -1 /mnt/disks/by-uuid/google-local-ssds-%s-%s/ | wc -l", ssdInterface, filesystemType)
res, err := framework.IssueSSHCommandWithResult(ssdCmd, framework.TestContext.Provider, node)
Expect(err).NotTo(HaveOccurred())
num, err := strconv.Atoi(strings.TrimSpace(res.Stdout))
Expect(err).NotTo(HaveOccurred())
if num < 1 {
framework.Skipf("Requires at least 1 %s %s localSSD ", ssdInterface, filesystemType)
}
}

View File

@ -70,7 +70,7 @@ var _ = SIGDescribe("PersistentVolumes:vsphere", func() {
selector = metav1.SetAsLabelSelector(volLabel)
if vsp == nil {
vsp, err = vsphere.GetVSphere()
vsp, err = getVSphere(c)
Expect(err).NotTo(HaveOccurred())
}
if volumePath == "" {
@ -105,7 +105,7 @@ var _ = SIGDescribe("PersistentVolumes:vsphere", func() {
node = types.NodeName(clientPod.Spec.NodeName)
By("Verify disk should be attached to the node")
isAttached, err := verifyVSphereDiskAttached(vsp, volumePath, node)
isAttached, err := verifyVSphereDiskAttached(c, vsp, volumePath, node)
Expect(err).NotTo(HaveOccurred())
Expect(isAttached).To(BeTrue(), "disk is not attached with the node")
})
@ -133,7 +133,11 @@ var _ = SIGDescribe("PersistentVolumes:vsphere", func() {
framework.AddCleanupAction(func() {
// Cleanup actions will be called even when the tests are skipped and leaves namespace unset.
if len(ns) > 0 && len(volumePath) > 0 {
framework.ExpectNoError(waitForVSphereDiskToDetach(vsp, volumePath, node))
client, err := framework.LoadClientset()
if err != nil {
return
}
framework.ExpectNoError(waitForVSphereDiskToDetach(client, vsp, volumePath, node))
vsp.DeleteVolume(volumePath)
}
})
@ -213,6 +217,6 @@ var _ = SIGDescribe("PersistentVolumes:vsphere", func() {
Expect(err).NotTo(HaveOccurred())
By("Verifying Persistent Disk detaches")
waitForVSphereDiskToDetach(vsp, volumePath, node)
waitForVSphereDiskToDetach(c, vsp, volumePath, node)
})
})

View File

@ -56,7 +56,7 @@ var _ = SIGDescribe("PersistentVolumes [Feature:ReclaimPolicy]", func() {
})
AfterEach(func() {
vsp, err := vsphere.GetVSphere()
vsp, err := getVSphere(c)
Expect(err).NotTo(HaveOccurred())
testCleanupVSpherePersistentVolumeReclaim(vsp, c, ns, volumePath, pv, pvc)
})
@ -74,7 +74,7 @@ var _ = SIGDescribe("PersistentVolumes [Feature:ReclaimPolicy]", func() {
6. Verify PV is deleted automatically.
*/
It("should delete persistent volume when reclaimPolicy set to delete and associated claim is deleted", func() {
vsp, err := vsphere.GetVSphere()
vsp, err := getVSphere(c)
Expect(err).NotTo(HaveOccurred())
volumePath, pv, pvc, err = testSetupVSpherePersistentVolumeReclaim(vsp, c, ns, v1.PersistentVolumeReclaimDelete)
@ -104,7 +104,7 @@ var _ = SIGDescribe("PersistentVolumes [Feature:ReclaimPolicy]", func() {
9. Verify PV should be detached from the node and automatically deleted.
*/
It("should not detach and unmount PV when associated pvc with delete as reclaimPolicy is deleted when it is in use by the pod", func() {
vsp, err := vsphere.GetVSphere()
vsp, err := getVSphere(c)
Expect(err).NotTo(HaveOccurred())
volumePath, pv, pvc, err = testSetupVSpherePersistentVolumeReclaim(vsp, c, ns, v1.PersistentVolumeReclaimDelete)
@ -127,19 +127,19 @@ var _ = SIGDescribe("PersistentVolumes [Feature:ReclaimPolicy]", func() {
Expect(framework.WaitForPersistentVolumePhase(v1.VolumeFailed, c, pv.Name, 1*time.Second, 60*time.Second)).NotTo(HaveOccurred())
By("Verify the volume is attached to the node")
isVolumeAttached, verifyDiskAttachedError := verifyVSphereDiskAttached(vsp, pv.Spec.VsphereVolume.VolumePath, node)
isVolumeAttached, verifyDiskAttachedError := verifyVSphereDiskAttached(c, vsp, pv.Spec.VsphereVolume.VolumePath, node)
Expect(verifyDiskAttachedError).NotTo(HaveOccurred())
Expect(isVolumeAttached).To(BeTrue())
By("Verify the volume is accessible and available in the pod")
verifyVSphereVolumesAccessible(pod, []*v1.PersistentVolume{pv}, vsp)
verifyVSphereVolumesAccessible(c, pod, []*v1.PersistentVolume{pv}, vsp)
framework.Logf("Verified that Volume is accessible in the POD after deleting PV claim")
By("Deleting the Pod")
framework.ExpectNoError(framework.DeletePodWithWait(f, c, pod), "Failed to delete pod ", pod.Name)
By("Verify PV is detached from the node after Pod is deleted")
Expect(waitForVSphereDiskToDetach(vsp, pv.Spec.VsphereVolume.VolumePath, types.NodeName(pod.Spec.NodeName))).NotTo(HaveOccurred())
Expect(waitForVSphereDiskToDetach(c, vsp, pv.Spec.VsphereVolume.VolumePath, types.NodeName(pod.Spec.NodeName))).NotTo(HaveOccurred())
By("Verify PV should be deleted automatically")
framework.ExpectNoError(framework.WaitForPersistentVolumeDeleted(c, pv.Name, 1*time.Second, 30*time.Second))
@ -167,7 +167,7 @@ var _ = SIGDescribe("PersistentVolumes [Feature:ReclaimPolicy]", func() {
It("should retain persistent volume when reclaimPolicy set to retain when associated claim is deleted", func() {
var volumeFileContent = "hello from vsphere cloud provider, Random Content is :" + strconv.FormatInt(time.Now().UnixNano(), 10)
vsp, err := vsphere.GetVSphere()
vsp, err := getVSphere(c)
Expect(err).NotTo(HaveOccurred())
volumePath, pv, pvc, err = testSetupVSpherePersistentVolumeReclaim(vsp, c, ns, v1.PersistentVolumeReclaimRetain)

View File

@ -23,7 +23,6 @@ import (
. "github.com/onsi/gomega"
"k8s.io/api/core/v1"
clientset "k8s.io/client-go/kubernetes"
vsphere "k8s.io/kubernetes/pkg/cloudprovider/providers/vsphere"
"k8s.io/kubernetes/test/e2e/framework"
)
@ -104,7 +103,7 @@ var _ = SIGDescribe("PersistentVolumes [Feature:LabelSelector]", func() {
func testSetupVSpherePVClabelselector(c clientset.Interface, ns string, ssdlabels map[string]string, vvollabels map[string]string) (volumePath string, pv_ssd *v1.PersistentVolume, pvc_ssd *v1.PersistentVolumeClaim, pvc_vvol *v1.PersistentVolumeClaim, err error) {
volumePath = ""
By("creating vmdk")
vsp, err := vsphere.GetVSphere()
vsp, err := getVSphere(c)
Expect(err).NotTo(HaveOccurred())
volumePath, err = createVSphereVolume(vsp, nil)
if err != nil {
@ -134,7 +133,7 @@ func testSetupVSpherePVClabelselector(c clientset.Interface, ns string, ssdlabel
func testCleanupVSpherePVClabelselector(c clientset.Interface, ns string, volumePath string, pv_ssd *v1.PersistentVolume, pvc_ssd *v1.PersistentVolumeClaim, pvc_vvol *v1.PersistentVolumeClaim) {
By("running testCleanupVSpherePVClabelselector")
if len(volumePath) > 0 {
vsp, err := vsphere.GetVSphere()
vsp, err := getVSphere(c)
Expect(err).NotTo(HaveOccurred())
vsp.DeleteVolume(volumePath)
}

View File

@ -53,7 +53,6 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/kubernetes/pkg/cloudprovider/providers/vsphere"
kubeletapis "k8s.io/kubernetes/pkg/kubelet/apis"
"k8s.io/kubernetes/test/e2e/framework"
)
@ -507,7 +506,11 @@ var _ = SIGDescribe("Volumes", func() {
Prefix: "vsphere",
}
By("creating a test vsphere volume")
vsp, err := vsphere.GetVSphere()
c, err := framework.LoadClientset()
if err != nil {
return
}
vsp, err := getVSphere(c)
Expect(err).NotTo(HaveOccurred())
volumePath, err = createVSphereVolume(vsp, nil)

View File

@ -150,7 +150,7 @@ var _ = SIGDescribe("vcp at scale [Feature:vsphere] ", func() {
scArrays[index] = sc
}
vsp, err := vsphere.GetVSphere()
vsp, err := getVSphere(client)
Expect(err).NotTo(HaveOccurred())
volumeCountPerInstance := volumeCount / numberOfInstances
@ -176,7 +176,7 @@ var _ = SIGDescribe("vcp at scale [Feature:vsphere] ", func() {
Expect(err).NotTo(HaveOccurred())
}
By("Waiting for volumes to be detached from the node")
err = waitForVSphereDisksToDetach(vsp, nodeVolumeMap)
err = waitForVSphereDisksToDetach(client, vsp, nodeVolumeMap)
Expect(err).NotTo(HaveOccurred())
for _, pvcClaim := range pvcClaimList {
@ -228,7 +228,7 @@ func VolumeCreateAndAttach(client clientset.Interface, namespace string, sc []*s
nodeVolumeMap[pod.Spec.NodeName] = append(nodeVolumeMap[pod.Spec.NodeName], pv.Spec.VsphereVolume.VolumePath)
}
By("Verify the volume is accessible and available in the pod")
verifyVSphereVolumesAccessible(pod, persistentvolumes, vsp)
verifyVSphereVolumesAccessible(client, pod, persistentvolumes, vsp)
nodeSelectorIndex++
}
nodeVolumeMapChan <- nodeVolumeMap

View File

@ -24,7 +24,6 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/kubernetes/pkg/cloudprovider/providers/vsphere"
"k8s.io/kubernetes/test/e2e/framework"
)
@ -104,7 +103,7 @@ var _ = SIGDescribe("vsphere statefulset", func() {
Expect(scaledownErr).NotTo(HaveOccurred())
statefulsetTester.WaitForStatusReadyReplicas(statefulset, replicas-1)
vsp, err := vsphere.GetVSphere()
vsp, err := getVSphere(client)
Expect(err).NotTo(HaveOccurred())
// After scale down, verify vsphere volumes are detached from deleted pods
@ -117,7 +116,7 @@ var _ = SIGDescribe("vsphere statefulset", func() {
if volumespec.PersistentVolumeClaim != nil {
vSpherediskPath := getvSphereVolumePathFromClaim(client, statefulset.Namespace, volumespec.PersistentVolumeClaim.ClaimName)
framework.Logf("Waiting for Volume: %q to detach from Node: %q", vSpherediskPath, sspod.Spec.NodeName)
Expect(waitForVSphereDiskToDetach(vsp, vSpherediskPath, types.NodeName(sspod.Spec.NodeName))).NotTo(HaveOccurred())
Expect(waitForVSphereDiskToDetach(client, vsp, vSpherediskPath, types.NodeName(sspod.Spec.NodeName))).NotTo(HaveOccurred())
}
}
}
@ -146,7 +145,7 @@ var _ = SIGDescribe("vsphere statefulset", func() {
framework.Logf("Verify Volume: %q is attached to the Node: %q", vSpherediskPath, sspod.Spec.NodeName)
// Verify scale up has re-attached the same volumes and not introduced new volume
Expect(volumesBeforeScaleDown[vSpherediskPath] == "").To(BeFalse())
isVolumeAttached, verifyDiskAttachedError := verifyVSphereDiskAttached(vsp, vSpherediskPath, types.NodeName(sspod.Spec.NodeName))
isVolumeAttached, verifyDiskAttachedError := verifyVSphereDiskAttached(client, vsp, vSpherediskPath, types.NodeName(sspod.Spec.NodeName))
Expect(isVolumeAttached).To(BeTrue())
Expect(verifyDiskAttachedError).NotTo(HaveOccurred())
}

View File

@ -30,7 +30,6 @@ import (
"k8s.io/apimachinery/pkg/types"
k8stype "k8s.io/apimachinery/pkg/types"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/kubernetes/pkg/cloudprovider/providers/vsphere"
"k8s.io/kubernetes/test/e2e/framework"
)
@ -135,9 +134,8 @@ var _ = SIGDescribe("vsphere cloud provider stress [Feature:vsphere]", func() {
func PerformVolumeLifeCycleInParallel(f *framework.Framework, client clientset.Interface, namespace string, instanceId string, sc *storageV1.StorageClass, iterations int, wg *sync.WaitGroup) {
defer wg.Done()
defer GinkgoRecover()
vsp, err := vsphere.GetVSphere()
vsp, err := getVSphere(f.ClientSet)
Expect(err).NotTo(HaveOccurred())
for iterationCount := 0; iterationCount < iterations; iterationCount++ {
logPrefix := fmt.Sprintf("Instance: [%v], Iteration: [%v] :", instanceId, iterationCount+1)
By(fmt.Sprintf("%v Creating PVC using the Storage Class: %v", logPrefix, sc.Name))
@ -164,19 +162,19 @@ func PerformVolumeLifeCycleInParallel(f *framework.Framework, client clientset.I
Expect(err).NotTo(HaveOccurred())
By(fmt.Sprintf("%v Verifing the volume: %v is attached to the node VM: %v", logPrefix, persistentvolumes[0].Spec.VsphereVolume.VolumePath, pod.Spec.NodeName))
isVolumeAttached, verifyDiskAttachedError := verifyVSphereDiskAttached(vsp, persistentvolumes[0].Spec.VsphereVolume.VolumePath, types.NodeName(pod.Spec.NodeName))
isVolumeAttached, verifyDiskAttachedError := verifyVSphereDiskAttached(client, vsp, persistentvolumes[0].Spec.VsphereVolume.VolumePath, types.NodeName(pod.Spec.NodeName))
Expect(isVolumeAttached).To(BeTrue())
Expect(verifyDiskAttachedError).NotTo(HaveOccurred())
By(fmt.Sprintf("%v Verifing the volume: %v is accessible in the pod: %v", logPrefix, persistentvolumes[0].Spec.VsphereVolume.VolumePath, pod.Name))
verifyVSphereVolumesAccessible(pod, persistentvolumes, vsp)
verifyVSphereVolumesAccessible(client, pod, persistentvolumes, vsp)
By(fmt.Sprintf("%v Deleting pod: %v", logPrefix, pod.Name))
err = framework.DeletePodWithWait(f, client, pod)
Expect(err).NotTo(HaveOccurred())
By(fmt.Sprintf("%v Waiting for volume: %v to be detached from the node: %v", logPrefix, persistentvolumes[0].Spec.VsphereVolume.VolumePath, pod.Spec.NodeName))
err = waitForVSphereDiskToDetach(vsp, persistentvolumes[0].Spec.VsphereVolume.VolumePath, k8stype.NodeName(pod.Spec.NodeName))
err = waitForVSphereDiskToDetach(client, vsp, persistentvolumes[0].Spec.VsphereVolume.VolumePath, k8stype.NodeName(pod.Spec.NodeName))
Expect(err).NotTo(HaveOccurred())
By(fmt.Sprintf("%v Deleting the Claim: %v", logPrefix, pvclaim.Name))

View File

@ -55,13 +55,13 @@ const (
)
// Sanity check for vSphere testing. Verify the persistent disk is attached to the node.
func verifyVSphereDiskAttached(vsp *vsphere.VSphere, volumePath string, nodeName types.NodeName) (bool, error) {
func verifyVSphereDiskAttached(c clientset.Interface, vsp *vsphere.VSphere, volumePath string, nodeName types.NodeName) (bool, error) {
var (
isAttached bool
err error
)
if vsp == nil {
vsp, err = vsphere.GetVSphere()
vsp, err = getVSphere(c)
Expect(err).NotTo(HaveOccurred())
}
isAttached, err = vsp.DiskIsAttached(volumePath, nodeName)
@ -70,7 +70,7 @@ func verifyVSphereDiskAttached(vsp *vsphere.VSphere, volumePath string, nodeName
}
// Wait until vsphere volumes are detached from the list of nodes or time out after 5 minutes
func waitForVSphereDisksToDetach(vsp *vsphere.VSphere, nodeVolumes map[k8stype.NodeName][]string) error {
func waitForVSphereDisksToDetach(c clientset.Interface, vsp *vsphere.VSphere, nodeVolumes map[k8stype.NodeName][]string) error {
var (
err error
disksAttached = true
@ -78,7 +78,7 @@ func waitForVSphereDisksToDetach(vsp *vsphere.VSphere, nodeVolumes map[k8stype.N
detachPollTime = 10 * time.Second
)
if vsp == nil {
vsp, err = vsphere.GetVSphere()
vsp, err = getVSphere(c)
if err != nil {
return err
}
@ -110,7 +110,7 @@ func waitForVSphereDisksToDetach(vsp *vsphere.VSphere, nodeVolumes map[k8stype.N
}
// Wait until vsphere vmdk moves to expected state on the given node, or time out after 6 minutes
func waitForVSphereDiskStatus(vsp *vsphere.VSphere, volumePath string, nodeName types.NodeName, expectedState volumeState) error {
func waitForVSphereDiskStatus(c clientset.Interface, vsp *vsphere.VSphere, volumePath string, nodeName types.NodeName, expectedState volumeState) error {
var (
err error
diskAttached bool
@ -130,7 +130,7 @@ func waitForVSphereDiskStatus(vsp *vsphere.VSphere, volumePath string, nodeName
}
err = wait.Poll(pollTime, timeout, func() (bool, error) {
diskAttached, err = verifyVSphereDiskAttached(vsp, volumePath, nodeName)
diskAttached, err = verifyVSphereDiskAttached(c, vsp, volumePath, nodeName)
if err != nil {
return true, err
}
@ -154,13 +154,13 @@ func waitForVSphereDiskStatus(vsp *vsphere.VSphere, volumePath string, nodeName
}
// Wait until vsphere vmdk is attached to the given node or time out after 6 minutes
func waitForVSphereDiskToAttach(vsp *vsphere.VSphere, volumePath string, nodeName types.NodeName) error {
return waitForVSphereDiskStatus(vsp, volumePath, nodeName, volumeStateAttached)
func waitForVSphereDiskToAttach(c clientset.Interface, vsp *vsphere.VSphere, volumePath string, nodeName types.NodeName) error {
return waitForVSphereDiskStatus(c, vsp, volumePath, nodeName, volumeStateAttached)
}
// Wait until vsphere vmdk is detached from the given node or time out after 6 minutes
func waitForVSphereDiskToDetach(vsp *vsphere.VSphere, volumePath string, nodeName types.NodeName) error {
return waitForVSphereDiskStatus(vsp, volumePath, nodeName, volumeStateDetached)
func waitForVSphereDiskToDetach(c clientset.Interface, vsp *vsphere.VSphere, volumePath string, nodeName types.NodeName) error {
return waitForVSphereDiskStatus(c, vsp, volumePath, nodeName, volumeStateDetached)
}
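The attach/detach helpers above all build on the wait.Poll contract from k8s.io/apimachinery/pkg/util/wait. A small sketch of that contract with a hypothetical check (checkDiskAttached is not a real helper in this file):
// The condition returns (true, nil) to stop successfully, (false, nil) to be
// polled again, and a non-nil error to abort immediately; wait.Poll returns
// wait.ErrWaitTimeout if the timeout elapses first.
err := wait.Poll(10*time.Second, 6*time.Minute, func() (bool, error) {
    attached, err := checkDiskAttached() // hypothetical check
    if err != nil {
        return false, err
    }
    return !attached, nil // done once the disk is no longer attached
})
if err != nil {
    framework.Logf("disk did not detach in time: %v", err)
}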
// function to create vsphere volume spec with given VMDK volume path, Reclaim Policy and labels
@ -414,12 +414,12 @@ func createEmptyFilesOnVSphereVolume(namespace string, podName string, filePaths
}
// verify volumes are attached to the node and are accessible in pod
func verifyVSphereVolumesAccessible(pod *v1.Pod, persistentvolumes []*v1.PersistentVolume, vsp *vsphere.VSphere) {
func verifyVSphereVolumesAccessible(c clientset.Interface, pod *v1.Pod, persistentvolumes []*v1.PersistentVolume, vsp *vsphere.VSphere) {
nodeName := pod.Spec.NodeName
namespace := pod.Namespace
for index, pv := range persistentvolumes {
// Verify disks are attached to the node
isAttached, err := verifyVSphereDiskAttached(vsp, pv.Spec.VsphereVolume.VolumePath, k8stype.NodeName(nodeName))
isAttached, err := verifyVSphereDiskAttached(c, vsp, pv.Spec.VsphereVolume.VolumePath, k8stype.NodeName(nodeName))
Expect(err).NotTo(HaveOccurred())
Expect(isAttached).To(BeTrue(), fmt.Sprintf("disk %v is not attached with the node", pv.Spec.VsphereVolume.VolumePath))
// Verify Volumes are accessible
@ -437,3 +437,23 @@ func getvSphereVolumePathFromClaim(client clientset.Interface, namespace string,
Expect(err).NotTo(HaveOccurred())
return pv.Spec.VsphereVolume.VolumePath
}
// addNodesToVCP registers the cluster's current nodes with the vSphere cloud
// provider instance so that node VMs can be resolved for attach/detach checks.
func addNodesToVCP(vsp *vsphere.VSphere, c clientset.Interface) error {
nodes, err := c.CoreV1().Nodes().List(metav1.ListOptions{})
if err != nil {
return err
}
for _, node := range nodes.Items {
vsp.NodeAdded(&node)
}
return nil
}
// getVSphere returns a vSphere cloud provider instance with the cluster's
// nodes already registered via addNodesToVCP.
func getVSphere(c clientset.Interface) (*vsphere.VSphere, error) {
vsp, err := vsphere.GetVSphere()
if err != nil {
return nil, err
}
addNodesToVCP(vsp, c)
return vsp, nil
}

View File

@ -25,7 +25,6 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/kubernetes/pkg/cloudprovider/providers/vsphere"
"k8s.io/kubernetes/pkg/cloudprovider/providers/vsphere/vclib"
"k8s.io/kubernetes/test/e2e/framework"
)
@ -69,7 +68,7 @@ var _ = SIGDescribe("Volume Provisioning On Clustered Datastore [Feature:vsphere
It("verify static provisioning on clustered datastore", func() {
var volumePath string
vsp, err := vsphere.GetVSphere()
vsp, err := getVSphere(client)
Expect(err).NotTo(HaveOccurred())
By("creating a test vsphere volume")
@ -100,7 +99,7 @@ var _ = SIGDescribe("Volume Provisioning On Clustered Datastore [Feature:vsphere
nodeName := types.NodeName(pod.Spec.NodeName)
By("Verifying volume is attached")
isAttached, err := verifyVSphereDiskAttached(vsp, volumePath, nodeName)
isAttached, err := verifyVSphereDiskAttached(client, vsp, volumePath, nodeName)
Expect(err).NotTo(HaveOccurred())
Expect(isAttached).To(BeTrue(), fmt.Sprintf("disk: %s is not attached with the node: %v", volumePath, nodeName))
@ -109,7 +108,7 @@ var _ = SIGDescribe("Volume Provisioning On Clustered Datastore [Feature:vsphere
Expect(err).NotTo(HaveOccurred())
By("Waiting for volumes to be detached from the node")
err = waitForVSphereDiskToDetach(vsp, volumePath, nodeName)
err = waitForVSphereDiskToDetach(client, vsp, volumePath, nodeName)
Expect(err).NotTo(HaveOccurred())
})

View File

@ -68,7 +68,7 @@ var _ = SIGDescribe("Volume Provisioning on Datastore [Feature:vsphere]", func()
scParameters[DiskFormat] = ThinDisk
err := invokeInvalidDatastoreTestNeg(client, namespace, scParameters)
Expect(err).To(HaveOccurred())
errorMsg := `Failed to provision volume with StorageClass \"` + DatastoreSCName + `\": datastore '` + InvalidDatastore + `' not found`
errorMsg := `Failed to provision volume with StorageClass \"` + DatastoreSCName + `\": The specified datastore ` + InvalidDatastore + ` is not a shared datastore across node VMs`
if !strings.Contains(err.Error(), errorMsg) {
Expect(err).NotTo(HaveOccurred(), errorMsg)
}

View File

@ -145,9 +145,9 @@ func invokeTest(f *framework.Framework, client clientset.Interface, namespace st
pod, err := client.CoreV1().Pods(namespace).Create(podSpec)
Expect(err).NotTo(HaveOccurred())
vsp, err := vsphere.GetVSphere()
vsp, err := getVSphere(client)
Expect(err).NotTo(HaveOccurred())
verifyVSphereDiskAttached(vsp, pv.Spec.VsphereVolume.VolumePath, k8stype.NodeName(nodeName))
verifyVSphereDiskAttached(client, vsp, pv.Spec.VsphereVolume.VolumePath, k8stype.NodeName(nodeName))
By("Waiting for pod to be running")
Expect(framework.WaitForPodNameRunningInNamespace(client, pod.Name, namespace)).To(Succeed())

View File

@ -97,7 +97,7 @@ func invokeTestForFstype(f *framework.Framework, client clientset.Interface, nam
framework.Logf("Invoking Test for fstype: %s", fstype)
scParameters := make(map[string]string)
scParameters["fstype"] = fstype
vsp, err := vsphere.GetVSphere()
vsp, err := getVSphere(client)
Expect(err).NotTo(HaveOccurred())
// Create Persistent Volume
@ -117,7 +117,7 @@ func invokeTestForFstype(f *framework.Framework, client clientset.Interface, nam
func invokeTestForInvalidFstype(f *framework.Framework, client clientset.Interface, namespace string, fstype string) {
scParameters := make(map[string]string)
scParameters["fstype"] = fstype
vsp, err := vsphere.GetVSphere()
vsp, err := getVSphere(client)
Expect(err).NotTo(HaveOccurred())
// Create Persistent Volume
@ -170,12 +170,12 @@ func createPodAndVerifyVolumeAccessible(client clientset.Interface, namespace st
pvclaims = append(pvclaims, pvclaim)
By("Creating pod to attach PV to the node")
// Create pod to attach Volume to Node
pod, err := framework.CreatePod(client, namespace, nil, pvclaims, false, "")
pod, err := framework.CreatePod(client, namespace, nil, pvclaims, false, ExecCommand)
Expect(err).NotTo(HaveOccurred())
// Asserts: Right disk is attached to the pod
By("Verify the volume is accessible and available in the pod")
verifyVSphereVolumesAccessible(pod, persistentvolumes, vsp)
verifyVSphereVolumesAccessible(client, pod, persistentvolumes, vsp)
return pod
}
@ -184,7 +184,7 @@ func detachVolume(f *framework.Framework, client clientset.Interface, vsp *vsphe
framework.DeletePodWithWait(f, client, pod)
By("Waiting for volumes to be detached from the node")
waitForVSphereDiskToDetach(vsp, volPath, k8stype.NodeName(pod.Spec.NodeName))
waitForVSphereDiskToDetach(client, vsp, volPath, k8stype.NodeName(pod.Spec.NodeName))
}
func deleteVolume(client clientset.Interface, pvclaimName string, namespace string) {

View File

@ -27,7 +27,6 @@ import (
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/uuid"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/kubernetes/pkg/cloudprovider/providers/vsphere"
"k8s.io/kubernetes/test/e2e/framework"
)
@ -79,7 +78,7 @@ var _ = SIGDescribe("Volume Attach Verify [Feature:vsphere][Serial][Disruptive]"
})
It("verify volume remains attached after master kubelet restart", func() {
vsp, err := vsphere.GetVSphere()
vsp, err := getVSphere(client)
Expect(err).NotTo(HaveOccurred())
// Create pod on each node
@ -106,7 +105,7 @@ var _ = SIGDescribe("Volume Attach Verify [Feature:vsphere][Serial][Disruptive]"
nodeName := types.NodeName(pod.Spec.NodeName)
By(fmt.Sprintf("Verify volume %s is attached to the pod %v", volumePath, nodeName))
isAttached, err := verifyVSphereDiskAttached(vsp, volumePath, types.NodeName(nodeName))
isAttached, err := verifyVSphereDiskAttached(client, vsp, volumePath, types.NodeName(nodeName))
Expect(err).NotTo(HaveOccurred())
Expect(isAttached).To(BeTrue(), fmt.Sprintf("disk: %s is not attached with the node", volumePath))
@ -126,7 +125,7 @@ var _ = SIGDescribe("Volume Attach Verify [Feature:vsphere][Serial][Disruptive]"
nodeName := types.NodeName(pod.Spec.NodeName)
By(fmt.Sprintf("After master restart, verify volume %v is attached to the pod %v", volumePath, nodeName))
isAttached, err := verifyVSphereDiskAttached(vsp, volumePaths[i], types.NodeName(nodeName))
isAttached, err := verifyVSphereDiskAttached(client, vsp, volumePaths[i], types.NodeName(nodeName))
Expect(err).NotTo(HaveOccurred())
Expect(isAttached).To(BeTrue(), fmt.Sprintf("disk: %s is not attached with the node", volumePath))
@ -135,7 +134,7 @@ var _ = SIGDescribe("Volume Attach Verify [Feature:vsphere][Serial][Disruptive]"
Expect(err).NotTo(HaveOccurred())
By(fmt.Sprintf("Waiting for volume %s to be detached from the node %v", volumePath, nodeName))
err = waitForVSphereDiskToDetach(vsp, volumePath, types.NodeName(nodeName))
err = waitForVSphereDiskToDetach(client, vsp, volumePath, types.NodeName(nodeName))
Expect(err).NotTo(HaveOccurred())
By(fmt.Sprintf("Deleting volume %s", volumePath))

View File

@ -61,7 +61,7 @@ var _ = SIGDescribe("Node Poweroff [Feature:vsphere] [Slow] [Disruptive]", func(
nodeList := framework.GetReadySchedulableNodesOrDie(f.ClientSet)
Expect(nodeList.Items).NotTo(BeEmpty(), "Unable to find ready and schedulable Node")
Expect(len(nodeList.Items) > 1).To(BeTrue(), "At least 2 nodes are required for this test")
vsp, err = vsphere.GetVSphere()
vsp, err = getVSphere(client)
Expect(err).NotTo(HaveOccurred())
workingDir = os.Getenv("VSPHERE_WORKING_DIR")
Expect(workingDir).NotTo(BeEmpty())
@ -112,7 +112,7 @@ var _ = SIGDescribe("Node Poweroff [Feature:vsphere] [Slow] [Disruptive]", func(
node1 := types.NodeName(pod.Spec.NodeName)
By(fmt.Sprintf("Verify disk is attached to the node: %v", node1))
isAttached, err := verifyVSphereDiskAttached(vsp, volumePath, node1)
isAttached, err := verifyVSphereDiskAttached(client, vsp, volumePath, node1)
Expect(err).NotTo(HaveOccurred())
Expect(isAttached).To(BeTrue(), "Disk is not attached to the node")
@ -139,11 +139,11 @@ var _ = SIGDescribe("Node Poweroff [Feature:vsphere] [Slow] [Disruptive]", func(
Expect(err).NotTo(HaveOccurred(), "Pod did not fail over to a different node")
By(fmt.Sprintf("Waiting for disk to be attached to the new node: %v", node2))
err = waitForVSphereDiskToAttach(vsp, volumePath, node2)
err = waitForVSphereDiskToAttach(client, vsp, volumePath, node2)
Expect(err).NotTo(HaveOccurred(), "Disk is not attached to the node")
By(fmt.Sprintf("Waiting for disk to be detached from the previous node: %v", node1))
err = waitForVSphereDiskToDetach(vsp, volumePath, node1)
err = waitForVSphereDiskToDetach(client, vsp, volumePath, node1)
Expect(err).NotTo(HaveOccurred(), "Disk is not detached from the node")
By(fmt.Sprintf("Power on the previous node: %v", node1))

View File

@ -75,7 +75,7 @@ var _ = SIGDescribe("Volume Operations Storm [Feature:vsphere]", func() {
volume_ops_scale = DEFAULT_VOLUME_OPS_SCALE
}
pvclaims = make([]*v1.PersistentVolumeClaim, volume_ops_scale)
vsp, err = vsphere.GetVSphere()
vsp, err = getVSphere(client)
Expect(err).NotTo(HaveOccurred())
})
AfterEach(func() {
@ -113,14 +113,14 @@ var _ = SIGDescribe("Volume Operations Storm [Feature:vsphere]", func() {
Expect(err).NotTo(HaveOccurred())
By("Verify all volumes are accessible and available in the pod")
verifyVSphereVolumesAccessible(pod, persistentvolumes, vsp)
verifyVSphereVolumesAccessible(client, pod, persistentvolumes, vsp)
By("Deleting pod")
framework.ExpectNoError(framework.DeletePodWithWait(f, client, pod))
By("Waiting for volumes to be detached from the node")
for _, pv := range persistentvolumes {
waitForVSphereDiskToDetach(vsp, pv.Spec.VsphereVolume.VolumePath, k8stype.NodeName(pod.Spec.NodeName))
waitForVSphereDiskToDetach(client, vsp, pv.Spec.VsphereVolume.VolumePath, k8stype.NodeName(pod.Spec.NodeName))
}
})
})

View File

@ -28,7 +28,6 @@ import (
storageV1 "k8s.io/api/storage/v1"
"k8s.io/apimachinery/pkg/types"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/kubernetes/pkg/cloudprovider/providers/vsphere"
"k8s.io/kubernetes/test/e2e/framework"
)
@ -214,11 +213,11 @@ func invokeVolumeLifeCyclePerformance(f *framework.Framework, client clientset.I
latency[AttachOp] = elapsed.Seconds()
// Verify access to the volumes
vsp, err := vsphere.GetVSphere()
vsp, err := getVSphere(client)
Expect(err).NotTo(HaveOccurred())
for i, pod := range totalpods {
verifyVSphereVolumesAccessible(pod, totalpvs[i], vsp)
verifyVSphereVolumesAccessible(client, pod, totalpvs[i], vsp)
}
By("Deleting pods")
@ -237,7 +236,7 @@ func invokeVolumeLifeCyclePerformance(f *framework.Framework, client clientset.I
}
}
err = waitForVSphereDisksToDetach(vsp, nodeVolumeMap)
err = waitForVSphereDisksToDetach(client, vsp, nodeVolumeMap)
Expect(err).NotTo(HaveOccurred())
By("Deleting the PVCs")

View File

@ -57,7 +57,7 @@ var _ = SIGDescribe("Volume Placement", func() {
isNodeLabeled = true
}
By("creating vmdk")
vsp, err = vsphere.GetVSphere()
vsp, err = getVSphere(c)
Expect(err).NotTo(HaveOccurred())
volumePath, err := createVSphereVolume(vsp, nil)
Expect(err).NotTo(HaveOccurred())
@ -285,7 +285,7 @@ var _ = SIGDescribe("Volume Placement", func() {
framework.ExpectNoError(framework.DeletePodWithWait(f, c, podB), "defer: Failed to delete pod ", podB.Name)
By(fmt.Sprintf("wait for volumes to be detached from the node: %v", node1Name))
for _, volumePath := range volumePaths {
framework.ExpectNoError(waitForVSphereDiskToDetach(vsp, volumePath, types.NodeName(node1Name)))
framework.ExpectNoError(waitForVSphereDiskToDetach(c, vsp, volumePath, types.NodeName(node1Name)))
}
}()
@ -362,7 +362,7 @@ func createPodWithVolumeAndNodeSelector(client clientset.Interface, namespace st
By(fmt.Sprintf("Verify volume is attached to the node:%v", nodeName))
for _, volumePath := range volumePaths {
isAttached, err := verifyVSphereDiskAttached(vsp, volumePath, types.NodeName(nodeName))
isAttached, err := verifyVSphereDiskAttached(client, vsp, volumePath, types.NodeName(nodeName))
Expect(err).NotTo(HaveOccurred())
Expect(isAttached).To(BeTrue(), "disk:"+volumePath+" is not attached with the node")
}
@ -385,6 +385,6 @@ func deletePodAndWaitForVolumeToDetach(f *framework.Framework, c clientset.Inter
By("Waiting for volume to be detached from the node")
for _, volumePath := range volumePaths {
framework.ExpectNoError(waitForVSphereDiskToDetach(vsp, volumePath, types.NodeName(nodeName)))
framework.ExpectNoError(waitForVSphereDiskToDetach(c, vsp, volumePath, types.NodeName(nodeName)))
}
}

View File

@ -295,16 +295,16 @@ func invokeValidPolicyTest(f *framework.Framework, client clientset.Interface, n
pod, err := framework.CreatePod(client, namespace, nil, pvclaims, false, "")
Expect(err).NotTo(HaveOccurred())
vsp, err := vsphere.GetVSphere()
vsp, err := getVSphere(client)
Expect(err).NotTo(HaveOccurred())
By("Verify the volume is accessible and available in the pod")
verifyVSphereVolumesAccessible(pod, persistentvolumes, vsp)
verifyVSphereVolumesAccessible(client, pod, persistentvolumes, vsp)
By("Deleting pod")
framework.DeletePodWithWait(f, client, pod)
By("Waiting for volumes to be detached from the node")
waitForVSphereDiskToDetach(vsp, persistentvolumes[0].Spec.VsphereVolume.VolumePath, k8stype.NodeName(pod.Spec.NodeName))
waitForVSphereDiskToDetach(client, vsp, persistentvolumes[0].Spec.VsphereVolume.VolumePath, k8stype.NodeName(pod.Spec.NodeName))
}
func invokeInvalidPolicyTestNeg(client clientset.Interface, namespace string, scParameters map[string]string) error {

View File

@ -42,6 +42,7 @@ go_library(
"//vendor/k8s.io/api/core/v1:go_default_library",
"//vendor/k8s.io/api/extensions/v1beta1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/intstr:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/wait:go_default_library",
"//vendor/k8s.io/client-go/informers:go_default_library",
"//vendor/k8s.io/client-go/kubernetes:go_default_library",

View File

@ -876,3 +876,195 @@ func TestOverlappingDeployments(t *testing.T) {
}
}
}
// Deployment rollout should not be blocked when the replica count and the pod template are updated at the same time.
func TestScaledRolloutDeployment(t *testing.T) {
s, closeFn, rm, dc, informers, c := dcSetup(t)
defer closeFn()
name := "test-scaled-rollout-deployment"
ns := framework.CreateTestingNamespace(name, s, t)
defer framework.DeleteTestingNamespace(ns, s, t)
stopCh := make(chan struct{})
defer close(stopCh)
informers.Start(stopCh)
go rm.Run(5, stopCh)
go dc.Run(5, stopCh)
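// rm and dc are the replicaset and deployment controllers; run each with 5 workers.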
// Create a deployment with rolling update strategy, max surge = 3, and max unavailable = 2
var err error
replicas := int32(10)
tester := &deploymentTester{t: t, c: c, deployment: newDeployment(name, ns.Name, replicas)}
tester.deployment.Spec.Strategy.RollingUpdate.MaxSurge = intOrStrP(3)
tester.deployment.Spec.Strategy.RollingUpdate.MaxUnavailable = intOrStrP(2)
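// With 10 replicas, maxSurge=3 and maxUnavailable=2 allow at most 13 pods in total and require at least 8 of them to stay available during a rollout.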
tester.deployment, err = c.ExtensionsV1beta1().Deployments(ns.Name).Create(tester.deployment)
if err != nil {
t.Fatalf("failed to create deployment %q: %v", name, err)
}
if err = tester.waitForDeploymentRevisionAndImage("1", fakeImage); err != nil {
t.Fatal(err)
}
if err = tester.waitForDeploymentCompleteAndMarkPodsReady(); err != nil {
t.Fatalf("deployment %q failed to complete: %v", name, err)
}
// Record current replicaset before starting new rollout
firstRS, err := tester.expectNewReplicaSet()
if err != nil {
t.Fatal(err)
}
// Update the deployment with another new image, but do not mark the pods as ready, so the new replicaset is blocked from completing
fakeImage2 := "fakeimage2"
tester.deployment, err = tester.updateDeployment(func(update *v1beta1.Deployment) {
update.Spec.Template.Spec.Containers[0].Image = fakeImage2
})
if err != nil {
t.Fatalf("failed updating deployment %q: %v", name, err)
}
if err = tester.waitForDeploymentRevisionAndImage("2", fakeImage2); err != nil {
t.Fatal(err)
}
// Verify the deployment has minimum available replicas after 2nd rollout
tester.deployment, err = c.ExtensionsV1beta1().Deployments(ns.Name).Get(name, metav1.GetOptions{})
if err != nil {
t.Fatalf("failed to get deployment %q: %v", name, err)
}
minAvailableReplicas := deploymentutil.MinAvailable(tester.deployment)
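// MinAvailable is the desired replica count minus maxUnavailable; availability should never drop below it during a rollout.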
if tester.deployment.Status.AvailableReplicas < minAvailableReplicas {
t.Fatalf("deployment %q does not have minimum number of available replicas after 2nd rollout", name)
}
// Wait for old replicaset of 1st rollout to have desired replicas
firstRS, err = c.ExtensionsV1beta1().ReplicaSets(ns.Name).Get(firstRS.Name, metav1.GetOptions{})
if err != nil {
t.Fatalf("failed to get replicaset %q: %v", firstRS.Name, err)
}
if err = tester.waitRSStable(firstRS); err != nil {
t.Fatal(err)
}
// Wait for new replicaset of 2nd rollout to have desired replicas
secondRS, err := tester.expectNewReplicaSet()
if err != nil {
t.Fatal(err)
}
if err = tester.waitRSStable(secondRS); err != nil {
t.Fatal(err)
}
// Scale up the deployment and update its image to another new image simultaneously (this time, mark all pods as ready)
newReplicas := int32(20)
fakeImage3 := "fakeimage3"
tester.deployment, err = tester.updateDeployment(func(update *v1beta1.Deployment) {
update.Spec.Replicas = &newReplicas
update.Spec.Template.Spec.Containers[0].Image = fakeImage3
})
if err != nil {
t.Fatalf("failed updating deployment %q: %v", name, err)
}
if err = tester.waitForDeploymentRevisionAndImage("3", fakeImage3); err != nil {
t.Fatal(err)
}
if err = tester.waitForDeploymentCompleteAndMarkPodsReady(); err != nil {
t.Fatalf("deployment %q failed to complete: %v", name, err)
}
// Verify every replicaset has correct desiredReplicas annotation after 3rd rollout
thirdRS, err := deploymentutil.GetNewReplicaSet(tester.deployment, c.ExtensionsV1beta1())
if err != nil {
t.Fatalf("failed getting new revision 3 replicaset for deployment %q: %v", name, err)
}
rss := []*v1beta1.ReplicaSet{firstRS, secondRS, thirdRS}
for _, curRS := range rss {
curRS, err = c.ExtensionsV1beta1().ReplicaSets(ns.Name).Get(curRS.Name, metav1.GetOptions{})
if err != nil {
t.Fatalf("failed to get replicaset when checking desired replicas annotation: %v", err)
}
desired, ok := deploymentutil.GetDesiredReplicasAnnotation(curRS)
if !ok {
t.Fatalf("failed to retrieve desiredReplicas annotation for replicaset %q", curRS.Name)
}
if desired != *(tester.deployment.Spec.Replicas) {
t.Fatalf("unexpected desiredReplicas annotation for replicaset %q: expected %d, got %d", curRS.Name, *(tester.deployment.Spec.Replicas), desired)
}
}
// Update the deployment with another new image, but do not mark the pods as ready, so the new replicaset is blocked from completing
fakeImage4 := "fakeimage4"
tester.deployment, err = tester.updateDeployment(func(update *v1beta1.Deployment) {
update.Spec.Template.Spec.Containers[0].Image = fakeImage4
})
if err != nil {
t.Fatalf("failed updating deployment %q: %v", name, err)
}
if err = tester.waitForDeploymentRevisionAndImage("4", fakeImage4); err != nil {
t.Fatal(err)
}
// Verify the deployment has minimum available replicas after 4th rollout
tester.deployment, err = c.ExtensionsV1beta1().Deployments(ns.Name).Get(name, metav1.GetOptions{})
if err != nil {
t.Fatalf("failed to get deployment %q: %v", name, err)
}
minAvailableReplicas = deploymentutil.MinAvailable(tester.deployment)
if tester.deployment.Status.AvailableReplicas < minAvailableReplicas {
t.Fatalf("deployment %q does not have minimum number of available replicas after 4th rollout", name)
}
// Wait for old replicaset of 3rd rollout to have desired replicas
thirdRS, err = c.ExtensionsV1beta1().ReplicaSets(ns.Name).Get(thirdRS.Name, metav1.GetOptions{})
if err != nil {
t.Fatalf("failed to get replicaset %q: %v", thirdRS.Name, err)
}
if err = tester.waitRSStable(thirdRS); err != nil {
t.Fatal(err)
}
// Wait for new replicaset of 4th rollout to have desired replicas
fourthRS, err := tester.expectNewReplicaSet()
if err != nil {
t.Fatal(err)
}
if err = tester.waitRSStable(fourthRS); err != nil {
t.Fatal(err)
}
// Scale down the deployment and update its image to another new image simultaneously (this time, mark all pods as ready)
newReplicas = int32(5)
fakeImage5 := "fakeimage5"
tester.deployment, err = tester.updateDeployment(func(update *v1beta1.Deployment) {
update.Spec.Replicas = &newReplicas
update.Spec.Template.Spec.Containers[0].Image = fakeImage5
})
if err != nil {
t.Fatalf("failed updating deployment %q: %v", name, err)
}
if err = tester.waitForDeploymentRevisionAndImage("5", fakeImage5); err != nil {
t.Fatal(err)
}
if err = tester.waitForDeploymentCompleteAndMarkPodsReady(); err != nil {
t.Fatalf("deployment %q failed to complete: %v", name, err)
}
// Verify every replicaset has correct desiredReplicas annotation after 5th rollout
fifthRS, err := deploymentutil.GetNewReplicaSet(tester.deployment, c.ExtensionsV1beta1())
if err != nil {
t.Fatalf("failed getting new revision 5 replicaset for deployment %q: %v", name, err)
}
rss = []*v1beta1.ReplicaSet{thirdRS, fourthRS, fifthRS}
for _, curRS := range rss {
curRS, err = c.ExtensionsV1beta1().ReplicaSets(ns.Name).Get(curRS.Name, metav1.GetOptions{})
if err != nil {
t.Fatalf("failed to get replicaset when checking desired replicas annotation: %v", err)
}
desired, ok := deploymentutil.GetDesiredReplicasAnnotation(curRS)
if !ok {
t.Fatalf("failed to retrieve desiredReplicas annotation for replicaset %q", curRS.Name)
}
if desired != *(tester.deployment.Spec.Replicas) {
t.Fatalf("unexpected desiredReplicas annotation for replicaset %q: expected %d, got %d", curRS.Name, *(tester.deployment.Spec.Replicas), desired)
}
}
}

View File

@ -26,6 +26,7 @@ import (
"k8s.io/api/core/v1"
"k8s.io/api/extensions/v1beta1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/informers"
clientset "k8s.io/client-go/kubernetes"
@ -80,7 +81,8 @@ func newDeployment(name, ns string, replicas int32) *v1beta1.Deployment {
Replicas: &replicas,
Selector: &metav1.LabelSelector{MatchLabels: testLabels()},
Strategy: v1beta1.DeploymentStrategy{
Type: v1beta1.RollingUpdateDeploymentStrategyType,
Type: v1beta1.RollingUpdateDeploymentStrategyType,
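// Initialize RollingUpdate so tests can set MaxSurge/MaxUnavailable directly without a nil-pointer panic.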
RollingUpdate: new(v1beta1.RollingUpdateDeployment),
},
Template: v1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
@ -212,6 +214,11 @@ func markPodReady(c clientset.Interface, ns string, pod *v1.Pod) error {
return err
}
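// intOrStrP converts an int into a *intstr.IntOrString, as required by the MaxSurge and MaxUnavailable fields.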
func intOrStrP(num int) *intstr.IntOrString {
v := intstr.FromInt(num)
return &v
}
// markUpdatedPodsReady manually marks the status of updated Deployment pods as ready,
// until the deployment is complete
func (d *deploymentTester) markUpdatedPodsReady(wg *sync.WaitGroup) {
@ -405,3 +412,7 @@ func (d *deploymentTester) listUpdatedPods() ([]v1.Pod, error) {
}
return ownedPods, nil
}
func (d *deploymentTester) waitRSStable(replicaset *v1beta1.ReplicaSet) error {
return testutil.WaitRSStable(d.t, d.c, replicaset, pollInterval, pollTimeout)
}

View File

@ -18,6 +18,7 @@ go_test(
"//pkg/api/v1/pod:go_default_library",
"//pkg/controller/replicaset:go_default_library",
"//test/integration/framework:go_default_library",
"//test/utils:go_default_library",
"//vendor/k8s.io/api/core/v1:go_default_library",
"//vendor/k8s.io/api/extensions/v1beta1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/api/errors:go_default_library",

View File

@ -41,6 +41,7 @@ import (
podutil "k8s.io/kubernetes/pkg/api/v1/pod"
"k8s.io/kubernetes/pkg/controller/replicaset"
"k8s.io/kubernetes/test/integration/framework"
testutil "k8s.io/kubernetes/test/utils"
)
const (
@ -217,15 +218,8 @@ func createRSsPods(t *testing.T, clientSet clientset.Interface, rss []*v1beta1.R
// Verify .Status.Replicas is equal to .Spec.Replicas
func waitRSStable(t *testing.T, clientSet clientset.Interface, rs *v1beta1.ReplicaSet) {
rsClient := clientSet.Extensions().ReplicaSets(rs.Namespace)
if err := wait.PollImmediate(interval, timeout, func() (bool, error) {
newRS, err := rsClient.Get(rs.Name, metav1.GetOptions{})
if err != nil {
return false, err
}
return newRS.Status.Replicas == *rs.Spec.Replicas, nil
}); err != nil {
t.Fatalf("Failed to verify .Status.Replicas is equal to .Spec.Replicas for rs %s: %v", rs.Name, err)
if err := testutil.WaitRSStable(t, clientSet, rs, interval, timeout); err != nil {
t.Fatal(err)
}
}

View File

@ -18,6 +18,7 @@ package utils
import (
"fmt"
"testing"
"time"
extensions "k8s.io/api/extensions/v1beta1"
@ -50,3 +51,18 @@ func UpdateReplicaSetWithRetries(c clientset.Interface, namespace, name string,
}
return rs, pollErr
}
// WaitRSStable waits until the ReplicaSet controller has observed the latest spec generation and .Status.Replicas equals .Spec.Replicas
func WaitRSStable(t *testing.T, clientSet clientset.Interface, rs *extensions.ReplicaSet, pollInterval, pollTimeout time.Duration) error {
desiredGeneration := rs.Generation
if err := wait.PollImmediate(pollInterval, pollTimeout, func() (bool, error) {
newRS, err := clientSet.ExtensionsV1beta1().ReplicaSets(rs.Namespace).Get(rs.Name, metav1.GetOptions{})
if err != nil {
return false, err
}
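// Stable once the controller has observed the latest spec generation and the replica count matches the spec.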
return newRS.Status.ObservedGeneration >= desiredGeneration && newRS.Status.Replicas == *rs.Spec.Replicas, nil
}); err != nil {
return fmt.Errorf("failed to verify .Status.Replicas is equal to .Spec.Replicas for replicaset %q: %v", rs.Name, err)
}
return nil
}