Retry attaching multipath iSCSI volumes

Don't mount single path instead of multipath volumes and always wait until
at least 2 paths are available. Try up to 3 times to get all paths. Try 5
times to get at last 2 paths.
pull/58/head
Jan Safranek 2018-10-08 13:00:24 +02:00
parent a29b093a56
commit 362be3a58b
1 changed files with 129 additions and 85 deletions

View File

@ -36,6 +36,23 @@ import (
volumeutil "k8s.io/kubernetes/pkg/volume/util" volumeutil "k8s.io/kubernetes/pkg/volume/util"
) )
const (
// Minimum number of paths that the volume plugin considers enough when a multipath volume is requested.
minMultipathCount = 2
// Minimal number of attempts to attach all paths of a multipath volumes. If at least minMultipathCount paths
// are available after this nr. of attempts, the volume plugin continues with mounting the volume.
minAttachAttempts = 2
// Total number of attempts to attach at least minMultipathCount paths. If there are less than minMultipathCount,
// the volume plugin tries to attach the remaining paths at least this number of times in total. After
// maxAttachAttempts attempts, it mounts even a single path.
maxAttachAttempts = 5
// How many seconds to wait for a multipath device if at least two paths are available.
multipathDeviceTimeout = 10
)
var ( var (
chap_st = []string{ chap_st = []string{
"discovery.sendtargets.auth.username", "discovery.sendtargets.auth.username",
@ -268,7 +285,7 @@ func waitForMultiPathToExist(devicePaths []string, maxRetries int, deviceUtil vo
// AttachDisk returns devicePath of volume if attach succeeded otherwise returns error // AttachDisk returns devicePath of volume if attach succeeded otherwise returns error
func (util *ISCSIUtil) AttachDisk(b iscsiDiskMounter) (string, error) { func (util *ISCSIUtil) AttachDisk(b iscsiDiskMounter) (string, error) {
var devicePath string var devicePath string
var devicePaths []string devicePaths := map[string]string{}
var iscsiTransport string var iscsiTransport string
var lastErr error var lastErr error
@ -308,118 +325,145 @@ func (util *ISCSIUtil) AttachDisk(b iscsiDiskMounter) (string, error) {
} }
glog.V(4).Infof("AttachDisk portal->host map for %s is %v", b.Iqn, portalHostMap) glog.V(4).Infof("AttachDisk portal->host map for %s is %v", b.Iqn, portalHostMap)
for _, tp := range bkpPortal { for i := 1; i <= maxAttachAttempts; i++ {
hostNumber, loggedIn := portalHostMap[tp] for _, tp := range bkpPortal {
if !loggedIn { if _, found := devicePaths[tp]; found {
glog.V(4).Infof("Could not get SCSI host number for portal %s, will attempt login", tp) glog.V(4).Infof("Device for portal %q already known", tp)
// build discoverydb and discover iscsi target
b.exec.Run("iscsiadm", "-m", "discoverydb", "-t", "sendtargets", "-p", tp, "-I", b.Iface, "-o", "new")
// update discoverydb with CHAP secret
err = updateISCSIDiscoverydb(b, tp)
if err != nil {
lastErr = fmt.Errorf("iscsi: failed to update discoverydb to portal %s error: %v", tp, err)
continue continue
} }
out, err = b.exec.Run("iscsiadm", "-m", "discoverydb", "-t", "sendtargets", "-p", tp, "-I", b.Iface, "--discover")
if err != nil {
// delete discoverydb record
b.exec.Run("iscsiadm", "-m", "discoverydb", "-t", "sendtargets", "-p", tp, "-I", b.Iface, "-o", "delete")
lastErr = fmt.Errorf("iscsi: failed to sendtargets to portal %s output: %s, err %v", tp, string(out), err)
continue
}
err = updateISCSINode(b, tp)
if err != nil {
// failure to update node db is rare. But deleting record will likely impact those who already start using it.
lastErr = fmt.Errorf("iscsi: failed to update iscsi node to portal %s error: %v", tp, err)
continue
}
// login to iscsi target
out, err = b.exec.Run("iscsiadm", "-m", "node", "-p", tp, "-T", b.Iqn, "-I", b.Iface, "--login")
if err != nil {
// delete the node record from database
b.exec.Run("iscsiadm", "-m", "node", "-p", tp, "-I", b.Iface, "-T", b.Iqn, "-o", "delete")
lastErr = fmt.Errorf("iscsi: failed to attach disk: Error: %s (%v)", string(out), err)
continue
}
// in case of node failure/restart, explicitly set to manual login so it doesn't hang on boot
out, err = b.exec.Run("iscsiadm", "-m", "node", "-p", tp, "-T", b.Iqn, "-o", "update", "-n", "node.startup", "-v", "manual")
if err != nil {
// don't fail if we can't set startup mode, but log warning so there is a clue
glog.Warningf("Warning: Failed to set iSCSI login mode to manual. Error: %v", err)
}
// Rebuild the host map after logging in hostNumber, loggedIn := portalHostMap[tp]
portalHostMap, err := b.deviceUtil.GetISCSIPortalHostMapForTarget(b.Iqn) if !loggedIn {
glog.V(4).Infof("Could not get SCSI host number for portal %s, will attempt login", tp)
// build discoverydb and discover iscsi target
b.exec.Run("iscsiadm", "-m", "discoverydb", "-t", "sendtargets", "-p", tp, "-I", b.Iface, "-o", "new")
// update discoverydb with CHAP secret
err = updateISCSIDiscoverydb(b, tp)
if err != nil {
lastErr = fmt.Errorf("iscsi: failed to update discoverydb to portal %s error: %v", tp, err)
continue
}
out, err = b.exec.Run("iscsiadm", "-m", "discoverydb", "-t", "sendtargets", "-p", tp, "-I", b.Iface, "--discover")
if err != nil {
// delete discoverydb record
b.exec.Run("iscsiadm", "-m", "discoverydb", "-t", "sendtargets", "-p", tp, "-I", b.Iface, "-o", "delete")
lastErr = fmt.Errorf("iscsi: failed to sendtargets to portal %s output: %s, err %v", tp, string(out), err)
continue
}
err = updateISCSINode(b, tp)
if err != nil {
// failure to update node db is rare. But deleting record will likely impact those who already start using it.
lastErr = fmt.Errorf("iscsi: failed to update iscsi node to portal %s error: %v", tp, err)
continue
}
// login to iscsi target
out, err = b.exec.Run("iscsiadm", "-m", "node", "-p", tp, "-T", b.Iqn, "-I", b.Iface, "--login")
if err != nil {
// delete the node record from database
b.exec.Run("iscsiadm", "-m", "node", "-p", tp, "-I", b.Iface, "-T", b.Iqn, "-o", "delete")
lastErr = fmt.Errorf("iscsi: failed to attach disk: Error: %s (%v)", string(out), err)
continue
}
// in case of node failure/restart, explicitly set to manual login so it doesn't hang on boot
out, err = b.exec.Run("iscsiadm", "-m", "node", "-p", tp, "-T", b.Iqn, "-o", "update", "-n", "node.startup", "-v", "manual")
if err != nil {
// don't fail if we can't set startup mode, but log warning so there is a clue
glog.Warningf("Warning: Failed to set iSCSI login mode to manual. Error: %v", err)
}
// Rebuild the host map after logging in
portalHostMap, err := b.deviceUtil.GetISCSIPortalHostMapForTarget(b.Iqn)
if err != nil {
return "", err
}
glog.V(6).Infof("AttachDisk portal->host map for %s is %v", b.Iqn, portalHostMap)
hostNumber, loggedIn = portalHostMap[tp]
if !loggedIn {
glog.Warningf("Could not get SCSI host number for portal %s after logging in", tp)
continue
}
}
glog.V(5).Infof("AttachDisk: scanning SCSI host %d LUN %s", hostNumber, b.Lun)
lunNumber, err := strconv.Atoi(b.Lun)
if err != nil {
return "", fmt.Errorf("AttachDisk: lun is not a number: %s\nError: %v", b.Lun, err)
}
// Scan the iSCSI bus for the LUN
err = scanOneLun(hostNumber, lunNumber)
if err != nil { if err != nil {
return "", err return "", err
} }
glog.V(6).Infof("AttachDisk portal->host map for %s is %v", b.Iqn, portalHostMap)
hostNumber, loggedIn = portalHostMap[tp] if iscsiTransport == "" {
if !loggedIn { glog.Errorf("iscsi: could not find transport name in iface %s", b.Iface)
glog.Warningf("Could not get SCSI host number for portal %s after logging in", tp) return "", fmt.Errorf("Could not parse iface file for %s", b.Iface)
}
if iscsiTransport == "tcp" {
devicePath = strings.Join([]string{"/dev/disk/by-path/ip", tp, "iscsi", b.Iqn, "lun", b.Lun}, "-")
} else {
devicePath = strings.Join([]string{"/dev/disk/by-path/pci", "*", "ip", tp, "iscsi", b.Iqn, "lun", b.Lun}, "-")
}
if exist := waitForPathToExist(&devicePath, multipathDeviceTimeout, iscsiTransport); !exist {
glog.Errorf("Could not attach disk: Timeout after 10s")
// update last error
lastErr = fmt.Errorf("Could not attach disk: Timeout after 10s")
continue continue
} else {
devicePaths[tp] = devicePath
} }
} }
glog.V(4).Infof("iscsi: tried all devices for %q %d times, %d paths found", b.Iqn, i, len(devicePaths))
glog.V(5).Infof("AttachDisk: scanning SCSI host %d LUN %s", hostNumber, b.Lun) if len(devicePaths) == 0 {
lunNumber, err := strconv.Atoi(b.Lun) // No path attached, report error and stop trying. kubelet will try again in a short while
if err != nil { // delete cloned iface
return "", fmt.Errorf("AttachDisk: lun is not a number: %s\nError: %v", b.Lun, err) b.exec.Run("iscsiadm", "-m", "iface", "-I", b.Iface, "-o", "delete")
glog.Errorf("iscsi: failed to get any path for iscsi disk, last err seen:\n%v", lastErr)
return "", fmt.Errorf("failed to get any path for iscsi disk, last err seen:\n%v", lastErr)
} }
if len(devicePaths) == len(bkpPortal) {
// Scan the iSCSI bus for the LUN // We have all paths
err = scanOneLun(hostNumber, lunNumber) glog.V(4).Infof("iscsi: all devices for %q found", b.Iqn)
if err != nil { break
return "", err
} }
if len(devicePaths) >= minMultipathCount && i >= minAttachAttempts {
if iscsiTransport == "" { // We have at least two paths for multipath and we tried the other paths long enough
glog.Errorf("iscsi: could not find transport name in iface %s", b.Iface) glog.V(4).Infof("%d devices found for %q", len(devicePaths), b.Iqn)
return "", fmt.Errorf("Could not parse iface file for %s", b.Iface) break
}
if iscsiTransport == "tcp" {
devicePath = strings.Join([]string{"/dev/disk/by-path/ip", tp, "iscsi", b.Iqn, "lun", b.Lun}, "-")
} else {
devicePath = strings.Join([]string{"/dev/disk/by-path/pci", "*", "ip", tp, "iscsi", b.Iqn, "lun", b.Lun}, "-")
}
if exist := waitForPathToExist(&devicePath, 10, iscsiTransport); !exist {
glog.Errorf("Could not attach disk: Timeout after 10s")
// update last error
lastErr = fmt.Errorf("Could not attach disk: Timeout after 10s")
continue
} else {
devicePaths = append(devicePaths, devicePath)
} }
} }
if len(devicePaths) == 0 {
// delete cloned iface
b.exec.Run("iscsiadm", "-m", "iface", "-I", b.Iface, "-o", "delete")
glog.Errorf("iscsi: failed to get any path for iscsi disk, last err seen:\n%v", lastErr)
return "", fmt.Errorf("failed to get any path for iscsi disk, last err seen:\n%v", lastErr)
}
if lastErr != nil { if lastErr != nil {
glog.Errorf("iscsi: last error occurred during iscsi init:\n%v", lastErr) glog.Errorf("iscsi: last error occurred during iscsi init:\n%v", lastErr)
} }
devicePathList := []string{}
for _, path := range devicePaths {
devicePathList = append(devicePathList, path)
}
// Try to find a multipath device for the volume // Try to find a multipath device for the volume
if 1 < len(bkpPortal) { if len(bkpPortal) > 1 {
// If the PV has 2 or more portals, wait up to 10 seconds for the multipath // Multipath volume was requested. Wait up to 10 seconds for the multipath device to appear.
// device to appear devicePath = waitForMultiPathToExist(devicePathList, 10, b.deviceUtil)
devicePath = waitForMultiPathToExist(devicePaths, 10, b.deviceUtil)
} else { } else {
// For PVs with 1 portal, just try one time to find the multipath device. This // For PVs with 1 portal, just try one time to find the multipath device. This
// avoids a long pause when the multipath device will never get created, and // avoids a long pause when the multipath device will never get created, and
// matches legacy behavior. // matches legacy behavior.
devicePath = waitForMultiPathToExist(devicePaths, 1, b.deviceUtil) devicePath = waitForMultiPathToExist(devicePathList, 1, b.deviceUtil)
} }
// When no multipath device is found, just use the first (and presumably only) device // When no multipath device is found, just use the first (and presumably only) device
if devicePath == "" { if devicePath == "" {
devicePath = devicePaths[0] devicePath = devicePathList[0]
} }
glog.V(5).Infof("iscsi: AttachDisk devicePath: %s", devicePath) glog.V(5).Infof("iscsi: AttachDisk devicePath: %s", devicePath)