Merge pull request #49640 from jsafrane/systemd-mount-service

Automatic merge from submit-queue Run mount in its own systemd scope. Kubelet needs to run /bin/mount in its own cgroup. - When kubelet runs as a systemd service, "systemctl restart kubelet" may kill all processes in the same cgroup and thus terminate fuse daemons that are needed for gluster and cephfs mounts. - When kubelet runs in a docker container, restart of the container kills all fuse daemons started in the container. Killing fuse daemons is bad, it basically unmounts volumes from running pods. This patch runs mount via "systemd-run --scope /bin/mount ...", which makes sure that any fuse daemons are forked in its own systemd scope (= cgroup) and they will survive restart of kubelet's systemd service or docker container. This helps with #34965 As a downside, each new fuse daemon will run in its own transient systemd service and systemctl output may be cluttered. @kubernetes/sig-storage-pr-reviews @kubernetes/sig-node-pr-reviews ```release-note fuse daemons for GlusterFS and CephFS are now run in their own systemd scope when Kubernetes runs on a system with systemd. ```
2017-08-09 12:05:01 -07:00 · 2017-08-09 12:05:01 -07:00 · 68ac78ae45
parent 190ee708a6 dd03384747
commit 68ac78ae45
4 changed files with 135 additions and 25 deletions
--- a/pkg/util/mount/mount.go
+++ b/pkg/util/mount/mount.go
@ -107,15 +107,6 @@ func (mounter *SafeFormatAndMount) FormatAndMount(source string, target string,
 	return mounter.formatAndMount(source, target, fstype, options)
 }

-// New returns a mount.Interface for the current system.
-// It provides options to override the default mounter behavior.
-// mounterPath allows using an alternative to `/bin/mount` for mounting.
-func New(mounterPath string) Interface {
-	return &Mounter{
-		mounterPath: mounterPath,
-	}
-}
-
 // GetMountRefs finds all other references to the device referenced
 // by mountPath; returns a list of paths.
 func GetMountRefs(mounter Interface, mountPath string) ([]string, error) {
--- a/pkg/util/mount/mount_linux.go
+++ b/pkg/util/mount/mount_linux.go
@ -56,6 +56,17 @@ const (
 // kubelet is running in the host's root mount namespace.
 type Mounter struct {
 	mounterPath string
+	withSystemd bool
+}
+
+// New returns a mount.Interface for the current system.
+// It provides options to override the default mounter behavior.
+// mounterPath allows using an alternative to `/bin/mount` for mounting.
+func New(mounterPath string) Interface {
+	return &Mounter{
+		mounterPath: mounterPath,
+		withSystemd: detectSystemd(),
+	}
 }

 // Mount mounts source to target as fstype with given options. 'source' and 'fstype' must
@ -69,18 +80,18 @@ func (mounter *Mounter) Mount(source string, target string, fstype string, optio
 	mounterPath := ""
 	bind, bindRemountOpts := isBind(options)
 	if bind {
-		err := doMount(mounterPath, defaultMountCommand, source, target, fstype, []string{"bind"})
+		err := mounter.doMount(mounterPath, defaultMountCommand, source, target, fstype, []string{"bind"})
 		if err != nil {
 			return err
 		}
-		return doMount(mounterPath, defaultMountCommand, source, target, fstype, bindRemountOpts)
+		return mounter.doMount(mounterPath, defaultMountCommand, source, target, fstype, bindRemountOpts)
 	}
 	// The list of filesystems that require containerized mounter on GCI image cluster
 	fsTypesNeedMounter := sets.NewString("nfs", "glusterfs", "ceph", "cifs")
 	if fsTypesNeedMounter.Has(fstype) {
 		mounterPath = mounter.mounterPath
 	}
-	return doMount(mounterPath, defaultMountCommand, source, target, fstype, options)
+	return mounter.doMount(mounterPath, defaultMountCommand, source, target, fstype, options)
 }

 // isBind detects whether a bind mount is being requested and makes the remount options to
@ -109,24 +120,80 @@ func isBind(options []string) (bool, []string) {
 }

 // doMount runs the mount command. mounterPath is the path to mounter binary if containerized mounter is used.
-func doMount(mounterPath string, mountCmd string, source string, target string, fstype string, options []string) error {
+func (m *Mounter) doMount(mounterPath string, mountCmd string, source string, target string, fstype string, options []string) error {
 	mountArgs := makeMountArgs(source, target, fstype, options)
 	if len(mounterPath) > 0 {
 		mountArgs = append([]string{mountCmd}, mountArgs...)
 		mountCmd = mounterPath
 	}

+	if m.withSystemd {
+		// Try to run mount via systemd-run --scope. This will escape the
+		// service where kubelet runs and any fuse daemons will be started in a
+		// specific scope. The kubelet service can then be restarted without killing
+		// these fuse daemons.
+		//
+		// Complete command line (when mounterPath is not used):
+		// systemd-run --description=... --scope -- mount -t <type> <what> <where>
+		//
+		// Expected flow:
+		// * systemd-run creates a transient scope (=~ cgroup) and executes its
+		//   argument (/bin/mount) there.
+		// * mount does its job, forks a fuse daemon if necessary and finishes.
+		//   (systemd-run --scope finishes at this point, returning mount's exit
+		//   code and stdout/stderr - that's one of the --scope benefits).
+		// * systemd keeps the fuse daemon running in the scope (i.e. in its own
+		//   cgroup) until the fuse daemon dies (another --scope benefit).
+		//   Kubelet service can be restarted and the fuse daemon survives.
+		// * When the fuse daemon dies (e.g. during unmount) systemd removes the
+		//   scope automatically.
+		//
+		// systemd-mount is not used because it's too new for older distros
+		// (CentOS 7, Debian Jessie).
+		mountCmd, mountArgs = addSystemdScope("systemd-run", target, mountCmd, mountArgs)
+	} else {
+		// No systemd-run on the host (or we failed to check it), assume kubelet
+		// does not run as a systemd service.
+		// No code here, mountCmd and mountArgs are already populated.
+	}
+
 	glog.V(4).Infof("Mounting cmd (%s) with arguments (%s)", mountCmd, mountArgs)
 	command := exec.Command(mountCmd, mountArgs...)
 	output, err := command.CombinedOutput()
 	if err != nil {
-		glog.Errorf("Mount failed: %v\nMounting command: %s\nMounting arguments: %s %s %s %v\nOutput: %s\n", err, mountCmd, source, target, fstype, options, string(output))
-		return fmt.Errorf("mount failed: %v\nMounting command: %s\nMounting arguments: %s %s %s %v\nOutput: %s\n",
-			err, mountCmd, source, target, fstype, options, string(output))
+		args := strings.Join(mountArgs, " ")
+		glog.Errorf("Mount failed: %v\nMounting command: %s\nMounting arguments: %s\nOutput: %s\n", err, mountCmd, args, string(output))
+		return fmt.Errorf("mount failed: %v\nMounting command: %s\nMounting arguments: %s\nOutput: %s\n",
+			err, mountCmd, args, string(output))
 	}
 	return err
 }

+// detectSystemd returns true if OS runs with systemd as init. When not sure
+// (permission errors, ...), it returns false.
+// There may be different ways to detect systemd; this one makes sure that
+// systemd-run (needed by Mount()) works.
+func detectSystemd() bool {
+	if _, err := exec.LookPath("systemd-run"); err != nil {
+		glog.V(2).Infof("Detected OS without systemd")
+		return false
+	}
+	// Try to run systemd-run --scope /bin/true, that should be enough
+	// to make sure that systemd is really running and not just installed,
+	// which happens when running in a container with a systemd-based image
+	// but with different pid 1.
+	cmd := exec.Command("systemd-run", "--description=Kubernetes systemd probe", "--scope", "true")
+	output, err := cmd.CombinedOutput()
+	if err != nil {
+		glog.V(2).Infof("Cannot run systemd-run, assuming non-systemd OS")
+		glog.V(4).Infof("systemd-run failed with: %v", err)
+		glog.V(4).Infof("systemd-run output: %s", string(output))
+		return false
+	}
+	glog.V(2).Infof("Detected OS with systemd")
+	return true
+}
+
 // makeMountArgs makes the arguments to the mount(8) command.
 func makeMountArgs(source, target, fstype string, options []string) []string {
 	// Build mount command as follows:
@ -146,6 +213,13 @@ func makeMountArgs(source, target, fstype string, options []string) []string {
 	return mountArgs
 }

+// addSystemdScope adds "systemd-run --scope" to the given command line.
+func addSystemdScope(systemdRunPath, mountName, command string, args []string) (string, []string) {
+	descriptionArg := fmt.Sprintf("--description=Kubernetes transient mount for %s", mountName)
+	systemdRunArgs := []string{descriptionArg, "--scope", "--", command}
+	return systemdRunPath, append(systemdRunArgs, args...)
+}
+
 // Unmount unmounts the target.
 func (mounter *Mounter) Unmount(target string) error {
 	glog.V(4).Infof("Unmounting %s", target)
--- a/pkg/util/mount/mount_unsupported.go
+++ b/pkg/util/mount/mount_unsupported.go
@ -22,6 +22,15 @@ type Mounter struct {
 	mounterPath string
 }

+// New returns a mount.Interface for the current system.
+// It provides options to override the default mounter behavior.
+// mounterPath allows using an alternative to `/bin/mount` for mounting.
+func New(mounterPath string) Interface {
+	return &Mounter{
+		mounterPath: mounterPath,
+	}
+}
+
 func (mounter *Mounter) Mount(source string, target string, fstype string, options []string) error {
 	return nil
 }
--- a/pkg/util/mount/nsenter_mount.go
+++ b/pkg/util/mount/nsenter_mount.go
@ -51,7 +51,7 @@ import (
 //     contents. TODO: remove this requirement.
 // 6.  The host image must have mount, findmnt, and umount binaries in /bin,
 //     /usr/sbin, or /usr/bin
-//
+// 7.  The host image should have systemd-run in /bin, /usr/sbin, or /usr/bin
 // For more information about mount propagation modes, see:
 //   https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt
 type NsenterMounter struct {
@ -62,9 +62,10 @@ type NsenterMounter struct {
 func NewNsenterMounter() *NsenterMounter {
 	m := &NsenterMounter{
 		paths: map[string]string{
-			"mount":   "",
-			"findmnt": "",
-			"umount":  "",
+			"mount":       "",
+			"findmnt":     "",
+			"umount":      "",
+			"systemd-run": "",
 		},
 	}
 	// search for the mount command in other locations besides /usr/bin
@ -80,6 +81,7 @@ func NewNsenterMounter() *NsenterMounter {
 			break
 		}
 		// TODO: error, so that the kubelet can stop if the mounts don't exist
+		// (don't forget that systemd-run is optional)
 	}
 	return m
 }
@ -128,15 +130,47 @@ func (n *NsenterMounter) doNsenterMount(source, target, fstype string, options [
 // makeNsenterArgs makes a list of argument to nsenter in order to do the
 // requested mount.
 func (n *NsenterMounter) makeNsenterArgs(source, target, fstype string, options []string) []string {
+	mountCmd := n.absHostPath("mount")
+	mountArgs := makeMountArgs(source, target, fstype, options)
+
+	if systemdRunPath, hasSystemd := n.paths["systemd-run"]; hasSystemd {
+		// Complete command line:
+		// nsenter --mount=/rootfs/proc/1/ns/mnt -- /bin/systemd-run --description=... --scope -- /bin/mount -t <type> <what> <where>
+		// Expected flow is:
+		// * nsenter breaks out of container's mount namespace and executes
+		//   host's systemd-run.
+		// * systemd-run creates a transient scope (=~ cgroup) and executes its
+		//   argument (/bin/mount) there.
+		// * mount does its job, forks a fuse daemon if necessary and finishes.
+		//   (systemd-run --scope finishes at this point, returning mount's exit
+		//   code and stdout/stderr - that's one of the --scope benefits).
+		// * systemd keeps the fuse daemon running in the scope (i.e. in its own
+		//   cgroup) until the fuse daemon dies (another --scope benefit).
+		//   Kubelet container can be restarted and the fuse daemon survives.
+		// * When the daemon dies (e.g. during unmount) systemd removes the
+		//   scope automatically.
+		mountCmd, mountArgs = addSystemdScope(systemdRunPath, target, mountCmd, mountArgs)
+	} else {
+		// Fall back to simple mount when the host has no systemd.
+		// Complete command line:
+		// nsenter --mount=/rootfs/proc/1/ns/mnt -- /bin/mount -t <type> <what> <where>
+		// Expected flow is:
+		// * nsenter breaks out of container's mount namespace and executes host's /bin/mount.
+		// * mount does its job, forks a fuse daemon if necessary and finishes.
+		// * Any fuse daemon runs in cgroup of kubelet docker container,
+		//   restart of kubelet container will kill it!
+
+		// No code here, mountCmd and mountArgs use /bin/mount
+	}
+
 	nsenterArgs := []string{
 		"--mount=/rootfs/proc/1/ns/mnt",
 		"--",
-		n.absHostPath("mount"),
+		mountCmd,
 	}
+	nsenterArgs = append(nsenterArgs, mountArgs...)

-	args := makeMountArgs(source, target, fstype, options)
-
-	return append(nsenterArgs, args...)
+	return nsenterArgs
 }

 // Unmount runs umount(8) in the host's mount namespace.
@ -147,7 +181,9 @@ func (n *NsenterMounter) Unmount(target string) error {
 		n.absHostPath("umount"),
 		target,
 	}
-
+	// No need to execute systemd-run here; it's enough that umount is executed
+	// in the host's mount namespace. It will terminate the appropriate fuse
+	// daemon(s) running in any scope.
 	glog.V(5).Infof("Unmount command: %v %v", nsenterPath, args)
 	exec := exec.New()
 	outputBytes, err := exec.Command(nsenterPath, args...).CombinedOutput()