k3s/vendor/github.com/rootless-containers/rootlesskit/pkg/child/child.go

314 lines
9.1 KiB
Go

package child
import (
"context"
"io/ioutil"
"os"
"os/exec"
"os/signal"
"runtime"
"strconv"
"syscall"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
"github.com/rootless-containers/rootlesskit/pkg/common"
"github.com/rootless-containers/rootlesskit/pkg/copyup"
"github.com/rootless-containers/rootlesskit/pkg/msgutil"
"github.com/rootless-containers/rootlesskit/pkg/network"
"github.com/rootless-containers/rootlesskit/pkg/port"
"github.com/rootless-containers/rootlesskit/pkg/sigproxy"
sigproxysignal "github.com/rootless-containers/rootlesskit/pkg/sigproxy/signal"
)
var propagationStates = map[string]uintptr{
"private": uintptr(unix.MS_PRIVATE),
"rprivate": uintptr(unix.MS_REC | unix.MS_PRIVATE),
"shared": uintptr(unix.MS_SHARED),
"rshared": uintptr(unix.MS_REC | unix.MS_SHARED),
"slave": uintptr(unix.MS_SLAVE),
"rslave": uintptr(unix.MS_REC | unix.MS_SLAVE),
}
func createCmd(targetCmd []string) (*exec.Cmd, error) {
var args []string
if len(targetCmd) > 1 {
args = targetCmd[1:]
}
cmd := exec.Command(targetCmd[0], args...)
cmd.Stdin = os.Stdin
cmd.Stdout = os.Stdout
cmd.Stderr = os.Stderr
cmd.Env = os.Environ()
cmd.SysProcAttr = &syscall.SysProcAttr{
Pdeathsig: syscall.SIGKILL,
}
return cmd, nil
}
// mountSysfs is needed for mounting /sys/class/net
// when netns is unshared.
func mountSysfs() error {
tmp, err := ioutil.TempDir("/tmp", "rksys")
if err != nil {
return errors.Wrap(err, "creating a directory under /tmp")
}
defer os.RemoveAll(tmp)
cgroupDir := "/sys/fs/cgroup"
if err := unix.Mount(cgroupDir, tmp, "", uintptr(unix.MS_BIND|unix.MS_REC), ""); err != nil {
return errors.Wrapf(err, "failed to create bind mount on %s", cgroupDir)
}
if err := unix.Mount("none", "/sys", "sysfs", 0, ""); err != nil {
// when the sysfs in the parent namespace is RO,
// we can't mount RW sysfs even in the child namespace.
// https://github.com/rootless-containers/rootlesskit/pull/23#issuecomment-429292632
// https://github.com/torvalds/linux/blob/9f203e2f2f065cd74553e6474f0ae3675f39fb0f/fs/namespace.c#L3326-L3328
logrus.Warnf("failed to mount sysfs, falling back to read-only mount: %v", err)
if err := unix.Mount("none", "/sys", "sysfs", uintptr(unix.MS_RDONLY), ""); err != nil {
// when /sys/firmware is masked, even RO sysfs can't be mounted
logrus.Warnf("failed to mount sysfs: %v", err)
}
}
if err := unix.Mount(tmp, cgroupDir, "", uintptr(unix.MS_MOVE), ""); err != nil {
return errors.Wrapf(err, "failed to move mount point from %s to %s", tmp, cgroupDir)
}
return nil
}
func mountProcfs() error {
if err := unix.Mount("none", "/proc", "proc", 0, ""); err != nil {
logrus.Warnf("failed to mount procfs, falling back to read-only mount: %v", err)
if err := unix.Mount("none", "/proc", "proc", uintptr(unix.MS_RDONLY), ""); err != nil {
logrus.Warnf("failed to mount procfs: %v", err)
}
}
return nil
}
func activateLoopback() error {
cmds := [][]string{
{"ip", "link", "set", "lo", "up"},
}
if err := common.Execs(os.Stderr, os.Environ(), cmds); err != nil {
return errors.Wrapf(err, "executing %v", cmds)
}
return nil
}
func activateDev(dev, ip string, netmask int, gateway string, mtu int) error {
cmds := [][]string{
{"ip", "link", "set", dev, "up"},
{"ip", "link", "set", "dev", dev, "mtu", strconv.Itoa(mtu)},
{"ip", "addr", "add", ip + "/" + strconv.Itoa(netmask), "dev", dev},
{"ip", "route", "add", "default", "via", gateway, "dev", dev},
}
if err := common.Execs(os.Stderr, os.Environ(), cmds); err != nil {
return errors.Wrapf(err, "executing %v", cmds)
}
return nil
}
func setupCopyDir(driver copyup.ChildDriver, dirs []string) (bool, error) {
if driver != nil {
etcWasCopied := false
copied, err := driver.CopyUp(dirs)
for _, d := range copied {
if d == "/etc" {
etcWasCopied = true
break
}
}
return etcWasCopied, err
}
if len(dirs) != 0 {
return false, errors.New("copy-up driver is not specified")
}
return false, nil
}
func setupNet(msg common.Message, etcWasCopied bool, driver network.ChildDriver) error {
// HostNetwork
if driver == nil {
return nil
}
// for /sys/class/net
if err := mountSysfs(); err != nil {
return err
}
if err := activateLoopback(); err != nil {
return err
}
dev, err := driver.ConfigureNetworkChild(&msg.Network)
if err != nil {
return err
}
if err := activateDev(dev, msg.Network.IP, msg.Network.Netmask, msg.Network.Gateway, msg.Network.MTU); err != nil {
return err
}
if etcWasCopied {
if err := writeResolvConf(msg.Network.DNS); err != nil {
return err
}
if err := writeEtcHosts(); err != nil {
return err
}
} else {
logrus.Warn("Mounting /etc/resolv.conf without copying-up /etc. " +
"Note that /etc/resolv.conf in the namespace will be unmounted when it is recreated on the host. " +
"Unless /etc/resolv.conf is statically configured, copying-up /etc is highly recommended. " +
"Please refer to RootlessKit documentation for further information.")
if err := mountResolvConf(msg.StateDir, msg.Network.DNS); err != nil {
return err
}
if err := mountEtcHosts(msg.StateDir); err != nil {
return err
}
}
return nil
}
type Opt struct {
PipeFDEnvKey string // needs to be set
TargetCmd []string // needs to be set
NetworkDriver network.ChildDriver // nil for HostNetwork
CopyUpDriver copyup.ChildDriver // cannot be nil if len(CopyUpDirs) != 0
CopyUpDirs []string
PortDriver port.ChildDriver
MountProcfs bool // needs to be set if (and only if) parent.Opt.CreatePIDNS is set
Propagation string // mount propagation type
Reaper bool
}
func Child(opt Opt) error {
if opt.PipeFDEnvKey == "" {
return errors.New("pipe FD env key is not set")
}
pipeFDStr := os.Getenv(opt.PipeFDEnvKey)
if pipeFDStr == "" {
return errors.Errorf("%s is not set", opt.PipeFDEnvKey)
}
pipeFD, err := strconv.Atoi(pipeFDStr)
if err != nil {
return errors.Wrapf(err, "unexpected fd value: %s", pipeFDStr)
}
pipeR := os.NewFile(uintptr(pipeFD), "")
var msg common.Message
if _, err := msgutil.UnmarshalFromReader(pipeR, &msg); err != nil {
return errors.Wrapf(err, "parsing message from fd %d", pipeFD)
}
logrus.Debugf("child: got msg from parent: %+v", msg)
if msg.Stage == 0 {
// the parent has configured the child's uid_map and gid_map, but the child doesn't have caps here.
// so we exec the child again to obtain caps.
// PID should be kept.
if err = syscall.Exec("/proc/self/exe", os.Args, os.Environ()); err != nil {
return err
}
panic("should not reach here")
}
if msg.Stage != 1 {
return errors.Errorf("expected stage 1, got stage %d", msg.Stage)
}
// The parent calls child with Pdeathsig, but it is cleared when newuidmap SUID binary is called
// https://github.com/rootless-containers/rootlesskit/issues/65#issuecomment-492343646
runtime.LockOSThread()
err = unix.Prctl(unix.PR_SET_PDEATHSIG, uintptr(unix.SIGKILL), 0, 0, 0)
runtime.UnlockOSThread()
if err != nil {
return err
}
os.Unsetenv(opt.PipeFDEnvKey)
if err := pipeR.Close(); err != nil {
return errors.Wrapf(err, "failed to close fd %d", pipeFD)
}
if msg.StateDir == "" {
return errors.New("got empty StateDir")
}
if err := setMountPropagation(opt.Propagation); err != nil {
return err
}
etcWasCopied, err := setupCopyDir(opt.CopyUpDriver, opt.CopyUpDirs)
if err != nil {
return err
}
if err := setupNet(msg, etcWasCopied, opt.NetworkDriver); err != nil {
return err
}
if opt.MountProcfs {
if err := mountProcfs(); err != nil {
return err
}
}
portQuitCh := make(chan struct{})
portErrCh := make(chan error)
if opt.PortDriver != nil {
go func() {
portErrCh <- opt.PortDriver.RunChildDriver(msg.Port.Opaque, portQuitCh)
}()
}
cmd, err := createCmd(opt.TargetCmd)
if err != nil {
return err
}
if opt.Reaper {
if err := runAndReap(cmd); err != nil {
return errors.Wrapf(err, "command %v exited", opt.TargetCmd)
}
} else {
if err := cmd.Start(); err != nil {
return errors.Wrapf(err, "command %v exited", opt.TargetCmd)
}
sigc := sigproxy.ForwardAllSignals(context.TODO(), cmd.Process.Pid)
defer sigproxysignal.StopCatch(sigc)
if err := cmd.Wait(); err != nil {
return errors.Wrapf(err, "command %v exited", opt.TargetCmd)
}
}
if opt.PortDriver != nil {
portQuitCh <- struct{}{}
return <-portErrCh
}
return nil
}
func setMountPropagation(propagation string) error {
flags, ok := propagationStates[propagation]
if ok {
if err := unix.Mount("none", "/", "", flags, ""); err != nil {
return errors.Wrapf(err, "failed to share mount point: /")
}
}
return nil
}
func runAndReap(cmd *exec.Cmd) error {
c := make(chan os.Signal, 32)
signal.Notify(c, syscall.SIGCHLD)
if err := cmd.Start(); err != nil {
return err
}
sigc := sigproxy.ForwardAllSignals(context.TODO(), cmd.Process.Pid)
defer sigproxysignal.StopCatch(sigc)
result := make(chan error)
go func() {
defer close(result)
for range c {
for {
if pid, err := syscall.Wait4(-1, nil, syscall.WNOHANG, nil); err != nil || pid <= 0 {
break
} else {
if pid == cmd.Process.Pid {
result <- cmd.Wait()
}
}
}
}
}()
return <-result
}