mirror of https://github.com/k3s-io/k3s
405 lines
11 KiB
Go
405 lines
11 KiB
Go
package parent
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"io/ioutil"
|
|
"net"
|
|
"net/http"
|
|
"os"
|
|
"os/exec"
|
|
"os/user"
|
|
"path/filepath"
|
|
"strconv"
|
|
"syscall"
|
|
|
|
"github.com/gofrs/flock"
|
|
"github.com/gorilla/mux"
|
|
"github.com/pkg/errors"
|
|
"github.com/sirupsen/logrus"
|
|
"golang.org/x/sys/unix"
|
|
|
|
"github.com/rootless-containers/rootlesskit/pkg/api/router"
|
|
"github.com/rootless-containers/rootlesskit/pkg/common"
|
|
"github.com/rootless-containers/rootlesskit/pkg/msgutil"
|
|
"github.com/rootless-containers/rootlesskit/pkg/network"
|
|
"github.com/rootless-containers/rootlesskit/pkg/parent/cgrouputil"
|
|
"github.com/rootless-containers/rootlesskit/pkg/parent/idtools"
|
|
"github.com/rootless-containers/rootlesskit/pkg/port"
|
|
"github.com/rootless-containers/rootlesskit/pkg/sigproxy"
|
|
"github.com/rootless-containers/rootlesskit/pkg/sigproxy/signal"
|
|
)
|
|
|
|
type Opt struct {
|
|
PipeFDEnvKey string // needs to be set
|
|
StateDir string // directory needs to be precreated
|
|
StateDirEnvKey string // optional env key to propagate StateDir value
|
|
NetworkDriver network.ParentDriver // nil for HostNetwork
|
|
PortDriver port.ParentDriver // nil for --port-driver=none
|
|
PublishPorts []port.Spec
|
|
CreatePIDNS bool
|
|
CreateCgroupNS bool
|
|
CreateUTSNS bool
|
|
CreateIPCNS bool
|
|
ParentEUIDEnvKey string // optional env key to propagate geteuid() value
|
|
ParentEGIDEnvKey string // optional env key to propagate getegid() value
|
|
Propagation string
|
|
EvacuateCgroup2 string // e.g. "rootlesskit_evacuation"
|
|
}
|
|
|
|
// Documented state files. Undocumented ones are subject to change.
|
|
const (
|
|
StateFileLock = "lock"
|
|
StateFileChildPID = "child_pid" // decimal pid number text
|
|
StateFileAPISock = "api.sock" // REST API Socket
|
|
)
|
|
|
|
func checkPreflight(opt Opt) error {
|
|
if opt.PipeFDEnvKey == "" {
|
|
return errors.New("pipe FD env key is not set")
|
|
}
|
|
if opt.StateDir == "" {
|
|
return errors.New("state dir is not set")
|
|
}
|
|
if !filepath.IsAbs(opt.StateDir) {
|
|
return errors.New("state dir must be absolute")
|
|
}
|
|
if stat, err := os.Stat(opt.StateDir); err != nil || !stat.IsDir() {
|
|
return errors.Wrap(err, "state dir is inaccessible")
|
|
}
|
|
|
|
if os.Geteuid() == 0 {
|
|
logrus.Warn("Running RootlessKit as the root user is unsupported.")
|
|
}
|
|
|
|
warnSysctl()
|
|
|
|
// invalid propagation doesn't result in an error
|
|
warnPropagation(opt.Propagation)
|
|
return nil
|
|
}
|
|
|
|
// createCleanupLock uses LOCK_SH for preventing automatic cleanup of
|
|
// "/tmp/<Our State Dir>" caused by by systemd.
|
|
//
|
|
// This LOCK_SH lock is different from our lock file in the state dir.
|
|
// We could unify the lock file into LOCK_SH, but we are still keeping
|
|
// the lock file for a historical reason.
|
|
//
|
|
// See:
|
|
// - https://github.com/rootless-containers/rootlesskit/issues/185
|
|
// - https://github.com/rootless-containers/rootlesskit/pull/188
|
|
func createCleanupLock(sDir string) error {
|
|
//lock state dir when using /tmp/ path
|
|
stateDir, err := os.Open(sDir)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
err = unix.Flock(int(stateDir.Fd()), unix.LOCK_SH)
|
|
if err != nil {
|
|
logrus.Warnf("Failed to lock the state dir %s", sDir)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// LockStateDir creates and locks "lock" file in the state dir.
|
|
func LockStateDir(stateDir string) (*flock.Flock, error) {
|
|
lockPath := filepath.Join(stateDir, StateFileLock)
|
|
lock := flock.New(lockPath)
|
|
locked, err := lock.TryLock()
|
|
if err != nil {
|
|
return nil, errors.Wrapf(err, "failed to lock %s", lockPath)
|
|
}
|
|
if !locked {
|
|
return nil, errors.Errorf("failed to lock %s, another RootlessKit is running with the same state directory?", lockPath)
|
|
}
|
|
return lock, nil
|
|
}
|
|
|
|
func Parent(opt Opt) error {
|
|
if err := checkPreflight(opt); err != nil {
|
|
return err
|
|
}
|
|
|
|
err := createCleanupLock(opt.StateDir)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
lock, err := LockStateDir(opt.StateDir)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer os.RemoveAll(opt.StateDir)
|
|
defer lock.Unlock()
|
|
|
|
pipeR, pipeW, err := os.Pipe()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
cmd := exec.Command("/proc/self/exe", os.Args[1:]...)
|
|
cmd.SysProcAttr = &syscall.SysProcAttr{
|
|
Pdeathsig: syscall.SIGKILL,
|
|
Cloneflags: syscall.CLONE_NEWUSER | syscall.CLONE_NEWNS,
|
|
}
|
|
if opt.NetworkDriver != nil {
|
|
cmd.SysProcAttr.Unshareflags |= syscall.CLONE_NEWNET
|
|
}
|
|
if opt.CreatePIDNS {
|
|
// cannot be Unshareflags (panics)
|
|
cmd.SysProcAttr.Cloneflags |= syscall.CLONE_NEWPID
|
|
}
|
|
if opt.CreateCgroupNS {
|
|
cmd.SysProcAttr.Unshareflags |= unix.CLONE_NEWCGROUP
|
|
}
|
|
if opt.CreateUTSNS {
|
|
cmd.SysProcAttr.Unshareflags |= unix.CLONE_NEWUTS
|
|
}
|
|
if opt.CreateIPCNS {
|
|
cmd.SysProcAttr.Unshareflags |= unix.CLONE_NEWIPC
|
|
}
|
|
cmd.Stdin = os.Stdin
|
|
cmd.Stdout = os.Stdout
|
|
cmd.Stderr = os.Stderr
|
|
cmd.ExtraFiles = []*os.File{pipeR}
|
|
cmd.Env = append(os.Environ(), opt.PipeFDEnvKey+"=3")
|
|
if opt.StateDirEnvKey != "" {
|
|
cmd.Env = append(cmd.Env, opt.StateDirEnvKey+"="+opt.StateDir)
|
|
}
|
|
if opt.ParentEUIDEnvKey != "" {
|
|
cmd.Env = append(cmd.Env, fmt.Sprintf("%s=%d", opt.ParentEUIDEnvKey, os.Geteuid()))
|
|
}
|
|
if opt.ParentEGIDEnvKey != "" {
|
|
cmd.Env = append(cmd.Env, fmt.Sprintf("%s=%d", opt.ParentEGIDEnvKey, os.Getegid()))
|
|
}
|
|
if err := cmd.Start(); err != nil {
|
|
return errors.Wrap(err, "failed to start the child")
|
|
}
|
|
if err := setupUIDGIDMap(cmd.Process.Pid); err != nil {
|
|
return errors.Wrap(err, "failed to setup UID/GID map")
|
|
}
|
|
sigc := sigproxy.ForwardAllSignals(context.TODO(), cmd.Process.Pid)
|
|
defer signal.StopCatch(sigc)
|
|
|
|
if opt.EvacuateCgroup2 != "" {
|
|
if err := cgrouputil.EvacuateCgroup2(opt.EvacuateCgroup2); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
// send message 0
|
|
msg := common.Message{
|
|
Stage: 0,
|
|
Message0: common.Message0{},
|
|
}
|
|
if _, err := msgutil.MarshalToWriter(pipeW, &msg); err != nil {
|
|
return err
|
|
}
|
|
|
|
// configure Network driver
|
|
msg = common.Message{
|
|
Stage: 1,
|
|
Message1: common.Message1{
|
|
StateDir: opt.StateDir,
|
|
},
|
|
}
|
|
if opt.NetworkDriver != nil {
|
|
netMsg, cleanupNetwork, err := opt.NetworkDriver.ConfigureNetwork(cmd.Process.Pid, opt.StateDir)
|
|
if cleanupNetwork != nil {
|
|
defer cleanupNetwork()
|
|
}
|
|
if err != nil {
|
|
return errors.Wrapf(err, "failed to setup network %+v", opt.NetworkDriver)
|
|
}
|
|
msg.Message1.Network = *netMsg
|
|
}
|
|
|
|
// configure Port driver
|
|
portDriverInitComplete := make(chan struct{})
|
|
portDriverQuit := make(chan struct{})
|
|
portDriverErr := make(chan error)
|
|
if opt.PortDriver != nil {
|
|
msg.Message1.Port.Opaque = opt.PortDriver.OpaqueForChild()
|
|
cctx := &port.ChildContext{
|
|
PID: cmd.Process.Pid,
|
|
IP: net.ParseIP(msg.Network.IP).To4(),
|
|
}
|
|
go func() {
|
|
portDriverErr <- opt.PortDriver.RunParentDriver(portDriverInitComplete,
|
|
portDriverQuit, cctx)
|
|
}()
|
|
}
|
|
|
|
// send message 1
|
|
if _, err := msgutil.MarshalToWriter(pipeW, &msg); err != nil {
|
|
return err
|
|
}
|
|
if err := pipeW.Close(); err != nil {
|
|
return err
|
|
}
|
|
if opt.PortDriver != nil {
|
|
// wait for port driver to be ready
|
|
select {
|
|
case <-portDriverInitComplete:
|
|
case err = <-portDriverErr:
|
|
return err
|
|
}
|
|
// publish ports
|
|
for _, p := range opt.PublishPorts {
|
|
st, err := opt.PortDriver.AddPort(context.TODO(), p)
|
|
if err != nil {
|
|
return errors.Wrapf(err, "failed to expose port %v", p)
|
|
}
|
|
logrus.Debugf("published port %v", st)
|
|
}
|
|
}
|
|
|
|
// after child is fully configured, write PID to child_pid file
|
|
childPIDPath := filepath.Join(opt.StateDir, StateFileChildPID)
|
|
if err := ioutil.WriteFile(childPIDPath, []byte(strconv.Itoa(cmd.Process.Pid)), 0444); err != nil {
|
|
return errors.Wrapf(err, "failed to write the child PID %d to %s", cmd.Process.Pid, childPIDPath)
|
|
}
|
|
// listens the API
|
|
apiSockPath := filepath.Join(opt.StateDir, StateFileAPISock)
|
|
apiCloser, err := listenServeAPI(apiSockPath, &router.Backend{
|
|
StateDir: opt.StateDir,
|
|
ChildPID: cmd.Process.Pid,
|
|
NetworkDriver: opt.NetworkDriver,
|
|
PortDriver: opt.PortDriver,
|
|
})
|
|
if err != nil {
|
|
return err
|
|
}
|
|
// block until the child exits
|
|
if err := cmd.Wait(); err != nil {
|
|
return errors.Wrap(err, "child exited")
|
|
}
|
|
// close the API socket
|
|
if err := apiCloser.Close(); err != nil {
|
|
return errors.Wrapf(err, "failed to close %s", apiSockPath)
|
|
}
|
|
// shut down port driver
|
|
if opt.PortDriver != nil {
|
|
portDriverQuit <- struct{}{}
|
|
err = <-portDriverErr
|
|
}
|
|
return err
|
|
}
|
|
|
|
func newugidmapArgs() ([]string, []string, error) {
|
|
u, err := user.Current()
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
uidMap := []string{
|
|
"0",
|
|
u.Uid,
|
|
"1",
|
|
}
|
|
gidMap := []string{
|
|
"0",
|
|
u.Gid,
|
|
"1",
|
|
}
|
|
|
|
uid, err := strconv.Atoi(u.Uid)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
ims, err := idtools.NewIdentityMapping(uid, u.Username)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
|
|
uidMapLast := 1
|
|
for _, im := range ims.UIDs() {
|
|
uidMap = append(uidMap, []string{
|
|
strconv.Itoa(uidMapLast),
|
|
strconv.Itoa(im.HostID),
|
|
strconv.Itoa(im.Size),
|
|
}...)
|
|
uidMapLast += im.Size
|
|
}
|
|
gidMapLast := 1
|
|
for _, im := range ims.GIDs() {
|
|
gidMap = append(gidMap, []string{
|
|
strconv.Itoa(gidMapLast),
|
|
strconv.Itoa(im.HostID),
|
|
strconv.Itoa(im.Size),
|
|
}...)
|
|
gidMapLast += im.Size
|
|
}
|
|
|
|
return uidMap, gidMap, nil
|
|
}
|
|
|
|
func setupUIDGIDMap(pid int) error {
|
|
uArgs, gArgs, err := newugidmapArgs()
|
|
if err != nil {
|
|
return errors.Wrap(err, "failed to compute uid/gid map")
|
|
}
|
|
pidS := strconv.Itoa(pid)
|
|
cmd := exec.Command("newuidmap", append([]string{pidS}, uArgs...)...)
|
|
out, err := cmd.CombinedOutput()
|
|
if err != nil {
|
|
return errors.Wrapf(err, "newuidmap %s %v failed: %s", pidS, uArgs, string(out))
|
|
}
|
|
cmd = exec.Command("newgidmap", append([]string{pidS}, gArgs...)...)
|
|
out, err = cmd.CombinedOutput()
|
|
if err != nil {
|
|
return errors.Wrapf(err, "newgidmap %s %v failed: %s", pidS, gArgs, string(out))
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// apiCloser is implemented by *http.Server
|
|
type apiCloser interface {
|
|
Close() error
|
|
Shutdown(context.Context) error
|
|
}
|
|
|
|
func listenServeAPI(socketPath string, backend *router.Backend) (apiCloser, error) {
|
|
r := mux.NewRouter()
|
|
router.AddRoutes(r, backend)
|
|
srv := &http.Server{Handler: r}
|
|
err := os.RemoveAll(socketPath)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
l, err := net.Listen("unix", socketPath)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
go srv.Serve(l)
|
|
return srv, nil
|
|
}
|
|
|
|
// InitStateDir removes everything in the state dir except the lock file.
|
|
// This is needed because when the previous execution crashed, the state dir may not be removed successfully.
|
|
//
|
|
// InitStateDir must be called before calling parent functions.
|
|
func InitStateDir(stateDir string) error {
|
|
if err := os.MkdirAll(stateDir, 0755); err != nil {
|
|
return err
|
|
}
|
|
lk, err := LockStateDir(stateDir)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer lk.Unlock()
|
|
stateDirStuffs, err := ioutil.ReadDir(stateDir)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
for _, f := range stateDirStuffs {
|
|
if f.Name() == StateFileLock {
|
|
continue
|
|
}
|
|
p := filepath.Join(stateDir, f.Name())
|
|
if err := os.RemoveAll(p); err != nil {
|
|
return errors.Wrapf(err, "failed to remove %s", p)
|
|
}
|
|
}
|
|
return nil
|
|
}
|