|
|
|
//go:build !windows
|
|
|
|
// +build !windows
|
|
|
|
|
|
|
|
package rootless
|
|
|
|
|
|
|
|
import (
|
|
|
|
"net"
|
|
|
|
"os"
|
|
|
|
"os/exec"
|
|
|
|
"path/filepath"
|
|
|
|
"strconv"
|
|
|
|
"strings"
|
|
|
|
|
|
|
|
"github.com/opencontainers/runc/libcontainer/cgroups"
|
|
|
|
"github.com/pkg/errors"
|
|
|
|
"github.com/rootless-containers/rootlesskit/pkg/child"
|
|
|
|
"github.com/rootless-containers/rootlesskit/pkg/copyup/tmpfssymlink"
|
|
|
|
"github.com/rootless-containers/rootlesskit/pkg/network/slirp4netns"
|
|
|
|
"github.com/rootless-containers/rootlesskit/pkg/parent"
|
|
|
|
"github.com/sirupsen/logrus"
|
|
|
|
"golang.org/x/sys/unix"
|
|
|
|
)
|
|
|
|
|
|
|
|
// Internal environment variables used to hand state between the stages that
// Rootless distinguishes (parent, child, and the final rootless process).
var (
	// pipeFD is the environment key handed to both the parent and the child
	// options as PipeFDEnvKey; its presence marks the child stage.
	pipeFD = "_K3S_ROOTLESS_FD"
	// childEnv is set by the parent stage to the API socket path inside its
	// state directory; its presence marks the final rootless process.
	childEnv = "_K3S_ROOTLESS_SOCK"
	// evacuateCgroup2Env is set to "1" by the parent stage when cgroup2
	// evacuation is configured, and is parsed as a boolean by the child stage.
	evacuateCgroup2Env = "_K3S_ROOTLESS_EVACUATE_CGROUP2"
	// Sock holds the value of childEnv once Rootless has run in the final
	// rootless process; it is empty otherwise.
	Sock string
)

// Environment variables that tune the options built by createParentOpt.
var (
	// mtuEnv carries a base-10 integer MTU; 0 is used when unset or unparsable.
	mtuEnv = "K3S_ROOTLESS_MTU"
	// cidrEnv overrides the default network CIDR.
	cidrEnv = "K3S_ROOTLESS_CIDR"
	// enableIPv6Env carries a boolean that overrides the enableIPv6 argument.
	enableIPv6Env = "K3S_ROOTLESS_ENABLE_IPV6"
	// portDriverEnv names the port driver; it is lower-cased before lookup.
	portDriverEnv = "K3S_ROOTLESS_PORT_DRIVER"
	// disableLoopbackEnv carries a boolean; host loopback is disabled by default.
	disableLoopbackEnv = "K3S_ROOTLESS_DISABLE_HOST_LOOPBACK"
)
|
|
|
|
|
|
|
|
func Rootless(stateDir string, enableIPv6 bool) error {
|
|
|
|
defer func() {
|
|
|
|
os.Unsetenv(pipeFD)
|
|
|
|
os.Unsetenv(childEnv)
|
|
|
|
}()
|
|
|
|
|
|
|
|
hasFD := os.Getenv(pipeFD) != ""
|
|
|
|
hasChildEnv := os.Getenv(childEnv) != ""
|
|
|
|
rootlessDir := filepath.Join(stateDir, "rootless")
|
|
|
|
driver := getDriver(strings.ToLower(os.Getenv(portDriverEnv)))
|
|
|
|
|
|
|
|
if hasFD {
|
|
|
|
logrus.Debug("Running rootless child")
|
|
|
|
childOpt, err := createChildOpt(driver)
|
|
|
|
if err != nil {
|
|
|
|
logrus.Fatal(err)
|
|
|
|
}
|
|
|
|
if err := child.Child(*childOpt); err != nil {
|
|
|
|
logrus.Fatalf("child died: %v", err)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if hasChildEnv {
|
|
|
|
Sock = os.Getenv(childEnv)
|
|
|
|
logrus.Debug("Running rootless process")
|
|
|
|
return setupMounts(stateDir)
|
|
|
|
}
|
|
|
|
|
|
|
|
logrus.Debug("Running rootless parent")
|
|
|
|
if err := validateSysctl(); err != nil {
|
|
|
|
logrus.Fatal(err)
|
|
|
|
}
|
|
|
|
parentOpt, err := createParentOpt(driver, rootlessDir, enableIPv6)
|
|
|
|
if err != nil {
|
|
|
|
logrus.Fatal(err)
|
|
|
|
}
|
|
|
|
|
|
|
|
os.Setenv(childEnv, filepath.Join(parentOpt.StateDir, parent.StateFileAPISock))
|
|
|
|
if parentOpt.EvacuateCgroup2 != "" {
|
|
|
|
os.Setenv(evacuateCgroup2Env, "1")
|
|
|
|
}
|
|
|
|
if err := parent.Parent(*parentOpt); err != nil {
|
|
|
|
logrus.Fatal(err)
|
|
|
|
}
|
|
|
|
os.Exit(0)
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func validateSysctl() error {
|
|
|
|
expected := map[string]string{
|
|
|
|
// kernel.unprivileged_userns_clone needs to be 1 to allow userns on some distros.
|
|
|
|
"kernel.unprivileged_userns_clone": "1",
|
|
|
|
|
|
|
|
// net.ipv4.ip_forward should not need to be 1 in the parent namespace.
|
|
|
|
// However, the current k3s implementation has a bug that requires net.ipv4.ip_forward=1
|
|
|
|
// https://github.com/k3s-io/k3s/issues/2420#issuecomment-715051120
|
|
|
|
"net.ipv4.ip_forward": "1",
|
|
|
|
}
|
|
|
|
for key, expectedValue := range expected {
|
|
|
|
if actualValue, err := readSysctl(key); err == nil {
|
|
|
|
if expectedValue != actualValue {
|
|
|
|
return errors.Errorf("expected sysctl value %q to be %q, got %q; try adding \"%s=%s\" to /etc/sysctl.conf and running `sudo sysctl --system`",
|
|
|
|
key, expectedValue, actualValue, key, expectedValue)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// readSysctl returns the whitespace-trimmed contents of the /proc/sys file
// that corresponds to key, where every "." in key becomes a path separator
// (for example "net.ipv4.ip_forward" maps to /proc/sys/net/ipv4/ip_forward).
// The read error is returned unchanged, together with an empty string.
func readSysctl(key string) (string, error) {
	procPath := "/proc/sys/" + strings.Join(strings.Split(key, "."), "/")
	raw, readErr := os.ReadFile(procPath)
	if readErr != nil {
		return "", readErr
	}
	value := strings.TrimSpace(string(raw))
	return value, nil
}
|
|
|
|
|
|
|
|
func parseCIDR(s string) (*net.IPNet, error) {
|
|
|
|
if s == "" {
|
|
|
|
return nil, nil
|
|
|
|
}
|
|
|
|
ip, ipnet, err := net.ParseCIDR(s)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
if !ip.Equal(ipnet.IP) {
|
|
|
|
return nil, errors.Errorf("cidr must be like 10.0.2.0/24, not like 10.0.2.100/24")
|
|
|
|
}
|
|
|
|
return ipnet, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func createParentOpt(driver portDriver, stateDir string, enableIPv6 bool) (*parent.Opt, error) {
|
|
|
|
if err := os.MkdirAll(stateDir, 0755); err != nil {
|
|
|
|
return nil, errors.Wrapf(err, "failed to mkdir %s", stateDir)
|
|
|
|
}
|
|
|
|
|
|
|
|
driver.SetStateDir(stateDir)
|
|
|
|
|
|
|
|
opt := &parent.Opt{
|
|
|
|
StateDir: stateDir,
|
|
|
|
CreatePIDNS: true,
|
|
|
|
CreateCgroupNS: true,
|
|
|
|
CreateUTSNS: true,
|
|
|
|
CreateIPCNS: true,
|
|
|
|
}
|
|
|
|
|
|
|
|
selfCgroupMap, err := cgroups.ParseCgroupFile("/proc/self/cgroup")
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
if selfCgroup2 := selfCgroupMap[""]; selfCgroup2 == "" {
|
|
|
|
logrus.Warnf("Enabling cgroup2 is highly recommended, see https://rootlesscontaine.rs/getting-started/common/cgroup2/")
|
|
|
|
} else {
|
|
|
|
selfCgroup2Dir := filepath.Join("/sys/fs/cgroup", selfCgroup2)
|
|
|
|
if unix.Access(selfCgroup2Dir, unix.W_OK) == nil {
|
|
|
|
opt.EvacuateCgroup2 = "k3s_evac"
|
|
|
|
} else {
|
|
|
|
logrus.Warn("Cannot set cgroup2 evacuation, make sure to run k3s as a systemd unit")
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
mtu := 0
|
|
|
|
if val := os.Getenv(mtuEnv); val != "" {
|
|
|
|
if v, err := strconv.ParseInt(val, 10, 0); err != nil {
|
|
|
|
logrus.Warn("Failed to parse rootless mtu value; using default")
|
|
|
|
} else {
|
|
|
|
mtu = int(v)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
disableHostLoopback := true
|
|
|
|
if val := os.Getenv(disableLoopbackEnv); val != "" {
|
|
|
|
if v, err := strconv.ParseBool(val); err != nil {
|
|
|
|
logrus.Warn("Failed to parse rootless disable-host-loopback value; using default")
|
|
|
|
} else {
|
|
|
|
disableHostLoopback = v
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if val := os.Getenv(enableIPv6Env); val != "" {
|
|
|
|
if v, err := strconv.ParseBool(val); err != nil {
|
|
|
|
logrus.Warn("Failed to parse rootless enable-ipv6 value; using default")
|
|
|
|
} else {
|
|
|
|
enableIPv6 = v
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
cidr := "10.41.0.0/16"
|
|
|
|
if val := os.Getenv(cidrEnv); val != "" {
|
|
|
|
cidr = val
|
|
|
|
}
|
|
|
|
|
|
|
|
ipnet, err := parseCIDR(cidr)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
binary := "slirp4netns"
|
|
|
|
if _, err := exec.LookPath(binary); err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
opt.NetworkDriver, err = slirp4netns.NewParentDriver(driver.LogWriter(), binary, mtu, ipnet, "tap0", disableHostLoopback, driver.APISocketPath(), false, false, enableIPv6)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
opt.PortDriver, err = driver.NewParentDriver()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
opt.PipeFDEnvKey = pipeFD
|
|
|
|
|
|
|
|
return opt, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func createChildOpt(driver portDriver) (*child.Opt, error) {
|
|
|
|
opt := &child.Opt{}
|
|
|
|
opt.TargetCmd = os.Args
|
|
|
|
opt.PipeFDEnvKey = pipeFD
|
|
|
|
opt.NetworkDriver = slirp4netns.NewChildDriver()
|
|
|
|
opt.PortDriver = driver.NewChildDriver()
|
|
|
|
opt.CopyUpDirs = []string{"/etc", "/var/run", "/run", "/var/lib"}
|
|
|
|
opt.CopyUpDriver = tmpfssymlink.NewChildDriver()
|
|
|
|
opt.MountProcfs = true
|
|
|
|
opt.Reaper = true
|
|
|
|
if v := os.Getenv(evacuateCgroup2Env); v != "" {
|
|
|
|
var err error
|
|
|
|
opt.EvacuateCgroup2, err = strconv.ParseBool(v)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return opt, nil
|
|
|
|
}
|