From b0b5f686be22ad250f949d9916d77b84b041cb85 Mon Sep 17 00:00:00 2001 From: Akihiro Suda Date: Wed, 29 May 2019 14:39:12 +0900 Subject: [PATCH 1/2] bump up rootlesskit Signed-off-by: Akihiro Suda --- trash.lock | 2 +- vendor.conf | 2 +- .../rootlesskit/Gopkg.lock | 37 +- .../rootlesskit/Gopkg.toml | 4 + .../rootless-containers/rootlesskit/README.md | 339 ++++++++++++------ .../rootlesskit/pkg/child/child.go | 24 +- .../rootlesskit/pkg/common/message.go | 1 + .../rootlesskit/pkg/network/network.go | 4 +- .../pkg/network/slirp4netns/slirp4netns.go | 10 +- 9 files changed, 285 insertions(+), 138 deletions(-) diff --git a/trash.lock b/trash.lock index 8ddae785e0..316ebd4833 100755 --- a/trash.lock +++ b/trash.lock @@ -237,7 +237,7 @@ import: - package: github.com/robfig/cron version: v1-53-gdf38d32658d878 - package: github.com/rootless-containers/rootlesskit - version: 893c1c3de71f54c301fdb85a7c0dd15c1933c159 + version: v0.4.1 - package: github.com/russross/blackfriday version: v1.4-2-g300106c228d52c - package: github.com/seccomp/libseccomp-golang diff --git a/vendor.conf b/vendor.conf index e8fed2173f..3280f9166c 100644 --- a/vendor.conf +++ b/vendor.conf @@ -126,7 +126,7 @@ gopkg.in/yaml.v2 v2.2.1 github.com/ibuildthecloud/kvsql d37dd2b0829b44a4964e48c9396e14b0536fefb6 https://github.com/erikwilson/rancher-kvsql.git # rootless -github.com/rootless-containers/rootlesskit 893c1c3de71f54c301fdb85a7c0dd15c1933c159 +github.com/rootless-containers/rootlesskit v0.4.1 github.com/theckman/go-flock v0.7.1 github.com/morikuni/aec 39771216ff4c63d11f5e604076f9c45e8be1067b diff --git a/vendor/github.com/rootless-containers/rootlesskit/Gopkg.lock b/vendor/github.com/rootless-containers/rootlesskit/Gopkg.lock index 36b4a002fe..8658cc57e6 100644 --- a/vendor/github.com/rootless-containers/rootlesskit/Gopkg.lock +++ b/vendor/github.com/rootless-containers/rootlesskit/Gopkg.lock @@ -53,6 +53,19 @@ revision = "a7962380ca08b5a188038c69871b8d3fbdf31e89" version = "v1.7.0" +[[projects]] + 
branch = "master" + digest = "1:a1d58d11ad642c9760251e0846ee37ad1237e83f8245c8d310e3eba0d76bb7f4" + name = "github.com/insomniacslk/dhcp" + packages = [ + "dhcpv4", + "dhcpv4/client4", + "iana", + "rfc1035label", + ] + pruneopts = "UT" + revision = "625d653f51917b167cc2e53ef8fe595e85dd5fa4" + [[projects]] branch = "master" digest = "1:dd1e851f4e3a5ee3f51613c79a01666e04e5e9289e2da3f6f815c008010fc02f" @@ -128,6 +141,18 @@ revision = "392e7fae8f1b0bdbd67dad7237d23f618feb6dbb" version = "v0.7.1" +[[projects]] + branch = "master" + digest = "1:35bdf197f9a11e01b3e2d9a35af57cd2b0ed0023d7c4d63a3e1bd7762275a17a" + name = "github.com/u-root/u-root" + packages = [ + "pkg/rand", + "pkg/ubinary", + "pkg/uio", + ] + pruneopts = "UT" + revision = "34b144e97033ea76860cc0fa6a69256cd5fe2133" + [[projects]] digest = "1:b24d38b282bacf9791408a080f606370efa3d364e4b5fd9ba0f7b87786d3b679" name = "github.com/urfave/cli" @@ -138,9 +163,15 @@ [[projects]] branch = "master" - digest = "1:7ccb2dbb79f60b4e530c7dc3a0b3681b3869cf7ea91c645735e4dd4e6e3264fd" + digest = "1:19df0dab53c3fdea922472b4199990146f3db473881fdc59317d8b4800df145a" name = "golang.org/x/net" - packages = ["context/ctxhttp"] + packages = [ + "bpf", + "context/ctxhttp", + "internal/iana", + "internal/socket", + "ipv4", + ] pruneopts = "UT" revision = "74de082e2cca95839e88aa0aeee5aadf6ce7710f" @@ -162,6 +193,8 @@ "github.com/docker/docker/pkg/idtools", "github.com/google/uuid", "github.com/gorilla/mux", + "github.com/insomniacslk/dhcp/dhcpv4", + "github.com/insomniacslk/dhcp/dhcpv4/client4", "github.com/jamescun/tuntap", "github.com/moby/vpnkit/go/pkg/vmnet", "github.com/pkg/errors", diff --git a/vendor/github.com/rootless-containers/rootlesskit/Gopkg.toml b/vendor/github.com/rootless-containers/rootlesskit/Gopkg.toml index 9a1b2653cf..2a12629dca 100644 --- a/vendor/github.com/rootless-containers/rootlesskit/Gopkg.toml +++ b/vendor/github.com/rootless-containers/rootlesskit/Gopkg.toml @@ -39,3 +39,7 @@ [prune] go-tests = true 
unused-packages = true + +[[constraint]] + branch = "master" + name = "github.com/insomniacslk/dhcp" diff --git a/vendor/github.com/rootless-containers/rootlesskit/README.md b/vendor/github.com/rootless-containers/rootlesskit/README.md index 634918537f..2a3f1bef75 100644 --- a/vendor/github.com/rootless-containers/rootlesskit/README.md +++ b/vendor/github.com/rootless-containers/rootlesskit/README.md @@ -1,15 +1,40 @@ # RootlessKit: the gate to the rootless world -`rootlesskit` is a kind of Linux-native "fake root" utility, made for mainly running [Docker and Kubernetes as an unprivileged user](https://github.com/rootless-containers/usernetes). +RootlessKit is a kind of Linux-native "fake root" utility, made for mainly running [Docker and Kubernetes as an unprivileged user](https://github.com/rootless-containers/usernetes), so as to protect the real root on the host from potential container-breakout attacks. -`rootlesskit` does an equivalent of [`unshare(1)`](http://man7.org/linux/man-pages/man1/unshare.1.html) and [`newuidmap(1)`](http://man7.org/linux/man-pages/man1/newuidmap.1.html)/[`newgidmap(1)`](http://man7.org/linux/man-pages/man1/newgidmap.1.html) in a single command, for creating unprivileged [`user_namespaces(7)`](http://man7.org/linux/man-pages/man7/user_namespaces.7.html) and [`mount_namespaces(7)`](http://man7.org/linux/man-pages/man7/user_namespaces.7.html) with [`subuid(5)`](http://man7.org/linux/man-pages/man5/subuid.5.html) and [`subgid(5)`](http://man7.org/linux/man-pages/man5/subgid.5.html). + + -`rootlesskit` also supports network namespace isolation and userspace NAT using ["slirp"](#slirp). -Kernel NAT using SUID-enabled [`lxc-user-nic(1)`](https://linuxcontainers.org/lxc/manpages/man1/lxc-user-nic.1.html) is also on the plan. 
+ +- [What it actually does](#what-it-actually-does) +- [Projects using RootlessKit](#projects-using-rootlesskit) +- [Setup](#setup) + - [Requirements](#requirements) + - [Distribution-specific hints](#distribution-specific-hints) +- [Usage](#usage) +- [State directory](#state-directory) +- [Environment variables](#environment-variables) +- [Network Drivers](#network-drivers) + - [`--net=host` (default)](#--nethost-default) + - [`--net=slirp4netns` (recommended)](#--netslirp4netns-recommended) + - [`--net=vpnkit`](#--netvpnkit) + - [`--net=lxc-user-nic` (experimental)](#--netlxc-user-nic-experimental) +- [Port Drivers](#port-drivers) + + + +## What it actually does + +RootlessKit creates [`user_namespaces(7)`](http://man7.org/linux/man-pages/man7/user_namespaces.7.html) and [`mount_namespaces(7)`](http://man7.org/linux/man-pages/man7/mount_namespaces.7.html), and executes [`newuidmap(1)`](http://man7.org/linux/man-pages/man1/newuidmap.1.html)/[`newgidmap(1)`](http://man7.org/linux/man-pages/man1/newgidmap.1.html) along with [`subuid(5)`](http://man7.org/linux/man-pages/man5/subuid.5.html) and [`subgid(5)`](http://man7.org/linux/man-pages/man5/subgid.5.html). + +RootlessKit also supports isolating [`network_namespaces(7)`](http://man7.org/linux/man-pages/man7/network_namespaces.7.html) with userspace NAT using ["slirp"](#network-drivers). +Kernel NAT using SUID-enabled [`lxc-user-nic(1)`](https://linuxcontainers.org/lxc/manpages/man1/lxc-user-nic.1.html) is also experimentally supported. ## Projects using RootlessKit +* [Docker/Moby](https://get.docker.com/rootless) * [Usernetes](https://github.com/rootless-containers/usernetes): Docker & Kubernetes, installable under a non-root user's `$HOME`. 
+* [k3s](https://k3s.io/): Lightweight Kubernetes * [BuildKit](https://github.com/moby/buildkit): Next-generation `docker build` backend ## Setup @@ -19,22 +44,37 @@ $ go get github.com/rootless-containers/rootlesskit/cmd/rootlesskit $ go get github.com/rootless-containers/rootlesskit/cmd/rootlessctl ``` -Requirements: -* Some distros such as Debian (excluding Ubuntu) and Arch Linux require `sudo sh -c "echo 1 > /proc/sys/kernel/unprivileged_userns_clone"`. -* `newuidmap` and `newgidmap` need to be installed on the host. These commands are provided by the `uidmap` package on most distros. -* `/etc/subuid` and `/etc/subgid` should contain >= 65536 sub-IDs. e.g. `penguin:231072:65536`. +or just run `make` to build the binaries under the `./bin` directory. + +### Requirements + +* `newuidmap` and `newgidmap` need to be installed on the host. These commands are provided by the `uidmap` package on most distributions. + +* `/etc/subuid` and `/etc/subgid` should contain at least 65536 sub-IDs. e.g. `penguin:231072:65536`. These files are automatically configured on most distributions. 
```console $ id -u 1001 $ whoami penguin -$ grep ^$(whoami): /etc/subuid +$ grep "^$(whoami):" /etc/subuid penguin:231072:65536 -$ grep ^$(whoami): /etc/subgid +$ grep "^$(whoami):" /etc/subgid penguin:231072:65536 ``` +#### Distribution-specific hints + +Debian (excluding Ubuntu): +* `sudo sh -c "echo 1 > /proc/sys/kernel/unprivileged_userns_clone"` is required + +Arch Linux: +* `sudo sh -c "echo 1 > /proc/sys/kernel/unprivileged_userns_clone"` is required + +RHEL/CentOS 7: +* `sudo sh -c "echo 28633 > /proc/sys/user/max_user_namespaces"` is required +* [COPR package `vbatts/shadow-utils-newxidmap`](https://copr.fedorainfracloud.org/coprs/vbatts/shadow-utils-newxidmap/) needs to be installed + ## Usage @@ -70,10 +110,10 @@ rootlesskit$ rm /etc/resolv.conf rootlesskit$ vi /etc/resolv.conf ``` -You can even create network namespaces with [Slirp](#slirp): +You can even create network namespaces with [Slirp](#network-drivers): ```console -$ rootlesskit --copy-up=/etc --copy-up=/run --net=slirp4netns bash +$ rootlesskit --copy-up=/etc --copy-up=/run --net=slirp4netns --disable-host-loopback bash rootlesskit$ ip netns add foo ... ``` @@ -95,7 +135,7 @@ allow Full CLI options: ```console -$ rootlesskit --help + NAME: rootlesskit - the gate to the rootless world @@ -103,40 +143,37 @@ USAGE: rootlesskit [global options] command [command options] [arguments...] 
VERSION: - 0.3.0-alpha.0 + 0.3.0+dev COMMANDS: help, h Shows a list of commands or help for one command GLOBAL OPTIONS: - --debug debug mode - --state-dir value state directory - --net value network driver [host, slirp4netns, vpnkit, vdeplug_slirp] (default: "host") - --slirp4netns-binary value path of slirp4netns binary for --net=slirp4netns (default: "slirp4netns") - --vpnkit-binary value path of VPNKit binary for --net=vpnkit (default: "vpnkit") - --mtu value MTU for non-host network (default: 65520 for slirp4netns, 1500 for others) (default: 0) - --cidr value CIDR for slirp4netns network (default: 10.0.2.0/24, requires slirp4netns v0.3.0+ for custom CIDR) - --disable-host-loopback prohibit connecting to 127.0.0.1:* on the host namespace - --copy-up value mount a filesystem and copy-up the contents. e.g. "--copy-up=/etc" (typically required for non-host network) - --copy-up-mode value copy-up mode [tmpfs+symlink] (default: "tmpfs+symlink") - --port-driver value port driver for non-host network. 
[none, socat] (default: "none") - --help, -h show help - --version, -v print the version -``` - -## Building from source -`rootlesskit` and `rootlessctl` can be built from source using: - -``` -make + --debug debug mode + --state-dir value state directory + --net value network driver [host, slirp4netns, vpnkit, lxc-user-nic(experimental), vdeplug_slirp(deprecated)] (default: "host") + --slirp4netns-binary value path of slirp4netns binary for --net=slirp4netns (default: "slirp4netns") + --vpnkit-binary value path of VPNKit binary for --net=vpnkit (default: "vpnkit") + --lxc-user-nic-binary value path of lxc-user-nic binary for --net=lxc-user-nic (default: "/usr/lib/x86_64-linux-gnu/lxc/lxc-user-nic") + --lxc-user-nic-bridge value lxc-user-nic bridge name (default: "lxcbr0") + --mtu value MTU for non-host network (default: 65520 for slirp4netns, 1500 for others) (default: 0) + --cidr value CIDR for slirp4netns network (default: 10.0.2.0/24, requires slirp4netns v0.3.0+ for custom CIDR) + --disable-host-loopback prohibit connecting to 127.0.0.1:* on the host namespace + --copy-up value mount a filesystem and copy-up the contents. e.g. "--copy-up=/etc" (typically required for non-host network) + --copy-up-mode value copy-up mode [tmpfs+symlink] (default: "tmpfs+symlink") + --port-driver value port driver for non-host network. [none, socat, slirp4netns, builtin(experimental)] (default: "none") + --help, -h show help + --version, -v print the version ``` ## State directory -The following files will be created in the `--state-dir` directory: +The following files will be created in the state directory, which can be specified with `--state-dir`: * `lock`: lock file * `child_pid`: decimal PID text that can be used for `nsenter(1)`. -* `api.sock`: REST API socket for `rootlessctl`. See [Port forwarding](#port-forwarding) section. +* `api.sock`: REST API socket for `rootlessctl`. See [Port Drivers](#port-drivers) section. 
+ +If `--state-dir` is not specified, RootlessKit creates a temporary state directory on `/tmp` and removes it on exit. Undocumented files are subject to change. @@ -147,20 +184,95 @@ The following environment variables will be set for the child process: Undocumented environment variables are subject to change. -## Slirp +## Network Drivers -Remarks: -* Specifying `--copy-up=/etc` is highly recommended unless `/etc/resolv.conf` is statically configured. Otherwise `/etc/resolv.conf` will be invalidated when it is recreated on the host, typically by NetworkManager or systemd-resolved. +RootlessKit provides several drivers for providing network connectivity: -Currently there are three slirp implementations supported by rootlesskit: -* `--net=slirp4netns`, using [slirp4netns](https://github.com/rootless-containers/slirp4netns) (recommended) -* `--net=vpnkit`, using [VPNKit](https://github.com/moby/vpnkit) -* `--net=vdeplug_slirp`, using [vdeplug_slirp](https://github.com/rd235/vdeplug_slirp) +* `--net=host`: use host network namespace (default) +* `--net=slirp4netns`: use [slirp4netns](https://github.com/rootless-containers/slirp4netns) (recommended) +* `--net=vpnkit`: use [VPNKit](https://github.com/moby/vpnkit) +* `--net=lxc-user-nic`: use `lxc-user-nic` (experimental) +* `--net=vdeplug_slirp`: use [vdeplug_slirp](https://github.com/rd235/vdeplug_slirp) (deprecated) -Usage: +[Benchmark (Aug 28, 2018)](https://github.com/rootless-containers/rootlesskit/pull/16): + +| Implementation | MTU=1500 | MTU=4000 | MTU=16384 | MTU=65520 +|---------------------------------|------------|-------------|-------------|------------ +|(rootful veth) |(52.1 Gbps) | (45.4 Gbps) | (43.6 Gbps )| (51.5 Gbps) +|`rootlesskit --net=slirp4netns` | 1.07 Gbps | 2.78 Gbps | 4.55 Gbps | 9.21 Gbps +|`rootlesskit --net=vpnKit` | 514 Mbps | 526 Mbps | 540 Mbps |(Unsupported) +|`rootlesskit --net=vdeplug_slirp`| 763 Mbps |(Unsupported)|(Unsupported)|(Unsupported) +| + +`--net=lxc-user-nic` is as fast 
as rootful veth. + +### `--net=host` (default) + +`--net=host` does not isolate the network namespace from the host. + +Pros: +* No performance overhead +* Supports ICMP Echo (`ping`) when `/proc/sys/net/ipv4/ping_group_range` is configured + +Cons: +* No permission for network-namespaced operations, e.g. creating iptables rules, running `tcpdump` + +To route ICMP Echo packets (`ping`), you need to write the range of GIDs to [`net.ipv4.ping_group_range`](http://man7.org/linux/man-pages/man7/icmp.7.html). ```console -$ rootlesskit --state-dir=/run/user/1001/rootlesskit/foo --net=slirp4netns --copy-up=/etc bash +$ sudo sh -c "echo 0 2147483647 > /proc/sys/net/ipv4/ping_group_range" +``` + +### `--net=slirp4netns` (recommended) + +`--net=slirp4netns` isolates the network namespace from the host and launch [slirp4netns](https://github.com/rootless-containers/slirp4netns) for providing usermode networking. + +Pros: +* Possible to perform network-namespaced operations, e.g. creating iptables rules, running `tcpdump` +* Supports ICMP Echo (`ping`) when `/proc/sys/net/ipv4/ping_group_range` is configured + +Cons: +* Extra performance overhead (but still faster than `--net=vpnkit`) +* Supports only TCP, UDP, and ICMP Echo packets + + +To use `--net=slirp4netns`, you need to install slirp4netns. +v0.3.0 or later is recommended. + +```console +$ sudo dnf install slirp4netns +``` + +or + +```console +$ sudo apt-get install slirp4netns +``` + +If binary package is not available for your distribution, install from the source: + +```console +$ git clone https://github.com/rootless-containers/slirp4netns +$ cd slirp4netns +$ ./autogen.sh && ./configure && make +$ cp slirp4netns ~/bin +``` + +The network is configured as follows by default: +* IP: 10.0.2.100/24 +* Gateway: 10.0.2.2 +* DNS: 10.0.2.3 + +The network configuration can be changed by specifying custom CIDR, e.g. `--cidr=10.0.3.0/24` (requires slirp4netns v0.3.0+). 
+ +Specifying `--copy-up=/etc` is highly recommended unless `/etc/resolv.conf` on the host is statically configured. Otherwise `/etc/resolv.conf` in the RootlessKit's mount namespace will be unmounted when `/etc/resolv.conf` on the host is recreated, typically by NetworkManager or systemd-resolved. + +It is also highly recommended to specify `--disable-host-loopback`. Otherwise ports listening on 127.0.0.1 in the host are accessible as 10.0.2.2 in the RootlessKit's network namespace. + +Example session: + +```console +$ rootlesskit --net=slirp4netns --copy-up=/etc --disable-host-loopback bash rootlesskit$ ip a 1: lo: mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000 link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00 @@ -168,13 +280,13 @@ rootlesskit$ ip a valid_lft forever preferred_lft forever inet6 ::1/128 scope host valid_lft forever preferred_lft forever -2: tap0: mtu 1500 qdisc fq_codel state UP group default qlen 1000 - link/ether 42:b6:8d:e4:02:c4 brd ff:ff:ff:ff:ff:ff +2: tap0: mtu 65520 qdisc fq_codel state UP group default qlen 1000 + link/ether 46:dc:8d:09:fd:f2 brd ff:ff:ff:ff:ff:ff inet 10.0.2.100/24 scope global tap0 valid_lft forever preferred_lft forever - inet6 fe80::40b6:8dff:fee4:2c4/64 scope link + inet6 fe80::44dc:8dff:fe09:fdf2/64 scope link valid_lft forever preferred_lft forever -rootlesskit$ ip r +rootlesskit$ ip r default via 10.0.2.2 dev tap0 10.0.2.0/24 dev tap0 proto kernel scope link src 10.0.2.100 rootlesskit$ cat /etc/resolv.conf @@ -183,23 +295,70 @@ rootlesskit$ curl https://www.google.com ... ``` -Default network configuration for `--net=slirp4netns` and `--net=vdeplug_slirp`: -* IP: 10.0.2.100/24 -* Gateway: 10.0.2.2 -* DNS: 10.0.2.3 -* Host: 10.0.2.2, 10.0.2.3 -Default network configuration for `--net=vpnkit`: +### `--net=vpnkit` + +`--net=vpnkit` isolates the network namespace from the host and launches [VPNKit](https://github.com/moby/vpnkit) for providing usermode networking. 
+ +Pros: +* Possible to perform network-namespaced operations, e.g. creating iptables rules, running `tcpdump` + +Cons: +* Extra performance overhead +* Supports only TCP and UDP packets. No support for ICMP Echo (`ping`) unlike `--net=slirp4netns`, even if `/proc/sys/net/ipv4/ping_group_range` is configured. + +To use `--net=vpnkit`, you need to install VPNkit. + +```console +$ git clone https://github.com/moby/vpnkit.git +$ cd vpnkit +$ make +$ cp vpnkit.exe ~/bin/vpnkit +``` + +The network is configured as follows by default: * IP: 192.168.65.3/24 * Gateway: 192.168.65.1 * DNS: 192.168.65.1 -* Host: 192.168.65.2 -`--net=slirp4netns` supports specifying custom CIDR, e.g. `--cidr=10.0.3.0/24` (requires slirp4netns v0.3.0+) +As in `--net=slirp4netns`, specifying `--copy-up=/etc` and `--disable-host-loopback` is highly recommended. +If `--disable-host-loopback` is not specified, ports listening on 127.0.0.1 in the host are accessible as 192.168.65.2 in the RootlessKit's network namespace. -It is highly recommended to disable host loopback address by specyfing `--disable-host-loopback`. +### `--net=lxc-user-nic` (experimental) -### Port forwarding +`--net=lxc-user-nic` isolates the network namespace from the host and launch [`lxc-user-nic(1)`](https://linuxcontainers.org/lxc/manpages/man1/lxc-user-nic.1.html) SUID binary for providing kernel-mode NAT. + +Pros: +* No performance overhead +* Possible to perform network-namespaced operations, e.g. 
creating iptables rules, running `tcpdump` +* Supports ICMP Echo (`ping`) without `/proc/sys/net/ipv4/ping_group_range` configuration + +Cons: +* Less secure +* Needs `/etc/lxc/lxc-usernet` configuration + +To use `lxc-user-nic`, you need to install `liblxc-common` package: +```console +$ sudo apt-get install liblxc-common +``` + +You also need to set up [`/etc/lxc/lxc-usernet`](https://linuxcontainers.org/lxc/manpages/man5/lxc-usernet.5.html): +``` +# USERNAME TYPE BRIDGE COUNT +penguin veth lxcbr0 1 +``` + +The `COUNT` value needs to be increased to run multiple RootlessKit instances with `--net=lxc-user-nic` simultaneously. + +It may take a few seconds to configure the interface using DHCP. + +If you start and stop RootlessKit too frequently, you might use up all available DHCP addresses. +You might need to reset `/var/lib/misc/dnsmasq.lxcbr0.leases` and restart the `lxc-net` service. + +Currently, the MAC address is always set to a random address. + + +## Port Drivers `rootlessctl` can be used for exposing the ports in the network namespace to the host network namespace. You also need to launch `rootlesskit` with `--port-driver=(socat|slirp4netns|builtin)`. `builtin` is the fastest but currently experimental. 
@@ -207,7 +366,7 @@ You also need to launch `rootlesskit` with `--port-driver=(socat|slirp4netns|bui For example, to expose 80 in the child as 8080 in the parent: ```console -$ rootlesskit --state-dir=/run/user/1001/rootlesskit/foo --net=slirp4netns --copy-up=/etc --port-driver=socat bash +$ rootlesskit --state-dir=/run/user/1001/rootlesskit/foo --net=slirp4netns --disable-host-loopback --copy-up=/etc --port-driver=builtin bash rootlesskit$ rootlessctl --socket=/run/user/1001/rootlesskit/foo/api.sock add-ports 0.0.0.0:8080:80/tcp 1 rootlesskit$ rootlessctl --socket=/run/user/1001/rootlesskit/foo/api.sock list-ports @@ -217,67 +376,9 @@ rootlesskit$ rootlessctl --socket=/run/user/1001/rootlesskit/foo/api.sock remove 1 ``` -You can also expose the ports manually without using the API socket. +You can also expose ports using `socat` and `nsenter` instead of RootlessKit's port drivers. ```console $ pid=$(cat /run/user/1001/rootlesskit/foo/child_pid) $ socat -t -- TCP-LISTEN:8080,reuseaddr,fork EXEC:"nsenter -U -n -t $pid socat -t -- STDIN TCP4\:127.0.0.1\:80" ``` -### Routing ping packets - -To route ping packets, you need to set up `net.ipv4.ping_group_range` properly. - -```console -$ sudo sh -c "echo 0 2147483647 > /proc/sys/net/ipv4/ping_group_range" -``` - -Note: routing ping packets is not supported for `--net=vpnkit`. 
- -### Annex: benchmark (MTU=1500) - -Aug 1, 2018, on Travis: https://travis-ci.org/rootless-containers/rootlesskit/builds/410721610 - -* `--net=slirp4netns`: 1.07 Gbits/sec -* `--net=vpnkit`: 528 Mbits/sec -* `--net=vdeplug_slirp`: 771 Mbits/sec - -Note: slirp4netns can reach 8.18 Gbits/sec with MTU=65520: https://github.com/rootless-containers/slirp4netns/pull/20 - -### Annex: how to install `slirp4netns` (required for `--net=slirp4netns`) - -See also https://github.com/rootless-containers/slirp4netns - -```console -$ git clone https://github.com/rootless-containers/slirp4netns -$ cd slirp4netns -$ ./autogen.sh && ./configure && make -$ cp slirp4netns ~/bin -``` - -RPM is also available for Fedora: https://rpms.remirepo.net/rpmphp/zoom.php?rpm=slirp4netns - -```console -$ sudo dnf install slirp4netns -``` - -### Annex: how to install VPNKit (required for `--net=vpnkit`) - -See also https://github.com/moby/vpnkit - -```console -$ git clone https://github.com/moby/vpnkit.git -$ cd vpnkit -$ make -$ cp vpnkit.exe ~/bin/vpnkit -``` - -### Annex: how to install `vdeplug_slirp` (required for `--net=vdeplug_slirp`) - -You need to install the following components: - -* https://github.com/rd235/s2argv-execs -* https://github.com/rd235/vdeplug4 (depends on `s2argv-execs`) -* https://github.com/rd235/libslirp -* https://github.com/rd235/vdeplug_slirp (depends on `vdeplug4` and `libslirp`) - -Please refer to README in the each of the components. 
diff --git a/vendor/github.com/rootless-containers/rootlesskit/pkg/child/child.go b/vendor/github.com/rootless-containers/rootlesskit/pkg/child/child.go index 8b2b581b5b..c184df74a2 100644 --- a/vendor/github.com/rootless-containers/rootlesskit/pkg/child/child.go +++ b/vendor/github.com/rootless-containers/rootlesskit/pkg/child/child.go @@ -4,11 +4,13 @@ import ( "io/ioutil" "os" "os/exec" + "runtime" "strconv" "syscall" "github.com/pkg/errors" "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" "github.com/rootless-containers/rootlesskit/pkg/common" "github.com/rootless-containers/rootlesskit/pkg/copyup" @@ -76,12 +78,12 @@ func activateLoopback() error { return nil } -func activateTap(tap, ip string, netmask int, gateway string, mtu int) error { +func activateDev(dev, ip string, netmask int, gateway string, mtu int) error { cmds := [][]string{ - {"ip", "link", "set", tap, "up"}, - {"ip", "link", "set", "dev", tap, "mtu", strconv.Itoa(mtu)}, - {"ip", "addr", "add", ip + "/" + strconv.Itoa(netmask), "dev", tap}, - {"ip", "route", "add", "default", "via", gateway, "dev", tap}, + {"ip", "link", "set", dev, "up"}, + {"ip", "link", "set", "dev", dev, "mtu", strconv.Itoa(mtu)}, + {"ip", "addr", "add", ip + "/" + strconv.Itoa(netmask), "dev", dev}, + {"ip", "route", "add", "default", "via", gateway, "dev", dev}, } if err := common.Execs(os.Stderr, os.Environ(), cmds); err != nil { return errors.Wrapf(err, "executing %v", cmds) @@ -119,11 +121,11 @@ func setupNet(msg common.Message, etcWasCopied bool, driver network.ChildDriver) if err := activateLoopback(); err != nil { return err } - tap, err := driver.ConfigureTap(msg.Network) + dev, err := driver.ConfigureNetworkChild(&msg.Network) if err != nil { return err } - if err := activateTap(tap, msg.Network.IP, msg.Network.Netmask, msg.Network.Gateway, msg.Network.MTU); err != nil { + if err := activateDev(dev, msg.Network.IP, msg.Network.Netmask, msg.Network.Gateway, msg.Network.MTU); err != nil { return err } if 
etcWasCopied { @@ -187,6 +189,14 @@ func Child(opt Opt) error { if msg.Stage != 1 { return errors.Errorf("expected stage 1, got stage %d", msg.Stage) } + // The parent calls child with Pdeathsig, but it is cleared when newuidmap SUID binary is called + // https://github.com/rootless-containers/rootlesskit/issues/65#issuecomment-492343646 + runtime.LockOSThread() + err = unix.Prctl(unix.PR_SET_PDEATHSIG, uintptr(unix.SIGKILL), 0, 0, 0) + runtime.UnlockOSThread() + if err != nil { + return err + } os.Unsetenv(opt.PipeFDEnvKey) if err := pipeR.Close(); err != nil { return errors.Wrapf(err, "failed to close fd %d", pipeFD) diff --git a/vendor/github.com/rootless-containers/rootlesskit/pkg/common/message.go b/vendor/github.com/rootless-containers/rootlesskit/pkg/common/message.go index e9ce61c39c..072044474b 100644 --- a/vendor/github.com/rootless-containers/rootlesskit/pkg/common/message.go +++ b/vendor/github.com/rootless-containers/rootlesskit/pkg/common/message.go @@ -22,6 +22,7 @@ type Message1 struct { // NetworkMessage is empty for HostNetwork. type NetworkMessage struct { + Dev string IP string Netmask int Gateway string diff --git a/vendor/github.com/rootless-containers/rootlesskit/pkg/network/network.go b/vendor/github.com/rootless-containers/rootlesskit/pkg/network/network.go index fa2d336f33..836a877451 100644 --- a/vendor/github.com/rootless-containers/rootlesskit/pkg/network/network.go +++ b/vendor/github.com/rootless-containers/rootlesskit/pkg/network/network.go @@ -14,5 +14,7 @@ type ParentDriver interface { // ChildDriver is called from the child namespace type ChildDriver interface { - ConfigureTap(netmsg common.NetworkMessage) (tap string, err error) + // netmsg MAY be modified. 
+ // devName is like "tap" or "eth0" + ConfigureNetworkChild(netmsg *common.NetworkMessage) (devName string, err error) } diff --git a/vendor/github.com/rootless-containers/rootlesskit/pkg/network/slirp4netns/slirp4netns.go b/vendor/github.com/rootless-containers/rootlesskit/pkg/network/slirp4netns/slirp4netns.go index ea91609463..45e1aa53e7 100644 --- a/vendor/github.com/rootless-containers/rootlesskit/pkg/network/slirp4netns/slirp4netns.go +++ b/vendor/github.com/rootless-containers/rootlesskit/pkg/network/slirp4netns/slirp4netns.go @@ -41,8 +41,6 @@ func NewParentDriver(binary string, mtu int, ipnet *net.IPNet, disableHostLoopba } } -const opaqueTap = "slirp4netns.tap" - type parentDriver struct { binary string mtu int @@ -87,10 +85,8 @@ func (d *parentDriver) ConfigureNetwork(childPID int, stateDir string) (*common. return nil, common.Seq(cleanups), errors.Wrapf(err, "executing %v", cmd) } netmsg := common.NetworkMessage{ + Dev: tap, MTU: d.mtu, - Opaque: map[string]string{ - opaqueTap: tap, - }, } if d.ipnet != nil { // TODO: get the actual configuration via slirp4netns API? 
@@ -126,8 +122,8 @@ func NewChildDriver() network.ChildDriver { type childDriver struct { } -func (d *childDriver) ConfigureTap(netmsg common.NetworkMessage) (string, error) { - tap := netmsg.Opaque[opaqueTap] +func (d *childDriver) ConfigureNetworkChild(netmsg *common.NetworkMessage) (string, error) { + tap := netmsg.Dev if tap == "" { return "", errors.New("could not determine the preconfigured tap") } From 5a51a8de45cdefe3e80a6faf16fa71a3d4200c6b Mon Sep 17 00:00:00 2001 From: Akihiro Suda Date: Wed, 29 May 2019 15:01:38 +0900 Subject: [PATCH 2/2] rootless: use built-in port driver Signed-off-by: Akihiro Suda --- pkg/rootless/rootless.go | 5 +- .../rootlesskit/pkg/port/builtin/builtin.go | 487 ++++++++++++++++++ .../rootlesskit/pkg/port/socat/socat.go | 218 -------- 3 files changed, 490 insertions(+), 220 deletions(-) create mode 100644 vendor/github.com/rootless-containers/rootlesskit/pkg/port/builtin/builtin.go delete mode 100644 vendor/github.com/rootless-containers/rootlesskit/pkg/port/socat/socat.go diff --git a/pkg/rootless/rootless.go b/pkg/rootless/rootless.go index 398475cba1..7ab1813d9a 100644 --- a/pkg/rootless/rootless.go +++ b/pkg/rootless/rootless.go @@ -13,7 +13,7 @@ import ( "github.com/rootless-containers/rootlesskit/pkg/copyup/tmpfssymlink" "github.com/rootless-containers/rootlesskit/pkg/network/slirp4netns" "github.com/rootless-containers/rootlesskit/pkg/parent" - "github.com/rootless-containers/rootlesskit/pkg/port/socat" + portbuiltin "github.com/rootless-containers/rootlesskit/pkg/port/builtin" "github.com/sirupsen/logrus" ) @@ -103,7 +103,7 @@ func createParentOpt(stateDir string) (*parent.Opt, error) { return nil, err } opt.NetworkDriver = slirp4netns.NewParentDriver(binary, mtu, ipnet, disableHostLoopback, "") - opt.PortDriver, err = socat.NewParentDriver(&logrusDebugWriter{}) + opt.PortDriver, err = portbuiltin.NewParentDriver(&logrusDebugWriter{}, stateDir) if err != nil { return nil, err } @@ -127,6 +127,7 @@ func createChildOpt() 
(*child.Opt, error) { opt.TargetCmd = os.Args opt.PipeFDEnvKey = pipeFD opt.NetworkDriver = slirp4netns.NewChildDriver() + opt.PortDriver = portbuiltin.NewChildDriver(&logrusDebugWriter{}) opt.CopyUpDirs = []string{"/etc", "/run"} opt.CopyUpDriver = tmpfssymlink.NewChildDriver() return opt, nil diff --git a/vendor/github.com/rootless-containers/rootlesskit/pkg/port/builtin/builtin.go b/vendor/github.com/rootless-containers/rootlesskit/pkg/port/builtin/builtin.go new file mode 100644 index 0000000000..a098714c0a --- /dev/null +++ b/vendor/github.com/rootless-containers/rootlesskit/pkg/port/builtin/builtin.go @@ -0,0 +1,487 @@ +package builtin + +import ( + "context" + "fmt" + "io" + "io/ioutil" + "net" + "os" + "path/filepath" + "sync" + "syscall" + "time" + + "github.com/pkg/errors" + "golang.org/x/sys/unix" + + "github.com/rootless-containers/rootlesskit/pkg/msgutil" + "github.com/rootless-containers/rootlesskit/pkg/port" + "github.com/rootless-containers/rootlesskit/pkg/port/portutil" +) + +const ( + opaqueKeySocketPath = "builtin.socketpath" + opaqueKeyChildReadyPipePath = "builtin.readypipepath" +) + +// NewParentDriver for builtin driver. 
+func NewParentDriver(logWriter io.Writer, stateDir string) (port.ParentDriver, error) { + // TODO: consider using socketpair FD instead of socket file + socketPath := filepath.Join(stateDir, ".bp.sock") + childReadyPipePath := filepath.Join(stateDir, ".bp-ready.pipe") + // remove the path just incase the previous rootlesskit instance crashed + if err := os.RemoveAll(childReadyPipePath); err != nil { + return nil, errors.Wrapf(err, "cannot remove %s", childReadyPipePath) + } + if err := syscall.Mkfifo(childReadyPipePath, 0600); err != nil { + return nil, errors.Wrapf(err, "cannot mkfifo %s", childReadyPipePath) + } + d := driver{ + logWriter: logWriter, + socketPath: socketPath, + childReadyPipePath: childReadyPipePath, + ports: make(map[int]*port.Status, 0), + stoppers: make(map[int]func() error, 0), + nextID: 1, + } + return &d, nil +} + +type driver struct { + logWriter io.Writer + socketPath string + childReadyPipePath string + mu sync.Mutex + ports map[int]*port.Status + stoppers map[int]func() error + nextID int +} + +func (d *driver) OpaqueForChild() map[string]string { + return map[string]string{ + opaqueKeySocketPath: d.socketPath, + opaqueKeyChildReadyPipePath: d.childReadyPipePath, + } +} + +func (d *driver) RunParentDriver(initComplete chan struct{}, quit <-chan struct{}, _ *port.ChildContext) error { + childReadyPipeR, err := os.OpenFile(d.childReadyPipePath, os.O_RDONLY, os.ModeNamedPipe) + if err != nil { + return err + } + if _, err = ioutil.ReadAll(childReadyPipeR); err != nil { + return err + } + childReadyPipeR.Close() + var dialer net.Dialer + conn, err := dialer.Dial("unix", d.socketPath) + if err != nil { + return err + } + err = initiate(conn.(*net.UnixConn)) + conn.Close() + if err != nil { + return err + } + initComplete <- struct{}{} + <-quit + return nil +} + +func (d *driver) AddPort(ctx context.Context, spec port.Spec) (*port.Status, error) { + d.mu.Lock() + err := portutil.ValidatePortSpec(spec, d.ports) + d.mu.Unlock() + if err != nil 
{ + return nil, err + } + routineStopCh := make(chan struct{}) + routineStop := func() error { + close(routineStopCh) + return nil // FIXME + } + switch spec.Proto { + case "tcp": + err = startTCPRoutines(d.socketPath, spec, routineStopCh, d.logWriter) + case "udp": + err = startUDPRoutines(d.socketPath, spec, routineStopCh, d.logWriter) + default: + // NOTREACHED + return nil, errors.New("spec was not validated?") + } + if err != nil { + return nil, err + } + d.mu.Lock() + id := d.nextID + st := port.Status{ + ID: id, + Spec: spec, + } + d.ports[id] = &st + d.stoppers[id] = routineStop + d.nextID++ + d.mu.Unlock() + return &st, nil +} + +func (d *driver) ListPorts(ctx context.Context) ([]port.Status, error) { + var ports []port.Status + d.mu.Lock() + for _, p := range d.ports { + ports = append(ports, *p) + } + d.mu.Unlock() + return ports, nil +} + +func (d *driver) RemovePort(ctx context.Context, id int) error { + d.mu.Lock() + defer d.mu.Unlock() + stop, ok := d.stoppers[id] + if !ok { + return errors.Errorf("unknown id: %d", id) + } + err := stop() + delete(d.stoppers, id) + delete(d.ports, id) + return err +} + +func initiate(c *net.UnixConn) error { + req := request{ + Type: requestTypeInit, + } + if _, err := msgutil.MarshalToWriter(c, &req); err != nil { + return err + } + if err := c.CloseWrite(); err != nil { + return err + } + var rep reply + if _, err := msgutil.UnmarshalFromReader(c, &rep); err != nil { + return err + } + return c.CloseRead() +} + +func connectToChild(socketPath string, spec port.Spec) (int, error) { + var dialer net.Dialer + conn, err := dialer.Dial("unix", socketPath) + if err != nil { + return 0, err + } + defer conn.Close() + c := conn.(*net.UnixConn) + req := request{ + Type: requestTypeConnect, + Proto: spec.Proto, + Port: spec.ChildPort, + } + if _, err := msgutil.MarshalToWriter(c, &req); err != nil { + return 0, err + } + if err := c.CloseWrite(); err != nil { + return 0, err + } + oobSpace := unix.CmsgSpace(4) + oob := 
make([]byte, oobSpace) + _, oobN, _, _, err := c.ReadMsgUnix(nil, oob) + if err != nil { + return 0, err + } + if oobN != oobSpace { + return 0, errors.Errorf("expected OOB space %d, got %d", oobSpace, oobN) + } + oob = oob[:oobN] + fd, err := parseFDFromOOB(oob) + if err != nil { + return 0, err + } + if err := c.CloseRead(); err != nil { + return 0, err + } + return fd, nil +} + +func connectToChildWithRetry(socketPath string, spec port.Spec, retries int) (int, error) { + for i := 0; i < retries; i++ { + fd, err := connectToChild(socketPath, spec) + if i == retries-1 && err != nil { + return 0, err + } + if err == nil { + return fd, err + } + // TODO: backoff + time.Sleep(time.Duration(i*5) * time.Millisecond) + } + // NOT REACHED + return 0, errors.New("reached max retry") +} + +func parseFDFromOOB(oob []byte) (int, error) { + scms, err := unix.ParseSocketControlMessage(oob) + if err != nil { + return 0, err + } + if len(scms) != 1 { + return 0, errors.Errorf("unexpected scms: %v", scms) + } + scm := scms[0] + fds, err := unix.ParseUnixRights(&scm) + if err != nil { + return 0, err + } + if len(fds) != 1 { + return 0, errors.Errorf("unexpected fds: %v", fds) + } + return fds[0], nil +} + +func startTCPRoutines(socketPath string, spec port.Spec, stopCh <-chan struct{}, logWriter io.Writer) error { + ln, err := net.Listen("tcp", fmt.Sprintf("%s:%d", spec.ParentIP, spec.ParentPort)) + if err != nil { + fmt.Fprintf(logWriter, "listen: %v\n", err) + return err + } + newConns := make(chan net.Conn) + go func() { + for { + c, err := ln.Accept() + if err != nil { + fmt.Fprintf(logWriter, "accept: %v\n", err) + close(newConns) + return + } + newConns <- c + } + }() + go func() { + defer ln.Close() + for { + select { + case c, ok := <-newConns: + if !ok { + return + } + go func() { + if err := copyConnToChild(c, socketPath, spec, stopCh); err != nil { + fmt.Fprintf(logWriter, "copyConnToChild: %v\n", err) + return + } + }() + case <-stopCh: + return + } + } + }() + // no 
wait + return nil +} + +func startUDPRoutines(socketPath string, spec port.Spec, stopCh <-chan struct{}, logWriter io.Writer) error { + addr, err := net.ResolveUDPAddr("udp", fmt.Sprintf("%s:%d", spec.ParentIP, spec.ParentPort)) + if err != nil { + return err + } + c, err := net.ListenUDP("udp", addr) + if err != nil { + return err + } + go func() { + if err := copyConnToChild(c, socketPath, spec, stopCh); err != nil { + fmt.Fprintf(logWriter, "copyConnToChild: %v\n", err) + return + } + }() + // no wait + return nil +} + +func copyConnToChild(c net.Conn, socketPath string, spec port.Spec, stopCh <-chan struct{}) error { + defer c.Close() + // get fd from the child as an SCM_RIGHTS cmsg + fd, err := connectToChildWithRetry(socketPath, spec, 10) + if err != nil { + return err + } + f := os.NewFile(uintptr(fd), "") + defer f.Close() + fc, err := net.FileConn(f) + if err != nil { + return err + } + defer fc.Close() + bicopy(c, fc, stopCh) + return nil +} + +// bicopy is based on libnetwork/cmd/proxy/tcp_proxy.go . +// NOTE: sendfile(2) cannot be used for sockets +func bicopy(x, y net.Conn, quit <-chan struct{}) { + var wg sync.WaitGroup + var broker = func(to, from net.Conn) { + io.Copy(to, from) + if fromTCP, ok := from.(*net.TCPConn); ok { + fromTCP.CloseRead() + } + if toTCP, ok := to.(*net.TCPConn); ok { + toTCP.CloseWrite() + } + wg.Done() + } + + wg.Add(2) + go broker(x, y) + go broker(y, x) + finish := make(chan struct{}) + go func() { + wg.Wait() + close(finish) + }() + + select { + case <-quit: + case <-finish: + } + x.Close() + y.Close() + <-finish +} + +const ( + requestTypeInit = "init" + requestTypeConnect = "connect" +) + +// request and response are encoded as JSON with uint32le length header. 
+type request struct { + Type string // "init" or "connect" + Proto string // "tcp" or "udp" + Port int +} + +// may contain FD as OOB +type reply struct { + Error string +} + +func NewChildDriver(logWriter io.Writer) port.ChildDriver { + return &childDriver{ + logWriter: logWriter, + } +} + +type childDriver struct { + logWriter io.Writer +} + +func (d *childDriver) RunChildDriver(opaque map[string]string, quit <-chan struct{}) error { + socketPath := opaque[opaqueKeySocketPath] + if socketPath == "" { + return errors.New("socket path not set") + } + childReadyPipePath := opaque[opaqueKeyChildReadyPipePath] + if childReadyPipePath == "" { + return errors.New("child ready pipe path not set") + } + childReadyPipeW, err := os.OpenFile(childReadyPipePath, os.O_WRONLY, os.ModeNamedPipe) + if err != nil { + return err + } + ln, err := net.ListenUnix("unix", &net.UnixAddr{ + Name: socketPath, + Net: "unix", + }) + if err != nil { + return err + } + // write nothing, just close + if err = childReadyPipeW.Close(); err != nil { + return err + } + stopAccept := make(chan struct{}, 1) + go func() { + <-quit + stopAccept <- struct{}{} + ln.Close() + }() + for { + c, err := ln.AcceptUnix() + if err != nil { + select { + case <-stopAccept: + return nil + default: + } + return err + } + go func() { + if rerr := d.routine(c); rerr != nil { + rep := reply{ + Error: rerr.Error(), + } + msgutil.MarshalToWriter(c, &rep) + } + c.Close() + }() + } + return nil +} + +func (d *childDriver) routine(c *net.UnixConn) error { + var req request + if _, err := msgutil.UnmarshalFromReader(c, &req); err != nil { + return err + } + switch req.Type { + case requestTypeInit: + return d.handleConnectInit(c, &req) + case requestTypeConnect: + return d.handleConnectRequest(c, &req) + default: + return errors.Errorf("unknown request type %q", req.Type) + } +} + +func (d *childDriver) handleConnectInit(c *net.UnixConn, req *request) error { + _, err := msgutil.MarshalToWriter(c, nil) + return err +} + 
+func (d *childDriver) handleConnectRequest(c *net.UnixConn, req *request) error { + switch req.Proto { + case "tcp": + case "udp": + default: + return errors.Errorf("unknown proto: %q", req.Proto) + } + var dialer net.Dialer + targetConn, err := dialer.Dial(req.Proto, fmt.Sprintf("127.0.0.1:%d", req.Port)) + if err != nil { + return err + } + defer targetConn.Close() // no effect on duplicated FD + targetConnFiler, ok := targetConn.(filer) + if !ok { + return errors.Errorf("unknown target connection: %+v", targetConn) + } + targetConnFile, err := targetConnFiler.File() + if err != nil { + return err + } + oob := unix.UnixRights(int(targetConnFile.Fd())) + f, err := c.File() + if err != nil { + return err + } + err = unix.Sendmsg(int(f.Fd()), []byte("dummy"), oob, nil, 0) + return err +} + +// filer is implemented by *net.TCPConn and *net.UDPConn +type filer interface { + File() (f *os.File, err error) +} diff --git a/vendor/github.com/rootless-containers/rootlesskit/pkg/port/socat/socat.go b/vendor/github.com/rootless-containers/rootlesskit/pkg/port/socat/socat.go deleted file mode 100644 index 9dbb0a5c3a..0000000000 --- a/vendor/github.com/rootless-containers/rootlesskit/pkg/port/socat/socat.go +++ /dev/null @@ -1,218 +0,0 @@ -package socat - -import ( - "context" - "fmt" - "io" - "net" - "os" - "os/exec" - "sync" - "syscall" - "time" - - "github.com/pkg/errors" - - "github.com/rootless-containers/rootlesskit/pkg/port" - "github.com/rootless-containers/rootlesskit/pkg/port/portutil" -) - -func NewParentDriver(logWriter io.Writer) (port.ParentDriver, error) { - if _, err := exec.LookPath("socat"); err != nil { - return nil, err - } - if _, err := exec.LookPath("nsenter"); err != nil { - return nil, err - } - d := driver{ - logWriter: logWriter, - ports: make(map[int]*port.Status, 0), - stoppers: make(map[int]func() error, 0), - nextID: 1, - } - return &d, nil -} - -type driver struct { - logWriter io.Writer - mu sync.Mutex - childPID int - ports 
map[int]*port.Status - stoppers map[int]func() error - nextID int -} - -func (d *driver) OpaqueForChild() map[string]string { - // NOP, as this driver does not have child-side logic. - return nil -} - -func (d *driver) RunParentDriver(initComplete chan struct{}, quit <-chan struct{}, cctx *port.ChildContext) error { - if cctx == nil || cctx.PID <= 0 { - return errors.New("child PID not set") - } - d.childPID = cctx.PID - initComplete <- struct{}{} - <-quit - return nil -} - -func (d *driver) AddPort(ctx context.Context, spec port.Spec) (*port.Status, error) { - if d.childPID <= 0 { - return nil, errors.New("child PID not set") - } - d.mu.Lock() - err := portutil.ValidatePortSpec(spec, d.ports) - d.mu.Unlock() - if err != nil { - return nil, err - } - cf := func() (*exec.Cmd, error) { - return createSocatCmd(ctx, spec, d.logWriter, d.childPID) - } - routineErrorCh := make(chan error) - routineStopCh := make(chan struct{}) - routineStop := func() error { - close(routineStopCh) - return <-routineErrorCh - } - go portRoutine(cf, routineStopCh, routineErrorCh, d.logWriter) - d.mu.Lock() - id := d.nextID - st := port.Status{ - ID: id, - Spec: spec, - } - d.ports[id] = &st - d.stoppers[id] = routineStop - d.nextID++ - d.mu.Unlock() - return &st, nil -} - -func (d *driver) ListPorts(ctx context.Context) ([]port.Status, error) { - var ports []port.Status - d.mu.Lock() - for _, p := range d.ports { - ports = append(ports, *p) - } - d.mu.Unlock() - return ports, nil -} - -func (d *driver) RemovePort(ctx context.Context, id int) error { - d.mu.Lock() - defer d.mu.Unlock() - stop, ok := d.stoppers[id] - if !ok { - return errors.Errorf("unknown port id: %d", id) - } - err := stop() - delete(d.stoppers, id) - delete(d.ports, id) - return err -} - -func createSocatCmd(ctx context.Context, spec port.Spec, logWriter io.Writer, childPID int) (*exec.Cmd, error) { - if spec.Proto != "tcp" && spec.Proto != "udp" { - return nil, errors.Errorf("unsupported proto: %s", spec.Proto) - } - 
ipStr := "0.0.0.0" - if spec.ParentIP != "" { - ip := net.ParseIP(spec.ParentIP) - if ip == nil { - return nil, errors.Errorf("unsupported parentIP: %s", spec.ParentIP) - } - ip = ip.To4() - if ip == nil { - return nil, errors.Errorf("unsupported parentIP (v6?): %s", spec.ParentIP) - } - ipStr = ip.String() - } - if spec.ParentPort < 1 || spec.ParentPort > 65535 { - return nil, errors.Errorf("unsupported parentPort: %d", spec.ParentPort) - } - if spec.ChildPort < 1 || spec.ChildPort > 65535 { - return nil, errors.Errorf("unsupported childPort: %d", spec.ChildPort) - } - var cmd *exec.Cmd - switch spec.Proto { - case "tcp": - cmd = exec.CommandContext(ctx, - "socat", - fmt.Sprintf("TCP-LISTEN:%d,bind=%s,reuseaddr,fork,rcvbuf=65536,sndbuf=65536", spec.ParentPort, ipStr), - fmt.Sprintf("EXEC:\"%s\",nofork", - fmt.Sprintf("nsenter -U -n --preserve-credentials -t %d socat STDIN TCP4:127.0.0.1:%d", childPID, spec.ChildPort))) - case "udp": - cmd = exec.CommandContext(ctx, - "socat", - fmt.Sprintf("UDP-LISTEN:%d,bind=%s,reuseaddr,fork,rcvbuf=65536,sndbuf=65536", spec.ParentPort, ipStr), - fmt.Sprintf("EXEC:\"%s\",nofork", - fmt.Sprintf("nsenter -U -n --preserve-credentials -t %d socat STDIN UDP4:127.0.0.1:%d", childPID, spec.ChildPort))) - } - cmd.Env = os.Environ() - cmd.Stdout = logWriter - cmd.Stderr = logWriter - cmd.SysProcAttr = &syscall.SysProcAttr{ - Pdeathsig: syscall.SIGKILL, - } - return cmd, nil -} - -type cmdFactory func() (*exec.Cmd, error) - -func portRoutine(cf cmdFactory, stopCh <-chan struct{}, errWCh chan error, logWriter io.Writer) { - retry := 0 - doneCh := make(chan error) - for { - cmd, err := cf() - if err != nil { - errWCh <- err - return - } - cmdDesc := fmt.Sprintf("%s %v", cmd.Path, cmd.Args) - fmt.Fprintf(logWriter, "[exec] starting cmd %s\n", cmdDesc) - if err := cmd.Start(); err != nil { - errWCh <- err - return - } - pid := cmd.Process.Pid - go func() { - err := cmd.Wait() - doneCh <- err - }() - select { - case err := <-doneCh: - // even 
if err == nil (unexpected for socat), continue the loop - retry++ - sleepDuration := time.Duration((retry*100)%(30*1000)) * time.Millisecond - fmt.Fprintf(logWriter, "[exec] retrying cmd %s after sleeping %v, count=%d, err=%v\n", - cmdDesc, sleepDuration, retry, err) - select { - case <-time.After(sleepDuration): - case <-stopCh: - errWCh <- err - return - } - case <-stopCh: - fmt.Fprintf(logWriter, "[exec] killing cmd %s pid %d\n", cmdDesc, pid) - syscall.Kill(pid, syscall.SIGKILL) - fmt.Fprintf(logWriter, "[exec] killed cmd %s pid %d\n", cmdDesc, pid) - close(errWCh) - return - } - } -} - -func NewChildDriver() port.ChildDriver { - return &childDriver{} -} - -type childDriver struct { -} - -func (d *childDriver) RunChildDriver(opaque map[string]string, quit <-chan struct{}) error { - // NOP - <-quit - return nil -}