Merge pull request #72114 from sjenning/bump-runc

vendor: bump runc to f000fe11
pull/564/head
Kubernetes Prow Robot 2018-12-18 21:27:49 -08:00 committed by GitHub
commit 774ac6408d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
38 changed files with 1152 additions and 681 deletions

64
Godeps/Godeps.json generated
View File

@ -2795,83 +2795,83 @@
}, },
{ {
"ImportPath": "github.com/opencontainers/runc/libcontainer", "ImportPath": "github.com/opencontainers/runc/libcontainer",
"Comment": "v1.0.0-rc5-46-g871ba2e58e2431", "Comment": "v1.0.0-rc5-176-gf000fe11ece1b7",
"Rev": "871ba2e58e24314d1fab4517a80410191ba5ad01" "Rev": "f000fe11ece1b79f744edd9c8e1a53ba0f5e0f24"
}, },
{ {
"ImportPath": "github.com/opencontainers/runc/libcontainer/apparmor", "ImportPath": "github.com/opencontainers/runc/libcontainer/apparmor",
"Comment": "v1.0.0-rc5-46-g871ba2e58e2431", "Comment": "v1.0.0-rc5-176-gf000fe11ece1b7",
"Rev": "871ba2e58e24314d1fab4517a80410191ba5ad01" "Rev": "f000fe11ece1b79f744edd9c8e1a53ba0f5e0f24"
}, },
{ {
"ImportPath": "github.com/opencontainers/runc/libcontainer/cgroups", "ImportPath": "github.com/opencontainers/runc/libcontainer/cgroups",
"Comment": "v1.0.0-rc5-46-g871ba2e58e2431", "Comment": "v1.0.0-rc5-176-gf000fe11ece1b7",
"Rev": "871ba2e58e24314d1fab4517a80410191ba5ad01" "Rev": "f000fe11ece1b79f744edd9c8e1a53ba0f5e0f24"
}, },
{ {
"ImportPath": "github.com/opencontainers/runc/libcontainer/cgroups/fs", "ImportPath": "github.com/opencontainers/runc/libcontainer/cgroups/fs",
"Comment": "v1.0.0-rc5-46-g871ba2e58e2431", "Comment": "v1.0.0-rc5-176-gf000fe11ece1b7",
"Rev": "871ba2e58e24314d1fab4517a80410191ba5ad01" "Rev": "f000fe11ece1b79f744edd9c8e1a53ba0f5e0f24"
}, },
{ {
"ImportPath": "github.com/opencontainers/runc/libcontainer/cgroups/systemd", "ImportPath": "github.com/opencontainers/runc/libcontainer/cgroups/systemd",
"Comment": "v1.0.0-rc5-46-g871ba2e58e2431", "Comment": "v1.0.0-rc5-176-gf000fe11ece1b7",
"Rev": "871ba2e58e24314d1fab4517a80410191ba5ad01" "Rev": "f000fe11ece1b79f744edd9c8e1a53ba0f5e0f24"
}, },
{ {
"ImportPath": "github.com/opencontainers/runc/libcontainer/configs", "ImportPath": "github.com/opencontainers/runc/libcontainer/configs",
"Comment": "v1.0.0-rc5-46-g871ba2e58e2431", "Comment": "v1.0.0-rc5-176-gf000fe11ece1b7",
"Rev": "871ba2e58e24314d1fab4517a80410191ba5ad01" "Rev": "f000fe11ece1b79f744edd9c8e1a53ba0f5e0f24"
}, },
{ {
"ImportPath": "github.com/opencontainers/runc/libcontainer/configs/validate", "ImportPath": "github.com/opencontainers/runc/libcontainer/configs/validate",
"Comment": "v1.0.0-rc5-46-g871ba2e58e2431", "Comment": "v1.0.0-rc5-176-gf000fe11ece1b7",
"Rev": "871ba2e58e24314d1fab4517a80410191ba5ad01" "Rev": "f000fe11ece1b79f744edd9c8e1a53ba0f5e0f24"
}, },
{ {
"ImportPath": "github.com/opencontainers/runc/libcontainer/criurpc", "ImportPath": "github.com/opencontainers/runc/libcontainer/criurpc",
"Comment": "v1.0.0-rc5-46-g871ba2e58e2431", "Comment": "v1.0.0-rc5-176-gf000fe11ece1b7",
"Rev": "871ba2e58e24314d1fab4517a80410191ba5ad01" "Rev": "f000fe11ece1b79f744edd9c8e1a53ba0f5e0f24"
}, },
{ {
"ImportPath": "github.com/opencontainers/runc/libcontainer/intelrdt", "ImportPath": "github.com/opencontainers/runc/libcontainer/intelrdt",
"Comment": "v1.0.0-rc5-46-g871ba2e58e2431", "Comment": "v1.0.0-rc5-176-gf000fe11ece1b7",
"Rev": "871ba2e58e24314d1fab4517a80410191ba5ad01" "Rev": "f000fe11ece1b79f744edd9c8e1a53ba0f5e0f24"
}, },
{ {
"ImportPath": "github.com/opencontainers/runc/libcontainer/keys", "ImportPath": "github.com/opencontainers/runc/libcontainer/keys",
"Comment": "v1.0.0-rc5-46-g871ba2e58e2431", "Comment": "v1.0.0-rc5-176-gf000fe11ece1b7",
"Rev": "871ba2e58e24314d1fab4517a80410191ba5ad01" "Rev": "f000fe11ece1b79f744edd9c8e1a53ba0f5e0f24"
}, },
{ {
"ImportPath": "github.com/opencontainers/runc/libcontainer/mount", "ImportPath": "github.com/opencontainers/runc/libcontainer/mount",
"Comment": "v1.0.0-rc5-46-g871ba2e58e2431", "Comment": "v1.0.0-rc5-176-gf000fe11ece1b7",
"Rev": "871ba2e58e24314d1fab4517a80410191ba5ad01" "Rev": "f000fe11ece1b79f744edd9c8e1a53ba0f5e0f24"
}, },
{ {
"ImportPath": "github.com/opencontainers/runc/libcontainer/seccomp", "ImportPath": "github.com/opencontainers/runc/libcontainer/seccomp",
"Comment": "v1.0.0-rc5-46-g871ba2e58e2431", "Comment": "v1.0.0-rc5-176-gf000fe11ece1b7",
"Rev": "871ba2e58e24314d1fab4517a80410191ba5ad01" "Rev": "f000fe11ece1b79f744edd9c8e1a53ba0f5e0f24"
}, },
{ {
"ImportPath": "github.com/opencontainers/runc/libcontainer/stacktrace", "ImportPath": "github.com/opencontainers/runc/libcontainer/stacktrace",
"Comment": "v1.0.0-rc5-46-g871ba2e58e2431", "Comment": "v1.0.0-rc5-176-gf000fe11ece1b7",
"Rev": "871ba2e58e24314d1fab4517a80410191ba5ad01" "Rev": "f000fe11ece1b79f744edd9c8e1a53ba0f5e0f24"
}, },
{ {
"ImportPath": "github.com/opencontainers/runc/libcontainer/system", "ImportPath": "github.com/opencontainers/runc/libcontainer/system",
"Comment": "v1.0.0-rc5-46-g871ba2e58e2431", "Comment": "v1.0.0-rc5-176-gf000fe11ece1b7",
"Rev": "871ba2e58e24314d1fab4517a80410191ba5ad01" "Rev": "f000fe11ece1b79f744edd9c8e1a53ba0f5e0f24"
}, },
{ {
"ImportPath": "github.com/opencontainers/runc/libcontainer/user", "ImportPath": "github.com/opencontainers/runc/libcontainer/user",
"Comment": "v1.0.0-rc5-46-g871ba2e58e2431", "Comment": "v1.0.0-rc5-176-gf000fe11ece1b7",
"Rev": "871ba2e58e24314d1fab4517a80410191ba5ad01" "Rev": "f000fe11ece1b79f744edd9c8e1a53ba0f5e0f24"
}, },
{ {
"ImportPath": "github.com/opencontainers/runc/libcontainer/utils", "ImportPath": "github.com/opencontainers/runc/libcontainer/utils",
"Comment": "v1.0.0-rc5-46-g871ba2e58e2431", "Comment": "v1.0.0-rc5-176-gf000fe11ece1b7",
"Rev": "871ba2e58e24314d1fab4517a80410191ba5ad01" "Rev": "f000fe11ece1b79f744edd9c8e1a53ba0f5e0f24"
}, },
{ {
"ImportPath": "github.com/opencontainers/runtime-spec/specs-go", "ImportPath": "github.com/opencontainers/runtime-spec/specs-go",

View File

@ -53,6 +53,7 @@ go_library(
"//vendor/github.com/opencontainers/runc/libcontainer/system:go_default_library", "//vendor/github.com/opencontainers/runc/libcontainer/system:go_default_library",
"//vendor/github.com/opencontainers/runc/libcontainer/user:go_default_library", "//vendor/github.com/opencontainers/runc/libcontainer/user:go_default_library",
"//vendor/github.com/opencontainers/selinux/go-selinux/label:go_default_library", "//vendor/github.com/opencontainers/selinux/go-selinux/label:go_default_library",
"//vendor/github.com/pkg/errors:go_default_library",
"//vendor/github.com/sirupsen/logrus:go_default_library", "//vendor/github.com/sirupsen/logrus:go_default_library",
"//vendor/github.com/syndtr/gocapability/capability:go_default_library", "//vendor/github.com/syndtr/gocapability/capability:go_default_library",
"//vendor/github.com/vishvananda/netlink:go_default_library", "//vendor/github.com/vishvananda/netlink:go_default_library",

View File

@ -323,6 +323,7 @@ generated when building libcontainer with docker.
## Copyright and license ## Copyright and license
Code and documentation copyright 2014 Docker, inc. Code released under the Apache 2.0 license. Code and documentation copyright 2014 Docker, inc.
Docs released under Creative commons. The code and documentation are released under the [Apache 2.0 license](../LICENSE).
The documentation is also released under Creative Commons Attribution 4.0 International License.
You may obtain a copy of the license, titled CC-BY-4.0, at http://creativecommons.org/licenses/by/4.0/.

View File

@ -156,17 +156,21 @@ init process will block waiting for the parent to finish setup.
### IntelRdt ### IntelRdt
Intel platforms with new Xeon CPU support Intel Resource Director Technology Intel platforms with new Xeon CPU support Resource Director Technology (RDT).
(RDT). Cache Allocation Technology (CAT) is a sub-feature of RDT, which Cache Allocation Technology (CAT) and Memory Bandwidth Allocation (MBA) are
currently supports L3 cache resource allocation. two sub-features of RDT.
This feature provides a way for the software to restrict cache allocation to a Cache Allocation Technology (CAT) provides a way for the software to restrict
defined 'subset' of L3 cache which may be overlapping with other 'subsets'. cache allocation to a defined 'subset' of L3 cache which may be overlapping
The different subsets are identified by class of service (CLOS) and each CLOS with other 'subsets'. The different subsets are identified by class of
has a capacity bitmask (CBM). service (CLOS) and each CLOS has a capacity bitmask (CBM).
It can be used to handle L3 cache resource allocation for containers if Memory Bandwidth Allocation (MBA) provides indirect and approximate throttle
hardware and kernel support Intel RDT/CAT. over memory bandwidth for the software. A user controls the resource by
indicating the percentage of maximum memory bandwidth.
It can be used to handle L3 cache and memory bandwidth resource allocation
for containers if the hardware and kernel support the Intel RDT CAT and MBA features.
In Linux 4.10 kernel or newer, the interface is defined and exposed via In Linux 4.10 kernel or newer, the interface is defined and exposed via
"resource control" filesystem, which is a "cgroup-like" interface. "resource control" filesystem, which is a "cgroup-like" interface.
@ -175,6 +179,9 @@ Comparing with cgroups, it has similar process management lifecycle and
interfaces in a container. But unlike cgroups' hierarchy, it has single level interfaces in a container. But unlike cgroups' hierarchy, it has single level
filesystem layout. filesystem layout.
The CAT and MBA features were introduced in the Linux 4.10 and 4.12 kernels,
respectively, via the "resource control" filesystem.
Intel RDT "resource control" filesystem hierarchy: Intel RDT "resource control" filesystem hierarchy:
``` ```
mount -t resctrl resctrl /sys/fs/resctrl mount -t resctrl resctrl /sys/fs/resctrl
@ -182,58 +189,84 @@ tree /sys/fs/resctrl
/sys/fs/resctrl/ /sys/fs/resctrl/
|-- info |-- info
| |-- L3 | |-- L3
| |-- cbm_mask | | |-- cbm_mask
| |-- min_cbm_bits | | |-- min_cbm_bits
| | |-- num_closids
| |-- MB
| |-- bandwidth_gran
| |-- delay_linear
| |-- min_bandwidth
| |-- num_closids | |-- num_closids
|-- cpus |-- ...
|-- schemata |-- schemata
|-- tasks |-- tasks
|-- <container_id> |-- <container_id>
|-- cpus |-- ...
|-- schemata |-- schemata
|-- tasks |-- tasks
``` ```
For runc, we can make use of `tasks` and `schemata` configuration for L3 cache For runc, we can make use of `tasks` and `schemata` configuration for L3
resource constraints. cache and memory bandwidth resources constraints.
The file `tasks` has a list of tasks that belongs to this group (e.g., The file `tasks` has a list of tasks that belongs to this group (e.g.,
<container_id>" group). Tasks can be added to a group by writing the task ID <container_id>" group). Tasks can be added to a group by writing the task ID
to the "tasks" file (which will automatically remove them from the previous to the "tasks" file (which will automatically remove them from the previous
group to which they belonged). New tasks created by fork(2) and clone(2) are group to which they belonged). New tasks created by fork(2) and clone(2) are
added to the same group as their parent. If a pid is not in any sub group, it added to the same group as their parent.
is in root group.
The file `schemata` has allocation masks/values for L3 cache on each socket, The file `schemata` has a list of all the resources available to this group.
which contains L3 cache id and capacity bitmask (CBM). Each resource (L3 cache, memory bandwidth) has its own line and format.
L3 cache schema:
It has allocation bitmasks/values for L3 cache on each socket, which
contains L3 cache id and capacity bitmask (CBM).
``` ```
Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..." Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
``` ```
For example, on a two-socket machine, L3's schema line could be `L3:0=ff;1=c0` For example, on a two-socket machine, the schema line could be "L3:0=ff;1=c0"
Which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0. which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0.
The valid L3 cache CBM is a *contiguous bits set* and number of bits that can The valid L3 cache CBM is a *contiguous bits set* and number of bits that can
be set is less than the max bit. The max bits in the CBM is varied among be set is less than the max bit. The max bits in the CBM is varied among
supported Intel Xeon platforms. In Intel RDT "resource control" filesystem supported Intel CPU models. Kernel will check if it is valid when writing.
layout, the CBM in a group should be a subset of the CBM in root. Kernel will e.g., default value 0xfffff in root indicates the max bits of CBM is 20
check if it is valid when writing. e.g., 0xfffff in root indicates the max bits bits, which mapping to entire L3 cache capacity. Some valid CBM values to
of CBM is 20 bits, which mapping to entire L3 cache capacity. Some valid CBM set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc.
values to set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc.
For more information about Intel RDT/CAT kernel interface: Memory bandwidth schema:
It has allocation values for memory bandwidth on each socket, which contains
L3 cache id and memory bandwidth percentage.
```
Format: "MB:<cache_id0>=bandwidth0;<cache_id1>=bandwidth1;..."
```
For example, on a two-socket machine, the schema line could be "MB:0=20;1=70"
The minimum bandwidth percentage value for each CPU model is predefined and
can be looked up through "info/MB/min_bandwidth". The bandwidth granularity
that is allocated is also dependent on the CPU model and can be looked up at
"info/MB/bandwidth_gran". The available bandwidth control steps are:
min_bw + N * bw_gran. Intermediate values are rounded to the next control
step available on the hardware.
For more information about Intel RDT kernel interface:
https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt
An example for runc:
``` ```
An example for runc:
Consider a two-socket machine with two L3 caches where the default CBM is Consider a two-socket machine with two L3 caches where the default CBM is
0xfffff and the max CBM length is 20 bits. With this configuration, tasks 0x7ff and the max CBM length is 11 bits, and minimum memory bandwidth of 10%
inside the container only have access to the "upper" 80% of L3 cache id 0 and with a memory bandwidth granularity of 10%.
the "lower" 50% L3 cache id 1:
Tasks inside the container only have access to the "upper" 7/11 of L3 cache
on socket 0 and the "lower" 5/11 L3 cache on socket 1, and may use a
maximum memory bandwidth of 20% on socket 0 and 70% on socket 1.
"linux": { "linux": {
"intelRdt": { "intelRdt": {
"l3CacheSchema": "L3:0=ffff0;1=3ff" "closID": "guaranteed_group",
"l3CacheSchema": "L3:0=7f0;1=1f",
"memBwSchema": "MB:0=20;1=70"
} }
} }
``` ```

View File

@ -12,6 +12,7 @@ go_library(
"freezer.go", "freezer.go",
"fs_unsupported.go", "fs_unsupported.go",
"hugetlb.go", "hugetlb.go",
"kmem.go",
"memory.go", "memory.go",
"name.go", "name.go",
"net_cls.go", "net_cls.go",
@ -29,6 +30,7 @@ go_library(
"//vendor/github.com/opencontainers/runc/libcontainer/configs:go_default_library", "//vendor/github.com/opencontainers/runc/libcontainer/configs:go_default_library",
"//vendor/github.com/opencontainers/runc/libcontainer/system:go_default_library", "//vendor/github.com/opencontainers/runc/libcontainer/system:go_default_library",
"//vendor/github.com/opencontainers/runc/libcontainer/utils:go_default_library", "//vendor/github.com/opencontainers/runc/libcontainer/utils:go_default_library",
"//vendor/github.com/pkg/errors:go_default_library",
"//vendor/golang.org/x/sys/unix:go_default_library", "//vendor/golang.org/x/sys/unix:go_default_library",
], ],
"//conditions:default": [], "//conditions:default": [],

View File

@ -3,7 +3,6 @@
package fs package fs
import ( import (
"errors"
"fmt" "fmt"
"io" "io"
"io/ioutil" "io/ioutil"
@ -14,6 +13,8 @@ import (
"github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/configs"
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils" libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
) )
var ( var (
@ -35,7 +36,7 @@ var (
HugePageSizes, _ = cgroups.GetHugePageSize() HugePageSizes, _ = cgroups.GetHugePageSize()
) )
var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist") var errSubsystemDoesNotExist = fmt.Errorf("cgroup: subsystem does not exist")
type subsystemSet []subsystem type subsystemSet []subsystem
@ -64,6 +65,7 @@ type subsystem interface {
type Manager struct { type Manager struct {
mu sync.Mutex mu sync.Mutex
Cgroups *configs.Cgroup Cgroups *configs.Cgroup
Rootless bool // ignore permission-related errors
Paths map[string]string Paths map[string]string
} }
@ -100,6 +102,33 @@ type cgroupData struct {
pid int pid int
} }
// isIgnorableError reports whether err may be skipped because it amounts to a
// permission problem. Nothing is ever ignorable unless rootless is true. In
// rootless mode an error qualifies when its root cause satisfies
// os.IsPermission, or when it is an *os.PathError, *os.LinkError or
// *os.SyscallError wrapping EROFS, EPERM or EACCES (a read-only or
// inaccessible hierarchy is treated like a permission failure for an
// unprivileged user).
func isIgnorableError(rootless bool, err error) bool {
	// Errors are never ignored when running as root.
	if !rootless {
		return false
	}

	// Strip any wrapping so the checks below look at the underlying error.
	cause := errors.Cause(err)

	// The plain permission-denied case.
	if os.IsPermission(cause) {
		return true
	}

	// Otherwise dig the errno out of the os-level wrapper types.
	var wrapped error
	switch e := cause.(type) {
	case *os.PathError:
		wrapped = e.Err
	case *os.LinkError:
		wrapped = e.Err
	case *os.SyscallError:
		wrapped = e.Err
	default:
		// No errno to inspect, so this is not a permission-like failure.
		return false
	}

	if wrapped == unix.EROFS {
		return true
	}
	if wrapped == unix.EPERM {
		return true
	}
	return wrapped == unix.EACCES
}
func (m *Manager) Apply(pid int) (err error) { func (m *Manager) Apply(pid int) (err error) {
if m.Cgroups == nil { if m.Cgroups == nil {
return nil return nil
@ -145,11 +174,11 @@ func (m *Manager) Apply(pid int) (err error) {
m.Paths[sys.Name()] = p m.Paths[sys.Name()] = p
if err := sys.Apply(d); err != nil { if err := sys.Apply(d); err != nil {
if os.IsPermission(err) && m.Cgroups.Path == "" { // In the case of rootless (including euid=0 in userns), where an explicit cgroup path hasn't
// If we didn't set a cgroup path, then let's defer the error here // been set, we don't bail on error in case of permission problems.
// until we know whether we have set limits or not. // Cases where limits have been set (and we couldn't create our own
// If we hadn't set limits, then it's ok that we couldn't join this cgroup, because // cgroup) are handled by Set.
// it will have the same limits as its parent. if isIgnorableError(m.Rootless, err) && m.Cgroups.Path == "" {
delete(m.Paths, sys.Name()) delete(m.Paths, sys.Name())
continue continue
} }
@ -207,9 +236,16 @@ func (m *Manager) Set(container *configs.Config) error {
for _, sys := range subsystems { for _, sys := range subsystems {
path := paths[sys.Name()] path := paths[sys.Name()]
if err := sys.Set(path, container.Cgroups); err != nil { if err := sys.Set(path, container.Cgroups); err != nil {
if m.Rootless && sys.Name() == "devices" {
continue
}
// When m.Rootless is true, errors from the device subsystem are ignored because it is really not expected to work.
// However, errors from other subsystems are not ignored.
// see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error"
if path == "" { if path == "" {
// cgroup never applied // We never created a path for this cgroup, so we cannot set
return fmt.Errorf("cannot set limits on the %s cgroup, as the container has not joined it", sys.Name()) // limits for it (though we have already tried at this point).
return fmt.Errorf("cannot set %s limit: container could not join or create cgroup", sys.Name())
} }
return err return err
} }

View File

@ -46,11 +46,7 @@ func (s *CpuGroup) ApplyDir(path string, cgroup *configs.Cgroup, pid int) error
} }
// because we are not using d.join we need to place the pid into the procs file // because we are not using d.join we need to place the pid into the procs file
// unlike the other subsystems // unlike the other subsystems
if err := cgroups.WriteCgroupProc(path, pid); err != nil { return cgroups.WriteCgroupProc(path, pid)
return err
}
return nil
} }
func (s *CpuGroup) SetRtSched(path string, cgroup *configs.Cgroup) error { func (s *CpuGroup) SetRtSched(path string, cgroup *configs.Cgroup) error {
@ -83,11 +79,7 @@ func (s *CpuGroup) Set(path string, cgroup *configs.Cgroup) error {
return err return err
} }
} }
if err := s.SetRtSched(path, cgroup); err != nil { return s.SetRtSched(path, cgroup)
return err
}
return nil
} }
func (s *CpuGroup) Remove(d *cgroupData) error { func (s *CpuGroup) Remove(d *cgroupData) error {

View File

@ -77,18 +77,14 @@ func (s *CpusetGroup) ApplyDir(dir string, cgroup *configs.Cgroup, pid int) erro
// The logic is, if user specified cpuset configs, use these // The logic is, if user specified cpuset configs, use these
// specified configs, otherwise, inherit from parent. This makes // specified configs, otherwise, inherit from parent. This makes
// cpuset configs work correctly with 'cpuset.cpu_exclusive', and // cpuset configs work correctly with 'cpuset.cpu_exclusive', and
// keep backward compatbility. // keep backward compatibility.
if err := s.ensureCpusAndMems(dir, cgroup); err != nil { if err := s.ensureCpusAndMems(dir, cgroup); err != nil {
return err return err
} }
// because we are not using d.join we need to place the pid into the procs file // because we are not using d.join we need to place the pid into the procs file
// unlike the other subsystems // unlike the other subsystems
if err := cgroups.WriteCgroupProc(dir, pid); err != nil { return cgroups.WriteCgroupProc(dir, pid)
return err
}
return nil
} }
func (s *CpusetGroup) getSubsystemSettings(parent string) (cpus []byte, mems []byte, err error) { func (s *CpusetGroup) getSubsystemSettings(parent string) (cpus []byte, mems []byte, err error) {

View File

@ -0,0 +1,55 @@
// +build linux,!nokmem
package fs
import (
"fmt"
"io/ioutil"
"os"
"path/filepath"
"strconv"
"syscall" // for Errno type only
"github.com/opencontainers/runc/libcontainer/cgroups"
"golang.org/x/sys/unix"
)
const cgroupKernelMemoryLimit = "memory.kmem.limit_in_bytes"
// EnableKernelMemoryAccounting turns on kernel memory accounting for the
// cgroup at path by writing a limit of 1 and then -1 through setKernelMemory.
// It returns the first error encountered, or nil if both writes succeed.
func EnableKernelMemoryAccounting(path string) error {
	// Kernel memory is not accounted at all until a limit has been set on
	// the cgroup, and a limit can no longer be set once the cgroup has
	// children or already contains tasks. So set a limit now (1), then
	// write -1 straight afterwards.
	if err := setKernelMemory(path, 1); err != nil {
		return err
	}
	return setKernelMemory(path, -1)
}
// setKernelMemory writes kernelMemoryLimit to the memory.kmem.limit_in_bytes
// file of the cgroup directory path. An empty path is an error. If the limit
// file does not exist the call is a no-op and returns nil. A write failing
// with EBUSY is reported with a dedicated message; any other write failure is
// wrapped in a generic one.
func setKernelMemory(path string, kernelMemoryLimit int64) error {
	if path == "" {
		return fmt.Errorf("no such directory for %s", cgroupKernelMemoryLimit)
	}

	limitFile := filepath.Join(path, cgroupKernelMemoryLimit)
	if !cgroups.PathExists(limitFile) {
		// kernel memory is not enabled on the system so there is nothing to do
		return nil
	}

	value := strconv.FormatInt(kernelMemoryLimit, 10)
	writeErr := ioutil.WriteFile(limitFile, []byte(value), 0700)
	if writeErr == nil {
		return nil
	}

	// EBUSY is returned on attempts to write to memory.kmem.limit_in_bytes
	// when the cgroup has children or once tasks have been attached to it.
	if pathErr, isPathErr := writeErr.(*os.PathError); isPathErr {
		if code, isErrno := pathErr.Err.(syscall.Errno); isErrno && code == unix.EBUSY {
			return fmt.Errorf("failed to set %s, because either tasks have already joined this cgroup or it has children", cgroupKernelMemoryLimit)
		}
	}
	return fmt.Errorf("failed to write %v to %v: %v", kernelMemoryLimit, cgroupKernelMemoryLimit, writeErr)
}

View File

@ -0,0 +1,11 @@
// +build linux,nokmem
package fs
// EnableKernelMemoryAccounting is the variant compiled under the
// "linux,nokmem" build constraint. It does nothing with path and always
// returns nil.
func EnableKernelMemoryAccounting(path string) error {
return nil
}
// setKernelMemory is the variant compiled under the "linux,nokmem" build
// constraint. Both path and kernelMemoryLimit are ignored and nil is always
// returned.
func setKernelMemory(path string, kernelMemoryLimit int64) error {
return nil
}

View File

@ -5,21 +5,16 @@ package fs
import ( import (
"bufio" "bufio"
"fmt" "fmt"
"io/ioutil"
"os" "os"
"path/filepath" "path/filepath"
"strconv" "strconv"
"strings" "strings"
"syscall" // only for Errno
"github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/configs"
"golang.org/x/sys/unix"
) )
const ( const (
cgroupKernelMemoryLimit = "memory.kmem.limit_in_bytes"
cgroupMemorySwapLimit = "memory.memsw.limit_in_bytes" cgroupMemorySwapLimit = "memory.memsw.limit_in_bytes"
cgroupMemoryLimit = "memory.limit_in_bytes" cgroupMemoryLimit = "memory.limit_in_bytes"
) )
@ -67,44 +62,6 @@ func (s *MemoryGroup) Apply(d *cgroupData) (err error) {
return nil return nil
} }
// EnableKernelMemoryAccounting turns on kernel memory accounting for the
// cgroup at path by writing a limit of 1 and then -1 through setKernelMemory.
// It returns the first error encountered, or nil if both writes succeed.
func EnableKernelMemoryAccounting(path string) error {
	// Kernel memory is not accounted at all until a limit has been set on
	// the cgroup, and a limit can no longer be set once the cgroup has
	// children or already contains tasks. So set a limit now (1), then
	// write -1 straight afterwards.
	if err := setKernelMemory(path, 1); err != nil {
		return err
	}
	return setKernelMemory(path, -1)
}
// setKernelMemory writes kernelMemoryLimit to the memory.kmem.limit_in_bytes
// file of the cgroup directory path. An empty path is an error. If the limit
// file does not exist the call is a no-op and returns nil. A write failing
// with EBUSY is reported with a dedicated message; any other write failure is
// wrapped in a generic one.
func setKernelMemory(path string, kernelMemoryLimit int64) error {
	if path == "" {
		return fmt.Errorf("no such directory for %s", cgroupKernelMemoryLimit)
	}

	limitFile := filepath.Join(path, cgroupKernelMemoryLimit)
	if !cgroups.PathExists(limitFile) {
		// kernel memory is not enabled on the system so there is nothing to do
		return nil
	}

	value := strconv.FormatInt(kernelMemoryLimit, 10)
	writeErr := ioutil.WriteFile(limitFile, []byte(value), 0700)
	if writeErr == nil {
		return nil
	}

	// EBUSY is returned on attempts to write to memory.kmem.limit_in_bytes
	// when the cgroup has children or once tasks have been attached to it.
	if pathErr, isPathErr := writeErr.(*os.PathError); isPathErr {
		if code, isErrno := pathErr.Err.(syscall.Errno); isErrno && code == unix.EBUSY {
			return fmt.Errorf("failed to set %s, because either tasks have already joined this cgroup or it has children", cgroupKernelMemoryLimit)
		}
	}
	return fmt.Errorf("failed to write %v to %v: %v", kernelMemoryLimit, cgroupKernelMemoryLimit, writeErr)
}
func setMemoryAndSwap(path string, cgroup *configs.Cgroup) error { func setMemoryAndSwap(path string, cgroup *configs.Cgroup) error {
// If the memory update is set to -1 we should also // If the memory update is set to -1 we should also
// set swap to -1, it means unlimited memory. // set swap to -1, it means unlimited memory.

View File

@ -5,6 +5,7 @@ package systemd
import ( import (
"errors" "errors"
"fmt" "fmt"
"math"
"os" "os"
"path/filepath" "path/filepath"
"strings" "strings"
@ -295,14 +296,20 @@ func (m *Manager) Apply(pid int) error {
// cpu.cfs_quota_us and cpu.cfs_period_us are controlled by systemd. // cpu.cfs_quota_us and cpu.cfs_period_us are controlled by systemd.
if c.Resources.CpuQuota != 0 && c.Resources.CpuPeriod != 0 { if c.Resources.CpuQuota != 0 && c.Resources.CpuPeriod != 0 {
cpuQuotaPerSecUSec := uint64(c.Resources.CpuQuota*1000000) / c.Resources.CpuPeriod // corresponds to USEC_INFINITY in systemd
// if USEC_INFINITY is provided, CPUQuota is left unbound by systemd
// always setting a property value ensures we can apply a quota and remove it later
cpuQuotaPerSecUSec := uint64(math.MaxUint64)
if c.Resources.CpuQuota > 0 {
// systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota // systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota
// (integer percentage of CPU) internally. This means that if a fractional percent of // (integer percentage of CPU) internally. This means that if a fractional percent of
// CPU is indicated by Resources.CpuQuota, we need to round up to the nearest // CPU is indicated by Resources.CpuQuota, we need to round up to the nearest
// 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect. // 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect.
cpuQuotaPerSecUSec = uint64(c.Resources.CpuQuota*1000000) / c.Resources.CpuPeriod
if cpuQuotaPerSecUSec%10000 != 0 { if cpuQuotaPerSecUSec%10000 != 0 {
cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000 cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000
} }
}
properties = append(properties, properties = append(properties,
newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec)) newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec))
} }
@ -312,6 +319,12 @@ func (m *Manager) Apply(pid int) error {
newProp("BlockIOWeight", uint64(c.Resources.BlkioWeight))) newProp("BlockIOWeight", uint64(c.Resources.BlkioWeight)))
} }
if c.Resources.PidsLimit > 0 {
properties = append(properties,
newProp("TasksAccounting", true),
newProp("TasksMax", uint64(c.Resources.PidsLimit)))
}
// We have to set kernel memory here, as we can't change it once // We have to set kernel memory here, as we can't change it once
// processes have been attached to the cgroup. // processes have been attached to the cgroup.
if c.Resources.KernelMemory != 0 { if c.Resources.KernelMemory != 0 {

View File

@ -13,7 +13,7 @@ import (
"strings" "strings"
"time" "time"
"github.com/docker/go-units" units "github.com/docker/go-units"
) )
const ( const (
@ -103,7 +103,7 @@ func FindCgroupMountpointDir() (string, error) {
} }
if postSeparatorFields[0] == "cgroup" { if postSeparatorFields[0] == "cgroup" {
// Check that the mount is properly formated. // Check that the mount is properly formatted.
if numPostFields < 3 { if numPostFields < 3 {
return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text) return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text)
} }
@ -151,20 +151,21 @@ func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount,
Root: fields[3], Root: fields[3],
} }
for _, opt := range strings.Split(fields[len(fields)-1], ",") { for _, opt := range strings.Split(fields[len(fields)-1], ",") {
if !ss[opt] { seen, known := ss[opt]
if !known || (!all && seen) {
continue continue
} }
ss[opt] = true
if strings.HasPrefix(opt, cgroupNamePrefix) { if strings.HasPrefix(opt, cgroupNamePrefix) {
m.Subsystems = append(m.Subsystems, opt[len(cgroupNamePrefix):]) opt = opt[len(cgroupNamePrefix):]
} else {
m.Subsystems = append(m.Subsystems, opt)
} }
if !all { m.Subsystems = append(m.Subsystems, opt)
numFound++ numFound++
} }
} if len(m.Subsystems) > 0 || all {
res = append(res, m) res = append(res, m)
} }
}
if err := scanner.Err(); err != nil { if err := scanner.Err(); err != nil {
return nil, err return nil, err
} }
@ -187,7 +188,7 @@ func GetCgroupMounts(all bool) ([]Mount, error) {
allMap := make(map[string]bool) allMap := make(map[string]bool)
for s := range allSubsystems { for s := range allSubsystems {
allMap[s] = true allMap[s] = false
} }
return getCgroupMountsHelper(allMap, f, all) return getCgroupMountsHelper(allMap, f, all)
} }
@ -262,7 +263,7 @@ func getCgroupPathHelper(subsystem, cgroup string) (string, error) {
} }
// This is needed for nested containers, because in /proc/self/cgroup we // This is needed for nested containers, because in /proc/self/cgroup we
// see pathes from host, which don't exist in container. // see paths from host, which don't exist in container.
relCgroup, err := filepath.Rel(root, cgroup) relCgroup, err := filepath.Rel(root, cgroup)
if err != nil { if err != nil {
return "", err return "", err

View File

@ -141,9 +141,10 @@ type Config struct {
// OomScoreAdj specifies the adjustment to be made by the kernel when calculating oom scores // OomScoreAdj specifies the adjustment to be made by the kernel when calculating oom scores
// for a process. Valid values are between the range [-1000, '1000'], where processes with // for a process. Valid values are between the range [-1000, '1000'], where processes with
// higher scores are preferred for being killed. // higher scores are preferred for being killed. If it is unset then we don't touch the current
// value.
// More information about kernel oom score calculation here: https://lwn.net/Articles/317814/ // More information about kernel oom score calculation here: https://lwn.net/Articles/317814/
OomScoreAdj int `json:"oom_score_adj"` OomScoreAdj *int `json:"oom_score_adj,omitempty"`
// UidMappings is an array of User ID mappings for User Namespaces // UidMappings is an array of User ID mappings for User Namespaces
UidMappings []IDMap `json:"uid_mappings"` UidMappings []IDMap `json:"uid_mappings"`
@ -185,12 +186,19 @@ type Config struct {
// callers keyring in this case. // callers keyring in this case.
NoNewKeyring bool `json:"no_new_keyring"` NoNewKeyring bool `json:"no_new_keyring"`
// Rootless specifies whether the container is a rootless container. // IntelRdt specifies settings for Intel RDT group that the container is placed into
Rootless bool `json:"rootless"` // to limit the resources (e.g., L3 cache, memory bandwidth) the container has available
// IntelRdt specifies settings for Intel RDT/CAT group that the container is placed into
// to limit the resources (e.g., L3 cache) the container has available
IntelRdt *IntelRdt `json:"intel_rdt,omitempty"` IntelRdt *IntelRdt `json:"intel_rdt,omitempty"`
// RootlessEUID is set when runc was launched with a non-zero EUID.
// Note that RootlessEUID is set to false when launched with EUID=0 in userns.
// When RootlessEUID is set, runc creates a new userns for the container.
// (config.json needs to contain userns settings)
RootlessEUID bool `json:"rootless_euid,omitempty"`
// RootlessCgroups is set when runc is unlikely to have full access to cgroups.
// When RootlessCgroups is set, cgroups errors are ignored.
RootlessCgroups bool `json:"rootless_cgroups,omitempty"`
} }
type Hooks struct { type Hooks struct {

View File

@ -4,4 +4,8 @@ type IntelRdt struct {
// The schema for L3 cache id and capacity bitmask (CBM) // The schema for L3 cache id and capacity bitmask (CBM)
// Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..." // Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
L3CacheSchema string `json:"l3_cache_schema,omitempty"` L3CacheSchema string `json:"l3_cache_schema,omitempty"`
// The schema of memory bandwidth percentage per L3 cache id
// Format: "MB:<cache_id0>=bandwidth0;<cache_id1>=bandwidth1;..."
MemBwSchema string `json:"memBwSchema,omitempty"`
} }

View File

@ -2,23 +2,18 @@ package validate
import ( import (
"fmt" "fmt"
"os"
"reflect"
"strings" "strings"
"github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/configs"
) )
var ( // rootlessEUID makes sure that the config can be applied when runc
geteuid = os.Geteuid // is being executed as a non-root user (euid != 0) in the current user namespace.
getegid = os.Getegid func (v *ConfigValidator) rootlessEUID(config *configs.Config) error {
) if err := rootlessEUIDMappings(config); err != nil {
func (v *ConfigValidator) rootless(config *configs.Config) error {
if err := rootlessMappings(config); err != nil {
return err return err
} }
if err := rootlessMount(config); err != nil { if err := rootlessEUIDMount(config); err != nil {
return err return err
} }
@ -38,11 +33,9 @@ func hasIDMapping(id int, mappings []configs.IDMap) bool {
return false return false
} }
func rootlessMappings(config *configs.Config) error { func rootlessEUIDMappings(config *configs.Config) error {
if euid := geteuid(); euid != 0 {
if !config.Namespaces.Contains(configs.NEWUSER) { if !config.Namespaces.Contains(configs.NEWUSER) {
return fmt.Errorf("rootless containers require user namespaces") return fmt.Errorf("rootless container requires user namespaces")
}
} }
if len(config.UidMappings) == 0 { if len(config.UidMappings) == 0 {
@ -51,34 +44,13 @@ func rootlessMappings(config *configs.Config) error {
if len(config.GidMappings) == 0 { if len(config.GidMappings) == 0 {
return fmt.Errorf("rootless containers requires at least one GID mapping") return fmt.Errorf("rootless containers requires at least one GID mapping")
} }
return nil
}
// cgroup verifies that the user isn't trying to set any cgroup limits or paths.
func rootlessCgroup(config *configs.Config) error {
// Nothing set at all.
if config.Cgroups == nil || config.Cgroups.Resources == nil {
return nil
}
// Used for comparing to the zero value.
left := reflect.ValueOf(*config.Cgroups.Resources)
right := reflect.Zero(left.Type())
// This is all we need to do, since specconv won't add cgroup options in
// rootless mode.
if !reflect.DeepEqual(left.Interface(), right.Interface()) {
return fmt.Errorf("cannot specify resource limits in rootless container")
}
return nil return nil
} }
// mount verifies that the user isn't trying to set up any mounts they don't have // mount verifies that the user isn't trying to set up any mounts they don't have
// the rights to do. In addition, it makes sure that no mount has a `uid=` or // the rights to do. In addition, it makes sure that no mount has a `uid=` or
// `gid=` option that doesn't resolve to root. // `gid=` option that doesn't resolve to root.
func rootlessMount(config *configs.Config) error { func rootlessEUIDMount(config *configs.Config) error {
// XXX: We could whitelist allowed devices at this point, but I'm not // XXX: We could whitelist allowed devices at this point, but I'm not
// convinced that's a good idea. The kernel is the best arbiter of // convinced that's a good idea. The kernel is the best arbiter of
// access control. // access control.

View File

@ -44,8 +44,8 @@ func (v *ConfigValidator) Validate(config *configs.Config) error {
if err := v.intelrdt(config); err != nil { if err := v.intelrdt(config); err != nil {
return err return err
} }
if config.Rootless { if config.RootlessEUID {
if err := v.rootless(config); err != nil { if err := v.rootlessEUID(config); err != nil {
return err return err
} }
} }
@ -151,6 +151,16 @@ func (v *ConfigValidator) sysctl(config *configs.Config) error {
return fmt.Errorf("sysctl %q is not allowed in the hosts network namespace", s) return fmt.Errorf("sysctl %q is not allowed in the hosts network namespace", s)
} }
} }
if config.Namespaces.Contains(configs.NEWUTS) {
switch s {
case "kernel.domainname":
// This is namespaced and there's no explicit OCI field for it.
continue
case "kernel.hostname":
// This is namespaced but there's a conflicting (dedicated) OCI field for it.
return fmt.Errorf("sysctl %q is not allowed as it conflicts with the OCI %q field", s, "hostname")
}
}
return fmt.Errorf("sysctl %q is not in a separate kernel namespace", s) return fmt.Errorf("sysctl %q is not in a separate kernel namespace", s)
} }
@ -159,11 +169,22 @@ func (v *ConfigValidator) sysctl(config *configs.Config) error {
func (v *ConfigValidator) intelrdt(config *configs.Config) error { func (v *ConfigValidator) intelrdt(config *configs.Config) error {
if config.IntelRdt != nil { if config.IntelRdt != nil {
if !intelrdt.IsEnabled() { if !intelrdt.IsCatEnabled() && !intelrdt.IsMbaEnabled() {
return fmt.Errorf("intelRdt is specified in config, but Intel RDT feature is not supported or enabled") return fmt.Errorf("intelRdt is specified in config, but Intel RDT is not supported or enabled")
} }
if config.IntelRdt.L3CacheSchema == "" {
return fmt.Errorf("intelRdt is specified in config, but intelRdt.l3CacheSchema is empty") if !intelrdt.IsCatEnabled() && config.IntelRdt.L3CacheSchema != "" {
return fmt.Errorf("intelRdt.l3CacheSchema is specified in config, but Intel RDT/CAT is not enabled")
}
if !intelrdt.IsMbaEnabled() && config.IntelRdt.MemBwSchema != "" {
return fmt.Errorf("intelRdt.memBwSchema is specified in config, but Intel RDT/MBA is not enabled")
}
if intelrdt.IsCatEnabled() && config.IntelRdt.L3CacheSchema == "" {
return fmt.Errorf("Intel RDT/CAT is enabled and intelRdt is specified in config, but intelRdt.l3CacheSchema is empty")
}
if intelrdt.IsMbaEnabled() && config.IntelRdt.MemBwSchema == "" {
return fmt.Errorf("Intel RDT/MBA is enabled and intelRdt is specified in config, but intelRdt.memBwSchema is empty")
} }
} }

View File

@ -28,7 +28,6 @@ import (
"github.com/golang/protobuf/proto" "github.com/golang/protobuf/proto"
"github.com/sirupsen/logrus" "github.com/sirupsen/logrus"
"github.com/syndtr/gocapability/capability"
"github.com/vishvananda/netlink/nl" "github.com/vishvananda/netlink/nl"
"golang.org/x/sys/unix" "golang.org/x/sys/unix"
) )
@ -60,7 +59,8 @@ type State struct {
// Platform specific fields below here // Platform specific fields below here
// Specifies if the container was started under the rootless mode. // Specified if the container was started under the rootless mode.
// Set to true if BaseState.Config.RootlessEUID && BaseState.Config.RootlessCgroups
Rootless bool `json:"rootless"` Rootless bool `json:"rootless"`
// Path to all the cgroups setup for a container. Key is cgroup subsystem name // Path to all the cgroups setup for a container. Key is cgroup subsystem name
@ -225,17 +225,13 @@ func (c *linuxContainer) Set(config configs.Config) error {
func (c *linuxContainer) Start(process *Process) error { func (c *linuxContainer) Start(process *Process) error {
c.m.Lock() c.m.Lock()
defer c.m.Unlock() defer c.m.Unlock()
status, err := c.currentStatus() if process.Init {
if err != nil {
return err
}
if status == Stopped {
if err := c.createExecFifo(); err != nil { if err := c.createExecFifo(); err != nil {
return err return err
} }
} }
if err := c.start(process, status == Stopped); err != nil { if err := c.start(process); err != nil {
if status == Stopped { if process.Init {
c.deleteExecFifo() c.deleteExecFifo()
} }
return err return err
@ -244,17 +240,10 @@ func (c *linuxContainer) Start(process *Process) error {
} }
func (c *linuxContainer) Run(process *Process) error { func (c *linuxContainer) Run(process *Process) error {
c.m.Lock()
status, err := c.currentStatus()
if err != nil {
c.m.Unlock()
return err
}
c.m.Unlock()
if err := c.Start(process); err != nil { if err := c.Start(process); err != nil {
return err return err
} }
if status == Stopped { if process.Init {
return c.exec() return c.exec()
} }
return nil return nil
@ -335,8 +324,8 @@ type openResult struct {
err error err error
} }
func (c *linuxContainer) start(process *Process, isInit bool) error { func (c *linuxContainer) start(process *Process) error {
parent, err := c.newParentProcess(process, isInit) parent, err := c.newParentProcess(process)
if err != nil { if err != nil {
return newSystemErrorWithCause(err, "creating new parent process") return newSystemErrorWithCause(err, "creating new parent process")
} }
@ -349,7 +338,7 @@ func (c *linuxContainer) start(process *Process, isInit bool) error {
} }
// generate a timestamp indicating when the container was started // generate a timestamp indicating when the container was started
c.created = time.Now().UTC() c.created = time.Now().UTC()
if isInit { if process.Init {
c.state = &createdState{ c.state = &createdState{
c: c, c: c,
} }
@ -411,10 +400,7 @@ func (c *linuxContainer) createExecFifo() error {
return err return err
} }
unix.Umask(oldMask) unix.Umask(oldMask)
if err := os.Chown(fifoName, rootuid, rootgid); err != nil { return os.Chown(fifoName, rootuid, rootgid)
return err
}
return nil
} }
func (c *linuxContainer) deleteExecFifo() { func (c *linuxContainer) deleteExecFifo() {
@ -439,7 +425,7 @@ func (c *linuxContainer) includeExecFifo(cmd *exec.Cmd) error {
return nil return nil
} }
func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProcess, error) { func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) {
parentPipe, childPipe, err := utils.NewSockPair("init") parentPipe, childPipe, err := utils.NewSockPair("init")
if err != nil { if err != nil {
return nil, newSystemErrorWithCause(err, "creating new init pipe") return nil, newSystemErrorWithCause(err, "creating new init pipe")
@ -448,7 +434,7 @@ func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProces
if err != nil { if err != nil {
return nil, newSystemErrorWithCause(err, "creating new command template") return nil, newSystemErrorWithCause(err, "creating new command template")
} }
if !doInit { if !p.Init {
return c.newSetnsProcess(p, cmd, parentPipe, childPipe) return c.newSetnsProcess(p, cmd, parentPipe, childPipe)
} }
@ -473,6 +459,7 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.
if cmd.SysProcAttr == nil { if cmd.SysProcAttr == nil {
cmd.SysProcAttr = &syscall.SysProcAttr{} cmd.SysProcAttr = &syscall.SysProcAttr{}
} }
cmd.Env = append(cmd.Env, fmt.Sprintf("GOMAXPROCS=%s", os.Getenv("GOMAXPROCS")))
cmd.ExtraFiles = append(cmd.ExtraFiles, p.ExtraFiles...) cmd.ExtraFiles = append(cmd.ExtraFiles, p.ExtraFiles...)
if p.ConsoleSocket != nil { if p.ConsoleSocket != nil {
cmd.ExtraFiles = append(cmd.ExtraFiles, p.ConsoleSocket) cmd.ExtraFiles = append(cmd.ExtraFiles, p.ConsoleSocket)
@ -535,6 +522,7 @@ func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe,
return &setnsProcess{ return &setnsProcess{
cmd: cmd, cmd: cmd,
cgroupPaths: c.cgroupManager.GetPaths(), cgroupPaths: c.cgroupManager.GetPaths(),
rootlessCgroups: c.config.RootlessCgroups,
intelRdtPath: state.IntelRdtPath, intelRdtPath: state.IntelRdtPath,
childPipe: childPipe, childPipe: childPipe,
parentPipe: parentPipe, parentPipe: parentPipe,
@ -556,7 +544,8 @@ func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
PassedFilesCount: len(process.ExtraFiles), PassedFilesCount: len(process.ExtraFiles),
ContainerId: c.ID(), ContainerId: c.ID(),
NoNewPrivileges: c.config.NoNewPrivileges, NoNewPrivileges: c.config.NoNewPrivileges,
Rootless: c.config.Rootless, RootlessEUID: c.config.RootlessEUID,
RootlessCgroups: c.config.RootlessCgroups,
AppArmorProfile: c.config.AppArmorProfile, AppArmorProfile: c.config.AppArmorProfile,
ProcessLabel: c.config.ProcessLabel, ProcessLabel: c.config.ProcessLabel,
Rlimits: c.config.Rlimits, Rlimits: c.config.Rlimits,
@ -624,16 +613,16 @@ func (c *linuxContainer) Resume() error {
func (c *linuxContainer) NotifyOOM() (<-chan struct{}, error) { func (c *linuxContainer) NotifyOOM() (<-chan struct{}, error) {
// XXX(cyphar): This requires cgroups. // XXX(cyphar): This requires cgroups.
if c.config.Rootless { if c.config.RootlessCgroups {
return nil, fmt.Errorf("cannot get OOM notifications from rootless container") logrus.Warn("getting OOM notifications may fail if you don't have the full access to cgroups")
} }
return notifyOnOOM(c.cgroupManager.GetPaths()) return notifyOnOOM(c.cgroupManager.GetPaths())
} }
func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) { func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) {
// XXX(cyphar): This requires cgroups. // XXX(cyphar): This requires cgroups.
if c.config.Rootless { if c.config.RootlessCgroups {
return nil, fmt.Errorf("cannot get memory pressure notifications from rootless container") logrus.Warn("getting memory pressure notifications may fail if you don't have the full access to cgroups")
} }
return notifyMemoryPressure(c.cgroupManager.GetPaths(), level) return notifyMemoryPressure(c.cgroupManager.GetPaths(), level)
} }
@ -668,7 +657,7 @@ func (c *linuxContainer) checkCriuFeatures(criuOpts *CriuOpts, rpcOpts *criurpc.
Features: criuFeat, Features: criuFeat,
} }
err := c.criuSwrk(nil, req, criuOpts, false) err := c.criuSwrk(nil, req, criuOpts, false, nil)
if err != nil { if err != nil {
logrus.Debugf("%s", err) logrus.Debugf("%s", err)
return fmt.Errorf("CRIU feature check failed") return fmt.Errorf("CRIU feature check failed")
@ -781,7 +770,7 @@ func (c *linuxContainer) checkCriuVersion(minVersion int) error {
Type: &t, Type: &t,
} }
err := c.criuSwrk(nil, req, nil, false) err := c.criuSwrk(nil, req, nil, false, nil)
if err != nil { if err != nil {
return fmt.Errorf("CRIU version check failed: %s", err) return fmt.Errorf("CRIU version check failed: %s", err)
} }
@ -877,12 +866,11 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
c.m.Lock() c.m.Lock()
defer c.m.Unlock() defer c.m.Unlock()
// Checkpoint is unlikely to work if os.Geteuid() != 0 || system.RunningInUserNS().
// (CLI prints a warning)
// TODO(avagin): Figure out how to make this work nicely. CRIU 2.0 has // TODO(avagin): Figure out how to make this work nicely. CRIU 2.0 has
// support for doing unprivileged dumps, but the setup of // support for doing unprivileged dumps, but the setup of
// rootless containers might make this complicated. // rootless containers might make this complicated.
if c.config.Rootless {
return fmt.Errorf("cannot checkpoint a rootless container")
}
// criu 1.5.2 => 10502 // criu 1.5.2 => 10502
if err := c.checkCriuVersion(10502); err != nil { if err := c.checkCriuVersion(10502); err != nil {
@ -939,6 +927,33 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
LazyPages: proto.Bool(criuOpts.LazyPages), LazyPages: proto.Bool(criuOpts.LazyPages),
} }
// If the container is running in a network namespace and has
// a path to the network namespace configured, we will dump
// that network namespace as an external namespace and we
// will expect that the namespace exists during restore.
// This basically means that CRIU will ignore the namespace
// and expect to be setup correctly.
nsPath := c.config.Namespaces.PathOf(configs.NEWNET)
if nsPath != "" {
// For this to work we need at least criu 3.11.0 => 31100.
// As there was already a successful version check we will
// not error out if it fails. runc will just behave as it used
// to do and ignore external network namespaces.
err := c.checkCriuVersion(31100)
if err == nil {
// CRIU expects the information about an external namespace
// like this: --external net[<inode>]:<key>
// This <key> is always 'extRootNetNS'.
var netns syscall.Stat_t
err = syscall.Stat(nsPath, &netns)
if err != nil {
return err
}
criuExternal := fmt.Sprintf("net[%d]:extRootNetNS", netns.Ino)
rpcOpts.External = append(rpcOpts.External, criuExternal)
}
}
fcg := c.cgroupManager.GetPaths()["freezer"] fcg := c.cgroupManager.GetPaths()["freezer"]
if fcg != "" { if fcg != "" {
rpcOpts.FreezeCgroup = proto.String(fcg) rpcOpts.FreezeCgroup = proto.String(fcg)
@ -1043,7 +1058,7 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
} }
} }
err = c.criuSwrk(nil, req, criuOpts, false) err = c.criuSwrk(nil, req, criuOpts, false, nil)
if err != nil { if err != nil {
return err return err
} }
@ -1087,11 +1102,12 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
c.m.Lock() c.m.Lock()
defer c.m.Unlock() defer c.m.Unlock()
var extraFiles []*os.File
// Restore is unlikely to work if os.Geteuid() != 0 || system.RunningInUserNS().
// (CLI prints a warning)
// TODO(avagin): Figure out how to make this work nicely. CRIU doesn't have // TODO(avagin): Figure out how to make this work nicely. CRIU doesn't have
// support for unprivileged restore at the moment. // support for unprivileged restore at the moment.
if c.config.Rootless {
return fmt.Errorf("cannot restore a rootless container")
}
// criu 1.5.2 => 10502 // criu 1.5.2 => 10502
if err := c.checkCriuVersion(10502); err != nil { if err := c.checkCriuVersion(10502); err != nil {
@ -1161,6 +1177,38 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
}, },
} }
// Same as during checkpointing. If the container has a specific network namespace
// assigned to it, this now expects that the checkpoint will be restored in a
// already created network namespace.
nsPath := c.config.Namespaces.PathOf(configs.NEWNET)
if nsPath != "" {
// For this to work we need at least criu 3.11.0 => 31100.
// As there was already a successful version check we will
// not error out if it fails. runc will just behave as it used
// to do and ignore external network namespaces.
err := c.checkCriuVersion(31100)
if err == nil {
// CRIU wants the information about an existing network namespace
// like this: --inherit-fd fd[<fd>]:<key>
// The <key> needs to be the same as during checkpointing.
// We are always using 'extRootNetNS' as the key in this.
netns, err := os.Open(nsPath)
defer netns.Close()
if err != nil {
logrus.Errorf("If a specific network namespace is defined it must exist: %s", err)
return fmt.Errorf("Requested network namespace %v does not exist", nsPath)
}
inheritFd := new(criurpc.InheritFd)
inheritFd.Key = proto.String("extRootNetNS")
// The offset of four is necessary because 0, 1, 2 and 3 is already
// used by stdin, stdout, stderr, 'criu swrk' socket.
inheritFd.Fd = proto.Int32(int32(4 + len(extraFiles)))
req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd)
// All open FDs need to be transferred to CRIU via extraFiles
extraFiles = append(extraFiles, netns)
}
}
for _, m := range c.config.Mounts { for _, m := range c.config.Mounts {
switch m.Device { switch m.Device {
case "bind": case "bind":
@ -1219,7 +1267,7 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd) req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd)
} }
} }
return c.criuSwrk(process, req, criuOpts, true) return c.criuSwrk(process, req, criuOpts, true, extraFiles)
} }
func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error { func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error {
@ -1249,7 +1297,7 @@ func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error {
return nil return nil
} }
func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *CriuOpts, applyCgroups bool) error { func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *CriuOpts, applyCgroups bool, extraFiles []*os.File) error {
fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_SEQPACKET|unix.SOCK_CLOEXEC, 0) fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_SEQPACKET|unix.SOCK_CLOEXEC, 0)
if err != nil { if err != nil {
return err return err
@ -1290,6 +1338,9 @@ func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *
cmd.Stderr = process.Stderr cmd.Stderr = process.Stderr
} }
cmd.ExtraFiles = append(cmd.ExtraFiles, criuServer) cmd.ExtraFiles = append(cmd.ExtraFiles, criuServer)
if extraFiles != nil {
cmd.ExtraFiles = append(cmd.ExtraFiles, extraFiles...)
}
if err := cmd.Start(); err != nil { if err := cmd.Start(); err != nil {
return err return err
@ -1664,7 +1715,7 @@ func (c *linuxContainer) currentState() (*State, error) {
InitProcessStartTime: startTime, InitProcessStartTime: startTime,
Created: c.created, Created: c.created,
}, },
Rootless: c.config.Rootless, Rootless: c.config.RootlessEUID && c.config.RootlessCgroups,
CgroupPaths: c.cgroupManager.GetPaths(), CgroupPaths: c.cgroupManager.GetPaths(),
IntelRdtPath: intelRdtPath, IntelRdtPath: intelRdtPath,
NamespacePaths: make(map[configs.NamespaceType]string), NamespacePaths: make(map[configs.NamespaceType]string),
@ -1765,7 +1816,7 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Na
if !joinExistingUser { if !joinExistingUser {
// write uid mappings // write uid mappings
if len(c.config.UidMappings) > 0 { if len(c.config.UidMappings) > 0 {
if c.config.Rootless && c.newuidmapPath != "" { if c.config.RootlessEUID && c.newuidmapPath != "" {
r.AddData(&Bytemsg{ r.AddData(&Bytemsg{
Type: UidmapPathAttr, Type: UidmapPathAttr,
Value: []byte(c.newuidmapPath), Value: []byte(c.newuidmapPath),
@ -1791,19 +1842,13 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Na
Type: GidmapAttr, Type: GidmapAttr,
Value: b, Value: b,
}) })
if c.config.Rootless && c.newgidmapPath != "" { if c.config.RootlessEUID && c.newgidmapPath != "" {
r.AddData(&Bytemsg{ r.AddData(&Bytemsg{
Type: GidmapPathAttr, Type: GidmapPathAttr,
Value: []byte(c.newgidmapPath), Value: []byte(c.newgidmapPath),
}) })
} }
if requiresRootOrMappingTool(c.config) { if requiresRootOrMappingTool(c.config) {
// check if we have CAP_SETGID to setgroup properly
pid, err := capability.NewPid(0)
if err != nil {
return nil, err
}
if !pid.Get(capability.EFFECTIVE, capability.CAP_SETGID) {
r.AddData(&Boolmsg{ r.AddData(&Boolmsg{
Type: SetgroupAttr, Type: SetgroupAttr,
Value: true, Value: true,
@ -1811,18 +1856,19 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Na
} }
} }
} }
}
if c.config.OomScoreAdj != nil {
// write oom_score_adj // write oom_score_adj
r.AddData(&Bytemsg{ r.AddData(&Bytemsg{
Type: OomScoreAdjAttr, Type: OomScoreAdjAttr,
Value: []byte(fmt.Sprintf("%d", c.config.OomScoreAdj)), Value: []byte(fmt.Sprintf("%d", *c.config.OomScoreAdj)),
}) })
}
// write rootless // write rootless
r.AddData(&Boolmsg{ r.AddData(&Boolmsg{
Type: RootlessAttr, Type: RootlessEUIDAttr,
Value: c.config.Rootless, Value: c.config.RootlessEUID,
}) })
return bytes.NewReader(r.Serialize()), nil return bytes.NewReader(r.Serialize()), nil

View File

@ -11,6 +11,7 @@ import (
"runtime/debug" "runtime/debug"
"strconv" "strconv"
"github.com/cyphar/filepath-securejoin"
"github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fs" "github.com/opencontainers/runc/libcontainer/cgroups/fs"
"github.com/opencontainers/runc/libcontainer/cgroups/systemd" "github.com/opencontainers/runc/libcontainer/cgroups/systemd"
@ -59,9 +60,9 @@ func SystemdCgroups(l *LinuxFactory) error {
return nil return nil
} }
// Cgroupfs is an options func to configure a LinuxFactory to return // Cgroupfs is an options func to configure a LinuxFactory to return containers
// containers that use the native cgroups filesystem implementation to // that use the native cgroups filesystem implementation to create and manage
// create and manage cgroups. // cgroups.
func Cgroupfs(l *LinuxFactory) error { func Cgroupfs(l *LinuxFactory) error {
l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager { l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager {
return &fs.Manager{ return &fs.Manager{
@ -72,9 +73,26 @@ func Cgroupfs(l *LinuxFactory) error {
return nil return nil
} }
// RootlessCgroupfs is an options func that configures a LinuxFactory to
// return containers whose cgroups are created and managed through the native
// cgroups filesystem implementation, just like Cgroupfs. Unlike Cgroupfs, the
// manager it installs is marked as rootless, so it can transparently handle
// permission errors that occur during rootless container setup (including
// euid=0 in userns) while still allowing cgroup usage if the cgroups have
// been set up properly.
func RootlessCgroupfs(l *LinuxFactory) error {
	// Constructor for a rootless-flagged fs manager bound to the given cgroup
	// configuration and pre-existing cgroup paths.
	newRootlessManager := func(config *configs.Cgroup, paths map[string]string) cgroups.Manager {
		mgr := new(fs.Manager)
		mgr.Cgroups = config
		mgr.Paths = paths
		mgr.Rootless = true
		return mgr
	}
	l.NewCgroupsManager = newRootlessManager
	return nil
}
// IntelRdtFs is an options func to configure a LinuxFactory to return // IntelRdtFs is an options func to configure a LinuxFactory to return
// containers that use the Intel RDT "resource control" filesystem to // containers that use the Intel RDT "resource control" filesystem to
// create and manage Intel Xeon platform shared resources (e.g., L3 cache). // create and manage Intel RDT resources (e.g., L3 cache, memory bandwidth).
func IntelRdtFs(l *LinuxFactory) error { func IntelRdtFs(l *LinuxFactory) error {
l.NewIntelRdtManager = func(config *configs.Config, id string, path string) intelrdt.Manager { l.NewIntelRdtManager = func(config *configs.Config, id string, path string) intelrdt.Manager {
return &intelrdt.IntelRdtManager{ return &intelrdt.IntelRdtManager{
@ -178,7 +196,10 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err
if err := l.Validator.Validate(config); err != nil { if err := l.Validator.Validate(config); err != nil {
return nil, newGenericError(err, ConfigInvalid) return nil, newGenericError(err, ConfigInvalid)
} }
containerRoot := filepath.Join(l.Root, id) containerRoot, err := securejoin.SecureJoin(l.Root, id)
if err != nil {
return nil, err
}
if _, err := os.Stat(containerRoot); err == nil { if _, err := os.Stat(containerRoot); err == nil {
return nil, newGenericError(fmt.Errorf("container with id exists: %v", id), IdInUse) return nil, newGenericError(fmt.Errorf("container with id exists: %v", id), IdInUse)
} else if !os.IsNotExist(err) { } else if !os.IsNotExist(err) {
@ -201,7 +222,7 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err
newgidmapPath: l.NewgidmapPath, newgidmapPath: l.NewgidmapPath,
cgroupManager: l.NewCgroupsManager(config.Cgroups, nil), cgroupManager: l.NewCgroupsManager(config.Cgroups, nil),
} }
if intelrdt.IsEnabled() { if intelrdt.IsCatEnabled() || intelrdt.IsMbaEnabled() {
c.intelRdtManager = l.NewIntelRdtManager(config, id, "") c.intelRdtManager = l.NewIntelRdtManager(config, id, "")
} }
c.state = &stoppedState{c: c} c.state = &stoppedState{c: c}
@ -212,7 +233,14 @@ func (l *LinuxFactory) Load(id string) (Container, error) {
if l.Root == "" { if l.Root == "" {
return nil, newGenericError(fmt.Errorf("invalid root"), ConfigInvalid) return nil, newGenericError(fmt.Errorf("invalid root"), ConfigInvalid)
} }
containerRoot := filepath.Join(l.Root, id) //when load, we need to check id is valid or not.
if err := l.validateID(id); err != nil {
return nil, err
}
containerRoot, err := securejoin.SecureJoin(l.Root, id)
if err != nil {
return nil, err
}
state, err := l.loadState(containerRoot, id) state, err := l.loadState(containerRoot, id)
if err != nil { if err != nil {
return nil, err return nil, err
@ -240,7 +268,7 @@ func (l *LinuxFactory) Load(id string) (Container, error) {
if err := c.refreshState(); err != nil { if err := c.refreshState(); err != nil {
return nil, err return nil, err
} }
if intelrdt.IsEnabled() { if intelrdt.IsCatEnabled() || intelrdt.IsMbaEnabled() {
c.intelRdtManager = l.NewIntelRdtManager(&state.Config, id, state.IntelRdtPath) c.intelRdtManager = l.NewIntelRdtManager(&state.Config, id, state.IntelRdtPath)
} }
return c, nil return c, nil
@ -322,7 +350,11 @@ func (l *LinuxFactory) StartInitialization() (err error) {
} }
func (l *LinuxFactory) loadState(root, id string) (*State, error) { func (l *LinuxFactory) loadState(root, id string) (*State, error) {
f, err := os.Open(filepath.Join(root, stateFilename)) stateFilePath, err := securejoin.SecureJoin(root, stateFilename)
if err != nil {
return nil, err
}
f, err := os.Open(stateFilePath)
if err != nil { if err != nil {
if os.IsNotExist(err) { if os.IsNotExist(err) {
return nil, newGenericError(fmt.Errorf("container %q does not exist", id), ContainerNotExists) return nil, newGenericError(fmt.Errorf("container %q does not exist", id), ContainerNotExists)
@ -338,7 +370,7 @@ func (l *LinuxFactory) loadState(root, id string) (*State, error) {
} }
func (l *LinuxFactory) validateID(id string) error { func (l *LinuxFactory) validateID(id string) error {
if !idRegex.MatchString(id) { if !idRegex.MatchString(id) || string(os.PathSeparator)+id != utils.CleanPath(string(os.PathSeparator)+id) {
return newGenericError(fmt.Errorf("invalid id format: %v", id), InvalidIdFormat) return newGenericError(fmt.Errorf("invalid id format: %v", id), InvalidIdFormat)
} }

View File

@ -6,6 +6,7 @@ import (
"encoding/json" "encoding/json"
"fmt" "fmt"
"io" "io"
"io/ioutil"
"net" "net"
"os" "os"
"strings" "strings"
@ -20,6 +21,7 @@ import (
"github.com/opencontainers/runc/libcontainer/system" "github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/user" "github.com/opencontainers/runc/libcontainer/user"
"github.com/opencontainers/runc/libcontainer/utils" "github.com/opencontainers/runc/libcontainer/utils"
"github.com/pkg/errors"
"github.com/sirupsen/logrus" "github.com/sirupsen/logrus"
"github.com/vishvananda/netlink" "github.com/vishvananda/netlink"
) )
@ -64,7 +66,8 @@ type initConfig struct {
CreateConsole bool `json:"create_console"` CreateConsole bool `json:"create_console"`
ConsoleWidth uint16 `json:"console_width"` ConsoleWidth uint16 `json:"console_width"`
ConsoleHeight uint16 `json:"console_height"` ConsoleHeight uint16 `json:"console_height"`
Rootless bool `json:"rootless"` RootlessEUID bool `json:"rootless_euid,omitempty"`
RootlessCgroups bool `json:"rootless_cgroups,omitempty"`
} }
type initer interface { type initer interface {
@ -121,7 +124,7 @@ func finalizeNamespace(config *initConfig) error {
// inherited are marked close-on-exec so they stay out of the // inherited are marked close-on-exec so they stay out of the
// container // container
if err := utils.CloseExecFrom(config.PassedFilesCount + 3); err != nil { if err := utils.CloseExecFrom(config.PassedFilesCount + 3); err != nil {
return err return errors.Wrap(err, "close exec fds")
} }
capabilities := &configs.Capabilities{} capabilities := &configs.Capabilities{}
@ -136,20 +139,20 @@ func finalizeNamespace(config *initConfig) error {
} }
// drop capabilities in bounding set before changing user // drop capabilities in bounding set before changing user
if err := w.ApplyBoundingSet(); err != nil { if err := w.ApplyBoundingSet(); err != nil {
return err return errors.Wrap(err, "apply bounding set")
} }
// preserve existing capabilities while we change users // preserve existing capabilities while we change users
if err := system.SetKeepCaps(); err != nil { if err := system.SetKeepCaps(); err != nil {
return err return errors.Wrap(err, "set keep caps")
} }
if err := setupUser(config); err != nil { if err := setupUser(config); err != nil {
return err return errors.Wrap(err, "setup user")
} }
if err := system.ClearKeepCaps(); err != nil { if err := system.ClearKeepCaps(); err != nil {
return err return errors.Wrap(err, "clear keep caps")
} }
if err := w.ApplyCaps(); err != nil { if err := w.ApplyCaps(); err != nil {
return err return errors.Wrap(err, "apply caps")
} }
if config.Cwd != "" { if config.Cwd != "" {
if err := unix.Chdir(config.Cwd); err != nil { if err := unix.Chdir(config.Cwd); err != nil {
@ -217,11 +220,7 @@ func syncParentReady(pipe io.ReadWriter) error {
} }
// Wait for parent to give the all-clear. // Wait for parent to give the all-clear.
if err := readSync(pipe, procRun); err != nil { return readSync(pipe, procRun)
return err
}
return nil
} }
// syncParentHooks sends to the given pipe a JSON payload which indicates that // syncParentHooks sends to the given pipe a JSON payload which indicates that
@ -234,11 +233,7 @@ func syncParentHooks(pipe io.ReadWriter) error {
} }
// Wait for parent to give the all-clear. // Wait for parent to give the all-clear.
if err := readSync(pipe, procResume); err != nil { return readSync(pipe, procResume)
return err
}
return nil
} }
// setupUser changes the groups, gid, and uid for the user inside the container // setupUser changes the groups, gid, and uid for the user inside the container
@ -282,7 +277,7 @@ func setupUser(config *initConfig) error {
return fmt.Errorf("cannot set gid to unmapped user in user namespace") return fmt.Errorf("cannot set gid to unmapped user in user namespace")
} }
if config.Rootless { if config.RootlessEUID {
// We cannot set any additional groups in a rootless container and thus // We cannot set any additional groups in a rootless container and thus
// we bail if the user asked us to do so. TODO: We currently can't do // we bail if the user asked us to do so. TODO: We currently can't do
// this check earlier, but if libcontainer.Process.User was typesafe // this check earlier, but if libcontainer.Process.User was typesafe
@ -298,11 +293,18 @@ func setupUser(config *initConfig) error {
return err return err
} }
setgroups, err := ioutil.ReadFile("/proc/self/setgroups")
if err != nil && !os.IsNotExist(err) {
return err
}
// This isn't allowed in an unprivileged user namespace since Linux 3.19. // This isn't allowed in an unprivileged user namespace since Linux 3.19.
// There's nothing we can do about /etc/group entries, so we silently // There's nothing we can do about /etc/group entries, so we silently
// ignore setting groups here (since the user didn't explicitly ask us to // ignore setting groups here (since the user didn't explicitly ask us to
// set the group). // set the group).
if !config.Rootless { allowSupGroups := !config.RootlessEUID && strings.TrimSpace(string(setgroups)) != "deny"
if allowSupGroups {
suppGroups := append(execUser.Sgids, addGroups...) suppGroups := append(execUser.Sgids, addGroups...)
if err := unix.Setgroups(suppGroups); err != nil { if err := unix.Setgroups(suppGroups); err != nil {
return err return err

View File

@ -16,20 +16,25 @@ import (
) )
/* /*
* About Intel RDT/CAT feature: * About Intel RDT features:
* Intel platforms with new Xeon CPU support Resource Director Technology (RDT). * Intel platforms with new Xeon CPU support Resource Director Technology (RDT).
* Intel Cache Allocation Technology (CAT) is a sub-feature of RDT. Currently L3 * Cache Allocation Technology (CAT) and Memory Bandwidth Allocation (MBA) are
* Cache is the only resource that is supported in RDT. * two sub-features of RDT.
* *
* This feature provides a way for the software to restrict cache allocation to a * Cache Allocation Technology (CAT) provides a way for the software to restrict
* defined 'subset' of L3 cache which may be overlapping with other 'subsets'. * cache allocation to a defined 'subset' of L3 cache which may be overlapping
* The different subsets are identified by class of service (CLOS) and each CLOS * with other 'subsets'. The different subsets are identified by class of
* has a capacity bitmask (CBM). * service (CLOS) and each CLOS has a capacity bitmask (CBM).
* *
* For more information about Intel RDT/CAT can be found in the section 17.17 * Memory Bandwidth Allocation (MBA) provides indirect and approximate throttle
* of Intel Software Developer Manual. * over memory bandwidth for the software. A user controls the resource by
* indicating the percentage of maximum memory bandwidth.
* *
* About Intel RDT/CAT kernel interface: * More details about Intel RDT CAT and MBA can be found in the section 17.18
* of Intel Software Developer Manual:
* https://software.intel.com/en-us/articles/intel-sdm
*
* About Intel RDT kernel interface:
* In Linux 4.10 kernel or newer, the interface is defined and exposed via * In Linux 4.10 kernel or newer, the interface is defined and exposed via
* "resource control" filesystem, which is a "cgroup-like" interface. * "resource control" filesystem, which is a "cgroup-like" interface.
* *
@ -37,59 +42,86 @@ import (
* interfaces in a container. But unlike cgroups' hierarchy, it has single level * interfaces in a container. But unlike cgroups' hierarchy, it has single level
* filesystem layout. * filesystem layout.
* *
* CAT and MBA features are introduced in Linux 4.10 and 4.12 kernel via
* "resource control" filesystem.
*
* Intel RDT "resource control" filesystem hierarchy: * Intel RDT "resource control" filesystem hierarchy:
* mount -t resctrl resctrl /sys/fs/resctrl * mount -t resctrl resctrl /sys/fs/resctrl
* tree /sys/fs/resctrl * tree /sys/fs/resctrl
* /sys/fs/resctrl/ * /sys/fs/resctrl/
* |-- info * |-- info
* | |-- L3 * | |-- L3
* | |-- cbm_mask * | | |-- cbm_mask
* | |-- min_cbm_bits * | | |-- min_cbm_bits
* | | |-- num_closids
* | |-- MB
* | |-- bandwidth_gran
* | |-- delay_linear
* | |-- min_bandwidth
* | |-- num_closids * | |-- num_closids
* |-- cpus * |-- ...
* |-- schemata * |-- schemata
* |-- tasks * |-- tasks
* |-- <container_id> * |-- <container_id>
* |-- cpus * |-- ...
* |-- schemata * |-- schemata
* |-- tasks * |-- tasks
* *
* For runc, we can make use of `tasks` and `schemata` configuration for L3 cache * For runc, we can make use of `tasks` and `schemata` configuration for L3
* resource constraints. * cache and memory bandwidth resources constraints.
* *
* The file `tasks` has a list of tasks that belong to this group (e.g., * The file `tasks` has a list of tasks that belong to this group (e.g.,
* "<container_id>" group). Tasks can be added to a group by writing the task ID * "<container_id>" group). Tasks can be added to a group by writing the task ID
* to the "tasks" file (which will automatically remove them from the previous * to the "tasks" file (which will automatically remove them from the previous
* group to which they belonged). New tasks created by fork(2) and clone(2) are * group to which they belonged). New tasks created by fork(2) and clone(2) are
* added to the same group as their parent. If a pid is not in any sub group, it is * added to the same group as their parent.
* in root group.
* *
* The file `schemata` has allocation bitmasks/values for L3 cache on each socket, * The file `schemata` has a list of all the resources available to this group.
* which contains L3 cache id and capacity bitmask (CBM). * Each resource (L3 cache, memory bandwidth) has its own line and format.
*
* L3 cache schema:
* It has allocation bitmasks/values for L3 cache on each socket, which
* contains L3 cache id and capacity bitmask (CBM).
* Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..." * Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
* For example, on a two-socket machine, L3's schema line could be `L3:0=ff;1=c0` * For example, on a two-socket machine, the schema line could be "L3:0=ff;1=c0"
* which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0. * which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0.
* *
* The valid L3 cache CBM is a *contiguous bits set* and number of bits that can * The valid L3 cache CBM is a *contiguous bits set* and number of bits that can
* be set is less than the max bit. The max bits in the CBM is varied among * be set is less than the max bit. The max bits in the CBM is varied among
* supported Intel Xeon platforms. In Intel RDT "resource control" filesystem * supported Intel CPU models. Kernel will check if it is valid when writing.
* layout, the CBM in a group should be a subset of the CBM in root. Kernel will * e.g., default value 0xfffff in root indicates the max bits of CBM is 20
* check if it is valid when writing. e.g., 0xfffff in root indicates the max bits * bits, which mapping to entire L3 cache capacity. Some valid CBM values to
* of CBM is 20 bits, which mapping to entire L3 cache capacity. Some valid CBM * set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc.
* values to set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc.
* *
* For more information about Intel RDT/CAT kernel interface: * Memory bandwidth schema:
* It has allocation values for memory bandwidth on each socket, which contains
* L3 cache id and memory bandwidth percentage.
* Format: "MB:<cache_id0>=bandwidth0;<cache_id1>=bandwidth1;..."
* For example, on a two-socket machine, the schema line could be "MB:0=20;1=70"
*
* The minimum bandwidth percentage value for each CPU model is predefined and
* can be looked up through "info/MB/min_bandwidth". The bandwidth granularity
* that is allocated is also dependent on the CPU model and can be looked up at
* "info/MB/bandwidth_gran". The available bandwidth control steps are:
* min_bw + N * bw_gran. Intermediate values are rounded to the next control
* step available on the hardware.
*
* For more information about Intel RDT kernel interface:
* https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt * https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt
* *
* An example for runc: * An example for runc:
* Consider a two-socket machine with two L3 caches where the default CBM is * Consider a two-socket machine with two L3 caches where the default CBM is
* 0xfffff and the max CBM length is 20 bits. With this configuration, tasks * 0x7ff and the max CBM length is 11 bits, and minimum memory bandwidth of 10%
* inside the container only have access to the "upper" 80% of L3 cache id 0 and * with a memory bandwidth granularity of 10%.
* the "lower" 50% L3 cache id 1: *
* Tasks inside the container only have access to the "upper" 7/11 of L3 cache
* on socket 0 and the "lower" 5/11 L3 cache on socket 1, and may use a
* maximum memory bandwidth of 20% on socket 0 and 70% on socket 1.
* *
* "linux": { * "linux": {
* "intelRdt": { * "intelRdt": {
* "l3CacheSchema": "L3:0=ffff0;1=3ff" * "l3CacheSchema": "L3:0=7f0;1=1f",
* "memBwSchema": "MB:0=20;1=70"
* } * }
* } * }
*/ */
@ -129,8 +161,10 @@ var (
intelRdtRoot string intelRdtRoot string
intelRdtRootLock sync.Mutex intelRdtRootLock sync.Mutex
// The flag to indicate if Intel RDT is supported // The flag to indicate if Intel RDT/CAT is enabled
isEnabled bool isCatEnabled bool
// The flag to indicate if Intel RDT/MBA is enabled
isMbaEnabled bool
) )
type intelRdtData struct { type intelRdtData struct {
@ -139,19 +173,35 @@ type intelRdtData struct {
pid int pid int
} }
// Check if Intel RDT is enabled in init() // Check if Intel RDT sub-features are enabled in init()
func init() { func init() {
// 1. Check if hardware and kernel support Intel RDT/CAT feature // 1. Check if hardware and kernel support Intel RDT sub-features
// "cat_l3" flag is set if supported // "cat_l3" flag for CAT and "mba" flag for MBA
isFlagSet, err := parseCpuInfoFile("/proc/cpuinfo") isCatFlagSet, isMbaFlagSet, err := parseCpuInfoFile("/proc/cpuinfo")
if !isFlagSet || err != nil { if err != nil {
isEnabled = false
return return
} }
// 2. Check if Intel RDT "resource control" filesystem is mounted // 2. Check if Intel RDT "resource control" filesystem is mounted
// The user guarantees to mount the filesystem // The user guarantees to mount the filesystem
isEnabled = isIntelRdtMounted() if !isIntelRdtMounted() {
return
}
// 3. Double check if Intel RDT sub-features are available in
// "resource control" filesystem. Intel RDT sub-features can be
// selectively disabled or enabled by kernel command line
// (e.g., rdt=!l3cat,mba) in 4.14 and newer kernel
if isCatFlagSet {
if _, err := os.Stat(filepath.Join(intelRdtRoot, "info", "L3")); err == nil {
isCatEnabled = true
}
}
if isMbaFlagSet {
if _, err := os.Stat(filepath.Join(intelRdtRoot, "info", "MB")); err == nil {
isMbaEnabled = true
}
}
} }
// Return the mount point path of Intel RDT "resource control" filesystem // Return the mount point path of Intel RDT "resource control" filesystem
@ -177,7 +227,7 @@ func findIntelRdtMountpointDir() (string, error) {
} }
if postSeparatorFields[0] == "resctrl" { if postSeparatorFields[0] == "resctrl" {
// Check that the mount is properly formated. // Check that the mount is properly formatted.
if numPostFields < 3 { if numPostFields < 3 {
return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text) return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text)
} }
@ -223,30 +273,40 @@ func isIntelRdtMounted() bool {
return true return true
} }
func parseCpuInfoFile(path string) (bool, error) { func parseCpuInfoFile(path string) (bool, bool, error) {
isCatFlagSet := false
isMbaFlagSet := false
f, err := os.Open(path) f, err := os.Open(path)
if err != nil { if err != nil {
return false, err return false, false, err
} }
defer f.Close() defer f.Close()
s := bufio.NewScanner(f) s := bufio.NewScanner(f)
for s.Scan() { for s.Scan() {
if err := s.Err(); err != nil { if err := s.Err(); err != nil {
return false, err return false, false, err
} }
text := s.Text() line := s.Text()
flags := strings.Split(text, " ")
// "cat_l3" flag is set if Intel RDT/CAT is supported // Search "cat_l3" and "mba" flags in first "flags" line
if strings.Contains(line, "flags") {
flags := strings.Split(line, " ")
// "cat_l3" flag for CAT and "mba" flag for MBA
for _, flag := range flags { for _, flag := range flags {
if flag == "cat_l3" { switch flag {
return true, nil case "cat_l3":
isCatFlagSet = true
case "mba":
isMbaFlagSet = true
} }
} }
return isCatFlagSet, isMbaFlagSet, nil
} }
return false, nil }
return isCatFlagSet, isMbaFlagSet, nil
} }
func parseUint(s string, base, bitSize int) (uint64, error) { func parseUint(s string, base, bitSize int) (uint64, error) {
@ -292,30 +352,6 @@ func getIntelRdtParamString(path, file string) (string, error) {
return strings.TrimSpace(string(contents)), nil return strings.TrimSpace(string(contents)), nil
} }
func readTasksFile(dir string) ([]int, error) {
f, err := os.Open(filepath.Join(dir, IntelRdtTasks))
if err != nil {
return nil, err
}
defer f.Close()
var (
s = bufio.NewScanner(f)
out = []int{}
)
for s.Scan() {
if t := s.Text(); t != "" {
pid, err := strconv.Atoi(t)
if err != nil {
return nil, err
}
out = append(out, pid)
}
}
return out, nil
}
func writeFile(dir, file, data string) error { func writeFile(dir, file, data string) error {
if dir == "" { if dir == "" {
return fmt.Errorf("no such directory for %s", file) return fmt.Errorf("no such directory for %s", file)
@ -368,6 +404,57 @@ func getL3CacheInfo() (*L3CacheInfo, error) {
return l3CacheInfo, nil return l3CacheInfo, nil
} }
// Get the read-only memory bandwidth information
func getMemBwInfo() (*MemBwInfo, error) {
memBwInfo := &MemBwInfo{}
rootPath, err := getIntelRdtRoot()
if err != nil {
return memBwInfo, err
}
path := filepath.Join(rootPath, "info", "MB")
bandwidthGran, err := getIntelRdtParamUint(path, "bandwidth_gran")
if err != nil {
return memBwInfo, err
}
delayLinear, err := getIntelRdtParamUint(path, "delay_linear")
if err != nil {
return memBwInfo, err
}
minBandwidth, err := getIntelRdtParamUint(path, "min_bandwidth")
if err != nil {
return memBwInfo, err
}
numClosids, err := getIntelRdtParamUint(path, "num_closids")
if err != nil {
return memBwInfo, err
}
memBwInfo.BandwidthGran = bandwidthGran
memBwInfo.DelayLinear = delayLinear
memBwInfo.MinBandwidth = minBandwidth
memBwInfo.NumClosids = numClosids
return memBwInfo, nil
}
// Get diagnostics for last filesystem operation error from file info/last_cmd_status
func getLastCmdStatus() (string, error) {
rootPath, err := getIntelRdtRoot()
if err != nil {
return "", err
}
path := filepath.Join(rootPath, "info")
lastCmdStatus, err := getIntelRdtParamString(path, "last_cmd_status")
if err != nil {
return "", err
}
return lastCmdStatus, nil
}
// WriteIntelRdtTasks writes the specified pid into the "tasks" file // WriteIntelRdtTasks writes the specified pid into the "tasks" file
func WriteIntelRdtTasks(dir string, pid int) error { func WriteIntelRdtTasks(dir string, pid int) error {
if dir == "" { if dir == "" {
@ -383,9 +470,14 @@ func WriteIntelRdtTasks(dir string, pid int) error {
return nil return nil
} }
// Check if Intel RDT is enabled // Check if Intel RDT/CAT is enabled
func IsEnabled() bool { func IsCatEnabled() bool {
return isEnabled return isCatEnabled
}
// Check if Intel RDT/MBA is enabled
func IsMbaEnabled() bool {
return isMbaEnabled
} }
// Get the 'container_id' path in Intel RDT "resource control" filesystem // Get the 'container_id' path in Intel RDT "resource control" filesystem
@ -452,6 +544,25 @@ func (m *IntelRdtManager) GetStats() (*Stats, error) {
defer m.mu.Unlock() defer m.mu.Unlock()
stats := NewStats() stats := NewStats()
rootPath, err := getIntelRdtRoot()
if err != nil {
return nil, err
}
// The read-only L3 cache and memory bandwidth schemata in root
tmpRootStrings, err := getIntelRdtParamString(rootPath, "schemata")
if err != nil {
return nil, err
}
schemaRootStrings := strings.Split(tmpRootStrings, "\n")
// The L3 cache and memory bandwidth schemata in 'container_id' group
tmpStrings, err := getIntelRdtParamString(m.GetPath(), "schemata")
if err != nil {
return nil, err
}
schemaStrings := strings.Split(tmpStrings, "\n")
if IsCatEnabled() {
// The read-only L3 cache information // The read-only L3 cache information
l3CacheInfo, err := getL3CacheInfo() l3CacheInfo, err := getL3CacheInfo()
if err != nil { if err != nil {
@ -460,57 +571,103 @@ func (m *IntelRdtManager) GetStats() (*Stats, error) {
stats.L3CacheInfo = l3CacheInfo stats.L3CacheInfo = l3CacheInfo
// The read-only L3 cache schema in root // The read-only L3 cache schema in root
rootPath, err := getIntelRdtRoot() for _, schemaRoot := range schemaRootStrings {
if err != nil { if strings.Contains(schemaRoot, "L3") {
return nil, err stats.L3CacheSchemaRoot = strings.TrimSpace(schemaRoot)
} }
tmpRootStrings, err := getIntelRdtParamString(rootPath, "schemata")
if err != nil {
return nil, err
} }
// L3 cache schema is in the first line
schemaRootStrings := strings.Split(tmpRootStrings, "\n")
stats.L3CacheSchemaRoot = schemaRootStrings[0]
// The L3 cache schema in 'container_id' group // The L3 cache schema in 'container_id' group
tmpStrings, err := getIntelRdtParamString(m.GetPath(), "schemata") for _, schema := range schemaStrings {
if strings.Contains(schema, "L3") {
stats.L3CacheSchema = strings.TrimSpace(schema)
}
}
}
if IsMbaEnabled() {
// The read-only memory bandwidth information
memBwInfo, err := getMemBwInfo()
if err != nil { if err != nil {
return nil, err return nil, err
} }
// L3 cache schema is in the first line stats.MemBwInfo = memBwInfo
schemaStrings := strings.Split(tmpStrings, "\n")
stats.L3CacheSchema = schemaStrings[0] // The read-only memory bandwidth schema in root
for _, schemaRoot := range schemaRootStrings {
if strings.Contains(schemaRoot, "MB") {
stats.MemBwSchemaRoot = strings.TrimSpace(schemaRoot)
}
}
// The memory bandwidth schema in 'container_id' group
for _, schema := range schemaStrings {
if strings.Contains(schema, "MB") {
stats.MemBwSchema = strings.TrimSpace(schema)
}
}
}
return stats, nil return stats, nil
} }
// Set Intel RDT "resource control" filesystem as configured. // Set Intel RDT "resource control" filesystem as configured.
func (m *IntelRdtManager) Set(container *configs.Config) error { func (m *IntelRdtManager) Set(container *configs.Config) error {
path := m.GetPath() // About L3 cache schema:
// It has allocation bitmasks/values for L3 cache on each socket,
// About L3 cache schema file:
// The schema has allocation masks/values for L3 cache on each socket,
// which contains L3 cache id and capacity bitmask (CBM). // which contains L3 cache id and capacity bitmask (CBM).
// Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..." // Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
// For example, on a two-socket machine, L3's schema line could be: // For example, on a two-socket machine, the schema line could be:
// L3:0=ff;1=c0 // L3:0=ff;1=c0
// Which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0. // which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM
// is 0xc0.
// //
// About L3 cache CBM validity:
// The valid L3 cache CBM is a *contiguous bits set* and number of // The valid L3 cache CBM is a *contiguous bits set* and number of
// bits that can be set is less than the max bit. The max bits in the // bits that can be set is less than the max bit. The max bits in the
// CBM is varied among supported Intel Xeon platforms. In Intel RDT // CBM is varied among supported Intel CPU models. Kernel will check
// "resource control" filesystem layout, the CBM in a group should // if it is valid when writing. e.g., default value 0xfffff in root
// be a subset of the CBM in root. Kernel will check if it is valid // indicates the max bits of CBM is 20 bits, which mapping to entire
// when writing. // L3 cache capacity. Some valid CBM values to set in a group:
// e.g., 0xfffff in root indicates the max bits of CBM is 20 bits, // 0xf, 0xf0, 0x3ff, 0x1f00 and etc.
// which mapping to entire L3 cache capacity. Some valid CBM values //
// to set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc. //
// About memory bandwidth schema:
// It has allocation values for memory bandwidth on each socket, which
// contains L3 cache id and memory bandwidth percentage.
// Format: "MB:<cache_id0>=bandwidth0;<cache_id1>=bandwidth1;..."
// For example, on a two-socket machine, the schema line could be:
// "MB:0=20;1=70"
//
// The minimum bandwidth percentage value for each CPU model is
// predefined and can be looked up through "info/MB/min_bandwidth".
// The bandwidth granularity that is allocated is also dependent on
// the CPU model and can be looked up at "info/MB/bandwidth_gran".
// The available bandwidth control steps are: min_bw + N * bw_gran.
// Intermediate values are rounded to the next control step available
// on the hardware.
if container.IntelRdt != nil { if container.IntelRdt != nil {
path := m.GetPath()
l3CacheSchema := container.IntelRdt.L3CacheSchema l3CacheSchema := container.IntelRdt.L3CacheSchema
if l3CacheSchema != "" { memBwSchema := container.IntelRdt.MemBwSchema
// Write a single joint schema string to schemata file
if l3CacheSchema != "" && memBwSchema != "" {
if err := writeFile(path, "schemata", l3CacheSchema+"\n"+memBwSchema); err != nil {
return NewLastCmdError(err)
}
}
// Write only L3 cache schema string to schemata file
if l3CacheSchema != "" && memBwSchema == "" {
if err := writeFile(path, "schemata", l3CacheSchema); err != nil { if err := writeFile(path, "schemata", l3CacheSchema); err != nil {
return err return NewLastCmdError(err)
}
}
// Write only memory bandwidth schema string to schemata file
if l3CacheSchema == "" && memBwSchema != "" {
if err := writeFile(path, "schemata", memBwSchema); err != nil {
return NewLastCmdError(err)
} }
} }
} }
@ -521,11 +678,11 @@ func (m *IntelRdtManager) Set(container *configs.Config) error {
func (raw *intelRdtData) join(id string) (string, error) { func (raw *intelRdtData) join(id string) (string, error) {
path := filepath.Join(raw.root, id) path := filepath.Join(raw.root, id)
if err := os.MkdirAll(path, 0755); err != nil { if err := os.MkdirAll(path, 0755); err != nil {
return "", err return "", NewLastCmdError(err)
} }
if err := WriteIntelRdtTasks(path, raw.pid); err != nil { if err := WriteIntelRdtTasks(path, raw.pid); err != nil {
return "", err return "", NewLastCmdError(err)
} }
return path, nil return path, nil
} }
@ -551,3 +708,23 @@ func IsNotFound(err error) bool {
_, ok := err.(*NotFoundError) _, ok := err.(*NotFoundError)
return ok return ok
} }
type LastCmdError struct {
LastCmdStatus string
Err error
}
func (e *LastCmdError) Error() string {
return fmt.Sprintf(e.Err.Error() + ", last_cmd_status: " + e.LastCmdStatus)
}
func NewLastCmdError(err error) error {
lastCmdStatus, err1 := getLastCmdStatus()
if err1 == nil {
return &LastCmdError{
LastCmdStatus: lastCmdStatus,
Err: err,
}
}
return err
}

View File

@ -8,6 +8,13 @@ type L3CacheInfo struct {
NumClosids uint64 `json:"num_closids,omitempty"` NumClosids uint64 `json:"num_closids,omitempty"`
} }
type MemBwInfo struct {
BandwidthGran uint64 `json:"bandwidth_gran,omitempty"`
DelayLinear uint64 `json:"delay_linear,omitempty"`
MinBandwidth uint64 `json:"min_bandwidth,omitempty"`
NumClosids uint64 `json:"num_closids,omitempty"`
}
type Stats struct { type Stats struct {
// The read-only L3 cache information // The read-only L3 cache information
L3CacheInfo *L3CacheInfo `json:"l3_cache_info,omitempty"` L3CacheInfo *L3CacheInfo `json:"l3_cache_info,omitempty"`
@ -17,6 +24,15 @@ type Stats struct {
// The L3 cache schema in 'container_id' group // The L3 cache schema in 'container_id' group
L3CacheSchema string `json:"l3_cache_schema,omitempty"` L3CacheSchema string `json:"l3_cache_schema,omitempty"`
// The read-only memory bandwidth information
MemBwInfo *MemBwInfo `json:"mem_bw_info,omitempty"`
// The read-only memory bandwidth schema in root
MemBwSchemaRoot string `json:"mem_bw_schema_root,omitempty"`
// The memory bandwidth schema in 'container_id' group
MemBwSchema string `json:"mem_bw_schema,omitempty"`
} }
func NewStats() *Stats { func NewStats() *Stats {

View File

@ -8,6 +8,7 @@ go_library(
visibility = ["//visibility:public"], visibility = ["//visibility:public"],
deps = select({ deps = select({
"@io_bazel_rules_go//go/platform:linux": [ "@io_bazel_rules_go//go/platform:linux": [
"//vendor/github.com/pkg/errors:go_default_library",
"//vendor/golang.org/x/sys/unix:go_default_library", "//vendor/golang.org/x/sys/unix:go_default_library",
], ],
"//conditions:default": [], "//conditions:default": [],

View File

@ -7,6 +7,8 @@ import (
"strconv" "strconv"
"strings" "strings"
"github.com/pkg/errors"
"golang.org/x/sys/unix" "golang.org/x/sys/unix"
) )
@ -15,7 +17,7 @@ type KeySerial uint32
func JoinSessionKeyring(name string) (KeySerial, error) { func JoinSessionKeyring(name string) (KeySerial, error) {
sessKeyId, err := unix.KeyctlJoinSessionKeyring(name) sessKeyId, err := unix.KeyctlJoinSessionKeyring(name)
if err != nil { if err != nil {
return 0, fmt.Errorf("could not create session key: %v", err) return 0, errors.Wrap(err, "create session key")
} }
return KeySerial(sessKeyId), nil return KeySerial(sessKeyId), nil
} }
@ -42,9 +44,5 @@ func ModKeyringPerm(ringId KeySerial, mask, setbits uint32) error {
perm := (uint32(perm64) & mask) | setbits perm := (uint32(perm64) & mask) | setbits
if err := unix.KeyctlSetperm(int(ringId), perm); err != nil { return unix.KeyctlSetperm(int(ringId), perm)
return err
}
return nil
} }

View File

@ -17,7 +17,7 @@ const (
GidmapAttr uint16 = 27284 GidmapAttr uint16 = 27284
SetgroupAttr uint16 = 27285 SetgroupAttr uint16 = 27285
OomScoreAdjAttr uint16 = 27286 OomScoreAdjAttr uint16 = 27286
RootlessAttr uint16 = 27287 RootlessEUIDAttr uint16 = 27287
UidmapPathAttr uint16 = 27288 UidmapPathAttr uint16 = 27288
GidmapPathAttr uint16 = 27289 GidmapPathAttr uint16 = 27289
) )

View File

@ -5,18 +5,15 @@ package libcontainer
import ( import (
"fmt" "fmt"
"io/ioutil" "io/ioutil"
"net"
"path/filepath" "path/filepath"
"strconv" "strconv"
"strings" "strings"
"github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/utils"
"github.com/vishvananda/netlink" "github.com/vishvananda/netlink"
) )
var strategies = map[string]networkStrategy{ var strategies = map[string]networkStrategy{
"veth": &veth{},
"loopback": &loopback{}, "loopback": &loopback{},
} }
@ -103,157 +100,3 @@ func (l *loopback) attach(n *configs.Network) (err error) {
func (l *loopback) detach(n *configs.Network) (err error) { func (l *loopback) detach(n *configs.Network) (err error) {
return nil return nil
} }
// veth is a network strategy that uses a bridge and creates
// a veth pair, one that is attached to the bridge on the host and the other
// is placed inside the container's namespace
type veth struct {
}
func (v *veth) detach(n *configs.Network) (err error) {
return netlink.LinkSetMaster(&netlink.Device{LinkAttrs: netlink.LinkAttrs{Name: n.HostInterfaceName}}, nil)
}
// attach a container network interface to an external network
func (v *veth) attach(n *configs.Network) (err error) {
brl, err := netlink.LinkByName(n.Bridge)
if err != nil {
return err
}
br, ok := brl.(*netlink.Bridge)
if !ok {
return fmt.Errorf("Wrong device type %T", brl)
}
host, err := netlink.LinkByName(n.HostInterfaceName)
if err != nil {
return err
}
if err := netlink.LinkSetMaster(host, br); err != nil {
return err
}
if err := netlink.LinkSetMTU(host, n.Mtu); err != nil {
return err
}
if n.HairpinMode {
if err := netlink.LinkSetHairpin(host, true); err != nil {
return err
}
}
if err := netlink.LinkSetUp(host); err != nil {
return err
}
return nil
}
func (v *veth) create(n *network, nspid int) (err error) {
tmpName, err := v.generateTempPeerName()
if err != nil {
return err
}
n.TempVethPeerName = tmpName
if n.Bridge == "" {
return fmt.Errorf("bridge is not specified")
}
veth := &netlink.Veth{
LinkAttrs: netlink.LinkAttrs{
Name: n.HostInterfaceName,
TxQLen: n.TxQueueLen,
},
PeerName: n.TempVethPeerName,
}
if err := netlink.LinkAdd(veth); err != nil {
return err
}
defer func() {
if err != nil {
netlink.LinkDel(veth)
}
}()
if err := v.attach(&n.Network); err != nil {
return err
}
child, err := netlink.LinkByName(n.TempVethPeerName)
if err != nil {
return err
}
return netlink.LinkSetNsPid(child, nspid)
}
// generateTempPeerName returns a random interface name made of the prefix
// "veth" followed by 7 characters produced by utils.GenerateRandomName.
func (v *veth) generateTempPeerName() (string, error) {
return utils.GenerateRandomName("veth", 7)
}
// initialize configures the peer interface named config.TempVethPeerName.
//
// The interface is taken down, renamed to config.Name, given the optional MAC
// address, the IPv4 address and the optional IPv6 address, set to config.Mtu
// and brought up. Afterwards a route is added for each configured gateway
// (IPv4 first, then IPv6).
//
// An error is returned when no peer name is set, when an address or gateway
// cannot be parsed, or when any netlink operation fails.
func (v *veth) initialize(config *network) error {
	peer := config.TempVethPeerName
	if peer == "" {
		return fmt.Errorf("peer is not specified")
	}
	child, err := netlink.LinkByName(peer)
	if err != nil {
		return err
	}
	if err := netlink.LinkSetDown(child); err != nil {
		return err
	}
	if err := netlink.LinkSetName(child, config.Name); err != nil {
		return err
	}
	// get the interface again after we changed the name as the index also changes.
	if child, err = netlink.LinkByName(config.Name); err != nil {
		return err
	}
	if config.MacAddress != "" {
		mac, err := net.ParseMAC(config.MacAddress)
		if err != nil {
			return err
		}
		if err := netlink.LinkSetHardwareAddr(child, mac); err != nil {
			return err
		}
	}
	ip, err := netlink.ParseAddr(config.Address)
	if err != nil {
		return err
	}
	if err := netlink.AddrAdd(child, ip); err != nil {
		return err
	}
	if config.IPv6Address != "" {
		ip6, err := netlink.ParseAddr(config.IPv6Address)
		if err != nil {
			return err
		}
		if err := netlink.AddrAdd(child, ip6); err != nil {
			return err
		}
	}
	if err := netlink.LinkSetMTU(child, config.Mtu); err != nil {
		return err
	}
	if err := netlink.LinkSetUp(child); err != nil {
		return err
	}
	for _, gateway := range []string{config.Gateway, config.IPv6Gateway} {
		if gateway == "" {
			continue
		}
		// net.ParseIP reports malformed input by returning nil; reject it
		// instead of installing a route without a gateway.
		gw := net.ParseIP(gateway)
		if gw == nil {
			return fmt.Errorf("invalid gateway address %q", gateway)
		}
		if err := netlink.RouteAdd(&netlink.Route{
			Scope:     netlink.SCOPE_UNIVERSE,
			LinkIndex: child.Attrs().Index,
			Gw:        gw,
		}); err != nil {
			return err
		}
	}
	return nil
}

View File

@ -72,6 +72,9 @@ type Process struct {
// ConsoleSocket provides the masterfd console. // ConsoleSocket provides the masterfd console.
ConsoleSocket *os.File ConsoleSocket *os.File
// Init specifies whether the process is the first process in the container.
Init bool
ops processOperations ops processOperations
} }

View File

@ -50,6 +50,7 @@ type setnsProcess struct {
parentPipe *os.File parentPipe *os.File
childPipe *os.File childPipe *os.File
cgroupPaths map[string]string cgroupPaths map[string]string
rootlessCgroups bool
intelRdtPath string intelRdtPath string
config *initConfig config *initConfig
fds []string fds []string
@ -86,7 +87,7 @@ func (p *setnsProcess) start() (err error) {
return newSystemErrorWithCause(err, "executing setns process") return newSystemErrorWithCause(err, "executing setns process")
} }
if len(p.cgroupPaths) > 0 { if len(p.cgroupPaths) > 0 {
if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil { if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil && !p.rootlessCgroups {
return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid()) return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid())
} }
} }
@ -537,7 +538,7 @@ func (p *Process) InitializeIO(rootuid, rootgid int) (i *IO, err error) {
} }
fds = append(fds, r.Fd(), w.Fd()) fds = append(fds, r.Fd(), w.Fd())
p.Stderr, i.Stderr = w, r p.Stderr, i.Stderr = w, r
// change ownership of the pipes incase we are in a user namespace // change ownership of the pipes in case we are in a user namespace
for _, fd := range fds { for _, fd := range fds {
if err := unix.Fchown(int(fd), rootuid, rootgid); err != nil { if err := unix.Fchown(int(fd), rootuid, rootgid); err != nil {
return nil, err return nil, err

View File

@ -46,6 +46,7 @@ func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig) (err error) {
return newSystemErrorWithCause(err, "preparing rootfs") return newSystemErrorWithCause(err, "preparing rootfs")
} }
setupDev := needsSetupDev(config)
for _, m := range config.Mounts { for _, m := range config.Mounts {
for _, precmd := range m.PremountCmds { for _, precmd := range m.PremountCmds {
if err := mountCmd(precmd); err != nil { if err := mountCmd(precmd); err != nil {
@ -64,8 +65,6 @@ func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig) (err error) {
} }
} }
setupDev := needsSetupDev(config)
if setupDev { if setupDev {
if err := createDevices(config); err != nil { if err := createDevices(config); err != nil {
return newSystemErrorWithCause(err, "creating device nodes") return newSystemErrorWithCause(err, "creating device nodes")
@ -153,6 +152,26 @@ func finalizeRootfs(config *configs.Config) (err error) {
return nil return nil
} }
// prepareTmp creates a fresh directory below topTmpDir, bind-mounts it onto
// itself and marks that mount private.
//
// /tmp has to be mounted as private to allow MS_MOVE to work in all situations.
//
// On success the path of the new directory is returned. On failure everything
// created so far is removed again (best effort) and the original error is
// returned.
func prepareTmp(topTmpDir string) (string, error) {
	tmpdir, err := ioutil.TempDir(topTmpDir, "runctop")
	if err != nil {
		return "", err
	}
	if err := unix.Mount(tmpdir, tmpdir, "bind", unix.MS_BIND, ""); err != nil {
		// Nothing is mounted yet; just drop the empty directory. The
		// removal error is ignored so the mount error is the one reported.
		os.Remove(tmpdir)
		return "", err
	}
	if err := unix.Mount("", tmpdir, "", uintptr(unix.MS_PRIVATE), ""); err != nil {
		// Undo the bind mount before removing the directory. Both are
		// best effort; the mount error is the one reported.
		unix.Unmount(tmpdir, 0)
		os.Remove(tmpdir)
		return "", err
	}
	return tmpdir, nil
}
// cleanupTmp unmounts tmpdir and then removes it together with its contents.
// The result of the unmount is ignored; only the error from os.RemoveAll is
// returned.
func cleanupTmp(tmpdir string) error {
unix.Unmount(tmpdir, 0)
return os.RemoveAll(tmpdir)
}
func mountCmd(cmd configs.Command) error { func mountCmd(cmd configs.Command) error {
command := exec.Command(cmd.Path, cmd.Args[:]...) command := exec.Command(cmd.Path, cmd.Args[:]...)
command.Env = cmd.Env command.Env = cmd.Env
@ -200,7 +219,12 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error {
} }
} }
if copyUp { if copyUp {
tmpDir, err = ioutil.TempDir("/tmp", "runctmpdir") tmpdir, err := prepareTmp("/tmp")
if err != nil {
return newSystemErrorWithCause(err, "tmpcopyup: failed to setup tmpdir")
}
defer cleanupTmp(tmpdir)
tmpDir, err = ioutil.TempDir(tmpdir, "runctmpdir")
if err != nil { if err != nil {
return newSystemErrorWithCause(err, "tmpcopyup: failed to create tmpdir") return newSystemErrorWithCause(err, "tmpcopyup: failed to create tmpdir")
} }
@ -397,6 +421,7 @@ func checkMountDestination(rootfs, dest string) error {
"/proc/stat", "/proc/stat",
"/proc/swaps", "/proc/swaps",
"/proc/uptime", "/proc/uptime",
"/proc/loadavg",
"/proc/net/dev", "/proc/net/dev",
} }
for _, valid := range validDestinations { for _, valid := range validDestinations {
@ -413,7 +438,7 @@ func checkMountDestination(rootfs, dest string) error {
if err != nil { if err != nil {
return err return err
} }
if path == "." || !strings.HasPrefix(path, "..") { if path != "." && !strings.HasPrefix(path, "..") {
return fmt.Errorf("%q cannot be mounted because it is located inside %q", dest, invalid) return fmt.Errorf("%q cannot be mounted because it is located inside %q", dest, invalid)
} }
} }
@ -803,10 +828,7 @@ func remount(m *configs.Mount, rootfs string) error {
if !strings.HasPrefix(dest, rootfs) { if !strings.HasPrefix(dest, rootfs) {
dest = filepath.Join(rootfs, dest) dest = filepath.Join(rootfs, dest)
} }
if err := unix.Mount(m.Source, dest, m.Device, uintptr(m.Flags|unix.MS_REMOUNT), ""); err != nil { return unix.Mount(m.Source, dest, m.Device, uintptr(m.Flags|unix.MS_REMOUNT), "")
return err
}
return nil
} }
// Do the mount operation followed by additional mounts required to take care // Do the mount operation followed by additional mounts required to take care

View File

@ -5,12 +5,14 @@ package libcontainer
import ( import (
"fmt" "fmt"
"os" "os"
"runtime"
"github.com/opencontainers/runc/libcontainer/apparmor" "github.com/opencontainers/runc/libcontainer/apparmor"
"github.com/opencontainers/runc/libcontainer/keys" "github.com/opencontainers/runc/libcontainer/keys"
"github.com/opencontainers/runc/libcontainer/seccomp" "github.com/opencontainers/runc/libcontainer/seccomp"
"github.com/opencontainers/runc/libcontainer/system" "github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/selinux/go-selinux/label" "github.com/opencontainers/selinux/go-selinux/label"
"github.com/pkg/errors"
"golang.org/x/sys/unix" "golang.org/x/sys/unix"
) )
@ -28,10 +30,19 @@ func (l *linuxSetnsInit) getSessionRingName() string {
} }
func (l *linuxSetnsInit) Init() error { func (l *linuxSetnsInit) Init() error {
runtime.LockOSThread()
defer runtime.UnlockOSThread()
if !l.config.Config.NoNewKeyring { if !l.config.Config.NoNewKeyring {
// do not inherit the parent's session keyring // Do not inherit the parent's session keyring.
if _, err := keys.JoinSessionKeyring(l.getSessionRingName()); err != nil { if _, err := keys.JoinSessionKeyring(l.getSessionRingName()); err != nil {
return err // Same justification as in standard_init_linux.go as to why we
// don't bail on ENOSYS.
//
// TODO(cyphar): And we should have logging here too.
if errors.Cause(err) != unix.ENOSYS {
return errors.Wrap(err, "join session keyring")
}
} }
} }
if l.config.CreateConsole { if l.config.CreateConsole {
@ -47,6 +58,10 @@ func (l *linuxSetnsInit) Init() error {
return err return err
} }
} }
if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
return err
}
defer label.SetProcessLabel("")
// Without NoNewPrivileges seccomp is a privileged operation, so we need to // Without NoNewPrivileges seccomp is a privileged operation, so we need to
// do this before dropping capabilities; otherwise do it as late as possible // do this before dropping capabilities; otherwise do it as late as possible
// just before execve so as few syscalls take place after it as possible. // just before execve so as few syscalls take place after it as possible.
@ -61,9 +76,6 @@ func (l *linuxSetnsInit) Init() error {
if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil { if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
return err return err
} }
if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
return err
}
// Set seccomp as close to execve as possible, so as few syscalls take // Set seccomp as close to execve as possible, so as few syscalls take
// place afterward (reducing the amount of syscalls that users need to // place afterward (reducing the amount of syscalls that users need to
// enable in their seccomp profiles). // enable in their seccomp profiles).

View File

@ -6,6 +6,7 @@ import (
"fmt" "fmt"
"os" "os"
"os/exec" "os/exec"
"runtime"
"syscall" //only for Exec "syscall" //only for Exec
"github.com/opencontainers/runc/libcontainer/apparmor" "github.com/opencontainers/runc/libcontainer/apparmor"
@ -14,6 +15,7 @@ import (
"github.com/opencontainers/runc/libcontainer/seccomp" "github.com/opencontainers/runc/libcontainer/seccomp"
"github.com/opencontainers/runc/libcontainer/system" "github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/selinux/go-selinux/label" "github.com/opencontainers/selinux/go-selinux/label"
"github.com/pkg/errors"
"golang.org/x/sys/unix" "golang.org/x/sys/unix"
) )
@ -43,17 +45,31 @@ func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) {
} }
func (l *linuxStandardInit) Init() error { func (l *linuxStandardInit) Init() error {
runtime.LockOSThread()
defer runtime.UnlockOSThread()
if !l.config.Config.NoNewKeyring { if !l.config.Config.NoNewKeyring {
ringname, keepperms, newperms := l.getSessionRingParams() ringname, keepperms, newperms := l.getSessionRingParams()
// Do not inherit the parent's session keyring. // Do not inherit the parent's session keyring.
sessKeyId, err := keys.JoinSessionKeyring(ringname) if sessKeyId, err := keys.JoinSessionKeyring(ringname); err != nil {
if err != nil { // If keyrings aren't supported then it is likely we are on an
return err // older kernel (or inside an LXC container). While we could bail,
// the security feature we are using here is best-effort (it only
// really provides marginal protection since VFS credentials are
// the only significant protection of keyrings).
//
// TODO(cyphar): Log this so people know what's going on, once we
// have proper logging in 'runc init'.
if errors.Cause(err) != unix.ENOSYS {
return errors.Wrap(err, "join session keyring")
} }
// Make session keyring searcheable. } else {
// Make session keyring searchable. If we've gotten this far we
// bail on any error -- we don't want to have a keyring with bad
// permissions.
if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil { if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil {
return err return errors.Wrap(err, "mod keyring permissions")
}
} }
} }
@ -76,7 +92,7 @@ func (l *linuxStandardInit) Init() error {
return err return err
} }
if err := system.Setctty(); err != nil { if err := system.Setctty(); err != nil {
return err return errors.Wrap(err, "setctty")
} }
} }
@ -89,46 +105,47 @@ func (l *linuxStandardInit) Init() error {
if hostname := l.config.Config.Hostname; hostname != "" { if hostname := l.config.Config.Hostname; hostname != "" {
if err := unix.Sethostname([]byte(hostname)); err != nil { if err := unix.Sethostname([]byte(hostname)); err != nil {
return err return errors.Wrap(err, "sethostname")
} }
} }
if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil { if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
return err return errors.Wrap(err, "apply apparmor profile")
}
if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
return err
} }
for key, value := range l.config.Config.Sysctl { for key, value := range l.config.Config.Sysctl {
if err := writeSystemProperty(key, value); err != nil { if err := writeSystemProperty(key, value); err != nil {
return err return errors.Wrapf(err, "write sysctl key %s", key)
} }
} }
for _, path := range l.config.Config.ReadonlyPaths { for _, path := range l.config.Config.ReadonlyPaths {
if err := readonlyPath(path); err != nil { if err := readonlyPath(path); err != nil {
return err return errors.Wrapf(err, "readonly path %s", path)
} }
} }
for _, path := range l.config.Config.MaskPaths { for _, path := range l.config.Config.MaskPaths {
if err := maskPath(path, l.config.Config.MountLabel); err != nil { if err := maskPath(path, l.config.Config.MountLabel); err != nil {
return err return errors.Wrapf(err, "mask path %s", path)
} }
} }
pdeath, err := system.GetParentDeathSignal() pdeath, err := system.GetParentDeathSignal()
if err != nil { if err != nil {
return err return errors.Wrap(err, "get pdeath signal")
} }
if l.config.NoNewPrivileges { if l.config.NoNewPrivileges {
if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil { if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
return err return errors.Wrap(err, "set nonewprivileges")
} }
} }
// Tell our parent that we're ready to Execv. This must be done before the // Tell our parent that we're ready to Execv. This must be done before the
// Seccomp rules have been applied, because we need to be able to read and // Seccomp rules have been applied, because we need to be able to read and
// write to a socket. // write to a socket.
if err := syncParentReady(l.pipe); err != nil { if err := syncParentReady(l.pipe); err != nil {
return err return errors.Wrap(err, "sync ready")
} }
if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
return errors.Wrap(err, "set process label")
}
defer label.SetProcessLabel("")
// Without NoNewPrivileges seccomp is a privileged operation, so we need to // Without NoNewPrivileges seccomp is a privileged operation, so we need to
// do this before dropping capabilities; otherwise do it as late as possible // do this before dropping capabilities; otherwise do it as late as possible
// just before execve so as few syscalls take place after it as possible. // just before execve so as few syscalls take place after it as possible.
@ -143,7 +160,7 @@ func (l *linuxStandardInit) Init() error {
// finalizeNamespace can change user/group which clears the parent death // finalizeNamespace can change user/group which clears the parent death
// signal, so we restore it here. // signal, so we restore it here.
if err := pdeath.Restore(); err != nil { if err := pdeath.Restore(); err != nil {
return err return errors.Wrap(err, "restore pdeath signal")
} }
// Compare the parent from the initial start of the init process and make // Compare the parent from the initial start of the init process and make
// sure that it did not change. if the parent changes that means it died // sure that it did not change. if the parent changes that means it died

View File

@ -41,10 +41,7 @@ type syncT struct {
// writeSync is used to write to a synchronisation pipe. An error is returned // writeSync is used to write to a synchronisation pipe. An error is returned
// if there was a problem writing the payload. // if there was a problem writing the payload.
func writeSync(pipe io.Writer, sync syncType) error { func writeSync(pipe io.Writer, sync syncType) error {
if err := utils.WriteJSON(pipe, syncT{sync}); err != nil { return utils.WriteJSON(pipe, syncT{sync})
return err
}
return nil
} }
// readSync is used to read from a synchronisation pipe. An error is returned // readSync is used to read from a synchronisation pipe. An error is returned

View File

@ -17,6 +17,41 @@ go_library(
importpath = "github.com/opencontainers/runc/libcontainer/system", importpath = "github.com/opencontainers/runc/libcontainer/system",
visibility = ["//visibility:public"], visibility = ["//visibility:public"],
deps = select({ deps = select({
"@io_bazel_rules_go//go/platform:android": [
"//vendor/github.com/opencontainers/runc/libcontainer/user:go_default_library",
],
"@io_bazel_rules_go//go/platform:darwin": [
"//vendor/github.com/opencontainers/runc/libcontainer/user:go_default_library",
],
"@io_bazel_rules_go//go/platform:dragonfly": [
"//vendor/github.com/opencontainers/runc/libcontainer/user:go_default_library",
],
"@io_bazel_rules_go//go/platform:freebsd": [
"//vendor/github.com/opencontainers/runc/libcontainer/user:go_default_library",
],
"@io_bazel_rules_go//go/platform:linux": [
"//vendor/github.com/opencontainers/runc/libcontainer/user:go_default_library",
],
"@io_bazel_rules_go//go/platform:nacl": [
"//vendor/github.com/opencontainers/runc/libcontainer/user:go_default_library",
],
"@io_bazel_rules_go//go/platform:netbsd": [
"//vendor/github.com/opencontainers/runc/libcontainer/user:go_default_library",
],
"@io_bazel_rules_go//go/platform:openbsd": [
"//vendor/github.com/opencontainers/runc/libcontainer/user:go_default_library",
],
"@io_bazel_rules_go//go/platform:plan9": [
"//vendor/github.com/opencontainers/runc/libcontainer/user:go_default_library",
],
"@io_bazel_rules_go//go/platform:solaris": [
"//vendor/github.com/opencontainers/runc/libcontainer/user:go_default_library",
],
"@io_bazel_rules_go//go/platform:windows": [
"//vendor/github.com/opencontainers/runc/libcontainer/user:go_default_library",
],
"//conditions:default": [],
}) + select({
"@io_bazel_rules_go//go/platform:linux_386": [ "@io_bazel_rules_go//go/platform:linux_386": [
"//vendor/golang.org/x/sys/unix:go_default_library", "//vendor/golang.org/x/sys/unix:go_default_library",
], ],

View File

@ -3,13 +3,12 @@
package system package system
import ( import (
"bufio"
"fmt"
"os" "os"
"os/exec" "os/exec"
"syscall" // only for exec "syscall" // only for exec
"unsafe" "unsafe"
"github.com/opencontainers/runc/libcontainer/user"
"golang.org/x/sys/unix" "golang.org/x/sys/unix"
) )
@ -102,34 +101,43 @@ func Setctty() error {
} }
// RunningInUserNS detects whether we are currently running in a user namespace. // RunningInUserNS detects whether we are currently running in a user namespace.
// Copied from github.com/lxc/lxd/shared/util.go // Originally copied from github.com/lxc/lxd/shared/util.go
func RunningInUserNS() bool { func RunningInUserNS() bool {
file, err := os.Open("/proc/self/uid_map") uidmap, err := user.CurrentProcessUIDMap()
if err != nil { if err != nil {
// This kernel-provided file only exists if user namespaces are supported // This kernel-provided file only exists if user namespaces are supported
return false return false
} }
defer file.Close() return UIDMapInUserNS(uidmap)
}
buf := bufio.NewReader(file) func UIDMapInUserNS(uidmap []user.IDMap) bool {
l, _, err := buf.ReadLine()
if err != nil {
return false
}
line := string(l)
var a, b, c int64
fmt.Sscanf(line, "%d %d %d", &a, &b, &c)
/* /*
* We assume we are in the initial user namespace if we have a full * We assume we are in the initial user namespace if we have a full
* range - 4294967295 uids starting at uid 0. * range - 4294967295 uids starting at uid 0.
*/ */
if a == 0 && b == 0 && c == 4294967295 { if len(uidmap) == 1 && uidmap[0].ID == 0 && uidmap[0].ParentID == 0 && uidmap[0].Count == 4294967295 {
return false return false
} }
return true return true
} }
// GetParentNSeuid translates the effective UID of the calling process into
// the parent user namespace.
//
// The process's UID map is searched for the entry whose range contains the
// effective UID; the UID is then shifted by that entry's ParentID. If the map
// cannot be read, or no entry covers the UID, the effective UID is returned
// unchanged.
func GetParentNSeuid() int64 {
	euid := int64(os.Geteuid())
	uidmap, err := user.CurrentProcessUIDMap()
	if err != nil {
		// This kernel-provided file only exists if user namespaces are supported
		return euid
	}
	for _, entry := range uidmap {
		last := entry.ID + entry.Count - 1
		if euid < entry.ID || euid > last {
			continue
		}
		return entry.ParentID + euid - entry.ID
	}
	return euid
}
// SetSubreaper sets the value i as the subreaper setting for the calling process // SetSubreaper sets the value i as the subreaper setting for the calling process
func SetSubreaper(i int) error { func SetSubreaper(i int) error {
return unix.Prctl(PR_SET_CHILD_SUBREAPER, uintptr(i), 0, 0, 0) return unix.Prctl(PR_SET_CHILD_SUBREAPER, uintptr(i), 0, 0, 0)

View File

@ -2,8 +2,26 @@
package system package system
import (
"os"
"github.com/opencontainers/runc/libcontainer/user"
)
// RunningInUserNS is a stub for non-Linux systems // RunningInUserNS is a stub for non-Linux systems
// Always returns false // Always returns false
func RunningInUserNS() bool { func RunningInUserNS() bool {
return false return false
} }
// UIDMapInUserNS is a stub for non-Linux systems.
// The uidmap argument is ignored and the function always returns false.
func UIDMapInUserNS(uidmap []user.IDMap) bool {
return false
}
// GetParentNSeuid returns the euid within the parent user namespace.
// On non-Linux systems this is always os.Geteuid().
//
// NOTE(review): the Linux implementation of this function is declared with an
// int64 result while this stub returns int — confirm that callers compile
// against both variants.
func GetParentNSeuid() int {
return os.Geteuid()
}

View File

@ -5,6 +5,7 @@ package user
import ( import (
"io" "io"
"os" "os"
"strconv"
"golang.org/x/sys/unix" "golang.org/x/sys/unix"
) )
@ -114,3 +115,30 @@ func CurrentUser() (User, error) {
func CurrentGroup() (Group, error) { func CurrentGroup() (Group, error) {
return LookupGid(unix.Getgid()) return LookupGid(unix.Getgid())
} }
// currentUserSubIDs parses the subid-formatted file fileName and returns only
// the entries that belong to the current user, i.e. those whose name field is
// either the user's name or the user's numeric UID written in decimal.
func currentUserSubIDs(fileName string) ([]SubID, error) {
	u, err := CurrentUser()
	if err != nil {
		return nil, err
	}
	uid := strconv.Itoa(u.Uid)
	return ParseSubIDFileFilter(fileName, func(entry SubID) bool {
		return entry.Name == u.Name || entry.Name == uid
	})
}
// CurrentUserSubUIDs returns the entries of /etc/subuid that belong to the
// current user.
func CurrentUserSubUIDs() ([]SubID, error) {
return currentUserSubIDs("/etc/subuid")
}
// CurrentUserSubGIDs returns the entries of /etc/subgid that belong to the
// current user.
func CurrentUserSubGIDs() ([]SubID, error) {
return currentUserSubIDs("/etc/subgid")
}
// CurrentProcessUIDMap parses /proc/self/uid_map and returns its entries.
func CurrentProcessUIDMap() ([]IDMap, error) {
return ParseIDMapFile("/proc/self/uid_map")
}
// CurrentProcessGIDMap parses /proc/self/gid_map and returns its entries.
func CurrentProcessGIDMap() ([]IDMap, error) {
return ParseIDMapFile("/proc/self/gid_map")
}

View File

@ -75,12 +75,29 @@ func groupFromOS(g *user.Group) (Group, error) {
return newGroup, nil return newGroup, nil
} }
// SubID represents an entry in /etc/sub{u,g}id.
// The fields correspond, in order, to the colon-separated columns of a line.
type SubID struct {
// Name is the first column; it is matched against a user name or a
// decimal UID when filtering for the current user.
Name string
// SubID is the second column (presumably the first subordinate ID of the
// range, see subuid(5) — TODO confirm).
SubID int64
// Count is the third column (presumably the number of IDs in the range).
Count int64
}
// IDMap represents an entry in /proc/PID/{u,g}id_map.
// The fields correspond, in order, to the whitespace-separated columns of a
// line.
type IDMap struct {
// ID is the first ID of the mapped range as seen by the process.
ID int64
// ParentID is the ID in the parent namespace that ID maps to.
ParentID int64
// Count is the number of consecutive IDs covered by this entry.
Count int64
}
func parseLine(line string, v ...interface{}) { func parseLine(line string, v ...interface{}) {
if line == "" { parseParts(strings.Split(line, ":"), v...)
}
func parseParts(parts []string, v ...interface{}) {
if len(parts) == 0 {
return return
} }
parts := strings.Split(line, ":")
for i, p := range parts { for i, p := range parts {
// Ignore cases where we don't have enough fields to populate the arguments. // Ignore cases where we don't have enough fields to populate the arguments.
// Some configuration files like to misbehave. // Some configuration files like to misbehave.
@ -96,6 +113,8 @@ func parseLine(line string, v ...interface{}) {
case *int: case *int:
// "numbers", with conversion errors ignored because of some misbehaving configuration files. // "numbers", with conversion errors ignored because of some misbehaving configuration files.
*e, _ = strconv.Atoi(p) *e, _ = strconv.Atoi(p)
case *int64:
*e, _ = strconv.ParseInt(p, 10, 64)
case *[]string: case *[]string:
// Comma-separated lists. // Comma-separated lists.
if p != "" { if p != "" {
@ -105,7 +124,7 @@ func parseLine(line string, v ...interface{}) {
} }
default: default:
// Someone goof'd when writing code using this function. Scream so they can hear us. // Someone goof'd when writing code using this function. Scream so they can hear us.
panic(fmt.Sprintf("parseLine only accepts {*string, *int, *[]string} as arguments! %#v is not a pointer!", e)) panic(fmt.Sprintf("parseLine only accepts {*string, *int, *int64, *[]string} as arguments! %#v is not a pointer!", e))
} }
} }
} }
@ -479,3 +498,111 @@ func GetAdditionalGroupsPath(additionalGroups []string, groupPath string) ([]int
} }
return GetAdditionalGroups(additionalGroups, group) return GetAdditionalGroups(additionalGroups, group)
} }
// ParseSubIDFile opens the file at path and returns every subid entry found
// in it. The file is closed before returning.
func ParseSubIDFile(path string) ([]SubID, error) {
subid, err := os.Open(path)
if err != nil {
return nil, err
}
defer subid.Close()
return ParseSubID(subid)
}
// ParseSubID reads subid-formatted data from subid and returns all entries
// (it calls ParseSubIDFilter with a nil filter).
func ParseSubID(subid io.Reader) ([]SubID, error) {
return ParseSubIDFilter(subid, nil)
}
// ParseSubIDFileFilter opens the file at path and returns the subid entries
// accepted by filter. A nil filter accepts every entry. The file is closed
// before returning.
func ParseSubIDFileFilter(path string, filter func(SubID) bool) ([]SubID, error) {
subid, err := os.Open(path)
if err != nil {
return nil, err
}
defer subid.Close()
return ParseSubIDFilter(subid, filter)
}
// ParseSubIDFilter reads subid-formatted data (see subuid(5)) from r line by
// line and returns the entries for which filter returns true. A nil filter
// accepts every entry.
//
// Blank lines are skipped. Each remaining line is handed to parseLine, which
// fills Name, SubID and Count from the colon-separated fields.
//
// An error is returned when r is nil or when reading from r fails. On success
// the returned slice is never nil.
func ParseSubIDFilter(r io.Reader, filter func(SubID) bool) ([]SubID, error) {
	if r == nil {
		return nil, fmt.Errorf("nil source for subid-formatted data")
	}

	var (
		s   = bufio.NewScanner(r)
		out = []SubID{}
	)

	for s.Scan() {
		line := strings.TrimSpace(s.Text())
		if line == "" {
			continue
		}

		// see: man 5 subuid
		p := SubID{}
		parseLine(line, &p.Name, &p.SubID, &p.Count)

		if filter == nil || filter(p) {
			out = append(out, p)
		}
	}
	// Scanner.Err only reports a failure once Scan has returned false, so it
	// must be checked after the loop rather than inside it.
	if err := s.Err(); err != nil {
		return nil, err
	}

	return out, nil
}
// ParseIDMapFile opens the file at path and returns every ID-map entry found
// in it. The file is closed before returning.
func ParseIDMapFile(path string) ([]IDMap, error) {
r, err := os.Open(path)
if err != nil {
return nil, err
}
defer r.Close()
return ParseIDMap(r)
}
// ParseIDMap reads idmap-formatted data from r and returns all entries
// (it calls ParseIDMapFilter with a nil filter).
func ParseIDMap(r io.Reader) ([]IDMap, error) {
return ParseIDMapFilter(r, nil)
}
// ParseIDMapFileFilter opens the file at path and returns the ID-map entries
// accepted by filter. A nil filter accepts every entry. The file is closed
// before returning.
func ParseIDMapFileFilter(path string, filter func(IDMap) bool) ([]IDMap, error) {
r, err := os.Open(path)
if err != nil {
return nil, err
}
defer r.Close()
return ParseIDMapFilter(r, filter)
}
// ParseIDMapFilter reads idmap-formatted data (see user_namespaces(7)) from r
// line by line and returns the entries for which filter returns true. A nil
// filter accepts every entry.
//
// Blank lines are skipped. Each remaining line is split on whitespace and the
// fields are stored, in order, in ID, ParentID and Count.
//
// An error is returned when r is nil or when reading from r fails. On success
// the returned slice is never nil.
func ParseIDMapFilter(r io.Reader, filter func(IDMap) bool) ([]IDMap, error) {
	if r == nil {
		return nil, fmt.Errorf("nil source for idmap-formatted data")
	}

	var (
		s   = bufio.NewScanner(r)
		out = []IDMap{}
	)

	for s.Scan() {
		line := strings.TrimSpace(s.Text())
		if line == "" {
			continue
		}

		// see: man 7 user_namespaces
		p := IDMap{}
		parseParts(strings.Fields(line), &p.ID, &p.ParentID, &p.Count)

		if filter == nil || filter(p) {
			out = append(out, p)
		}
	}
	// Scanner.Err only reports a failure once Scan has returned false, so it
	// must be checked after the loop rather than inside it.
	if err := s.Err(); err != nil {
		return nil, err
	}

	return out, nil
}

View File

@ -1,8 +1,6 @@
package utils package utils
import ( import (
"crypto/rand"
"encoding/hex"
"encoding/json" "encoding/json"
"io" "io"
"os" "os"
@ -17,19 +15,6 @@ const (
exitSignalOffset = 128 exitSignalOffset = 128
) )
// GenerateRandomName returns prefix followed by size random lowercase
// hexadecimal characters.
//
// The random part is taken from 32 bytes read from crypto/rand, so at most 64
// characters are available: a size above 64 is reduced to 64, and a negative
// size is treated as 0 (only the prefix is returned) instead of panicking.
// An error is returned if the random source cannot be read.
func GenerateRandomName(prefix string, size int) (string, error) {
	id := make([]byte, 32)
	if _, err := io.ReadFull(rand.Reader, id); err != nil {
		return "", err
	}
	if size > 64 {
		size = 64
	}
	if size < 0 {
		size = 0
	}
	return prefix + hex.EncodeToString(id)[:size], nil
}
// ResolveRootfs ensures that the current working directory is // ResolveRootfs ensures that the current working directory is
// not a symlink and returns the absolute path to the rootfs // not a symlink and returns the absolute path to the rootfs
func ResolveRootfs(uncleanRootfs string) (string, error) { func ResolveRootfs(uncleanRootfs string) (string, error) {