Merge pull request #72114 from sjenning/bump-runc

vendor: bump runc to f000fe11
pull/564/head
Kubernetes Prow Robot 2018-12-18 21:27:49 -08:00 committed by GitHub
commit 774ac6408d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
38 changed files with 1152 additions and 681 deletions

64
Godeps/Godeps.json generated
View File

@ -2795,83 +2795,83 @@
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer",
"Comment": "v1.0.0-rc5-46-g871ba2e58e2431",
"Rev": "871ba2e58e24314d1fab4517a80410191ba5ad01"
"Comment": "v1.0.0-rc5-176-gf000fe11ece1b7",
"Rev": "f000fe11ece1b79f744edd9c8e1a53ba0f5e0f24"
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer/apparmor",
"Comment": "v1.0.0-rc5-46-g871ba2e58e2431",
"Rev": "871ba2e58e24314d1fab4517a80410191ba5ad01"
"Comment": "v1.0.0-rc5-176-gf000fe11ece1b7",
"Rev": "f000fe11ece1b79f744edd9c8e1a53ba0f5e0f24"
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer/cgroups",
"Comment": "v1.0.0-rc5-46-g871ba2e58e2431",
"Rev": "871ba2e58e24314d1fab4517a80410191ba5ad01"
"Comment": "v1.0.0-rc5-176-gf000fe11ece1b7",
"Rev": "f000fe11ece1b79f744edd9c8e1a53ba0f5e0f24"
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer/cgroups/fs",
"Comment": "v1.0.0-rc5-46-g871ba2e58e2431",
"Rev": "871ba2e58e24314d1fab4517a80410191ba5ad01"
"Comment": "v1.0.0-rc5-176-gf000fe11ece1b7",
"Rev": "f000fe11ece1b79f744edd9c8e1a53ba0f5e0f24"
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer/cgroups/systemd",
"Comment": "v1.0.0-rc5-46-g871ba2e58e2431",
"Rev": "871ba2e58e24314d1fab4517a80410191ba5ad01"
"Comment": "v1.0.0-rc5-176-gf000fe11ece1b7",
"Rev": "f000fe11ece1b79f744edd9c8e1a53ba0f5e0f24"
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer/configs",
"Comment": "v1.0.0-rc5-46-g871ba2e58e2431",
"Rev": "871ba2e58e24314d1fab4517a80410191ba5ad01"
"Comment": "v1.0.0-rc5-176-gf000fe11ece1b7",
"Rev": "f000fe11ece1b79f744edd9c8e1a53ba0f5e0f24"
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer/configs/validate",
"Comment": "v1.0.0-rc5-46-g871ba2e58e2431",
"Rev": "871ba2e58e24314d1fab4517a80410191ba5ad01"
"Comment": "v1.0.0-rc5-176-gf000fe11ece1b7",
"Rev": "f000fe11ece1b79f744edd9c8e1a53ba0f5e0f24"
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer/criurpc",
"Comment": "v1.0.0-rc5-46-g871ba2e58e2431",
"Rev": "871ba2e58e24314d1fab4517a80410191ba5ad01"
"Comment": "v1.0.0-rc5-176-gf000fe11ece1b7",
"Rev": "f000fe11ece1b79f744edd9c8e1a53ba0f5e0f24"
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer/intelrdt",
"Comment": "v1.0.0-rc5-46-g871ba2e58e2431",
"Rev": "871ba2e58e24314d1fab4517a80410191ba5ad01"
"Comment": "v1.0.0-rc5-176-gf000fe11ece1b7",
"Rev": "f000fe11ece1b79f744edd9c8e1a53ba0f5e0f24"
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer/keys",
"Comment": "v1.0.0-rc5-46-g871ba2e58e2431",
"Rev": "871ba2e58e24314d1fab4517a80410191ba5ad01"
"Comment": "v1.0.0-rc5-176-gf000fe11ece1b7",
"Rev": "f000fe11ece1b79f744edd9c8e1a53ba0f5e0f24"
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer/mount",
"Comment": "v1.0.0-rc5-46-g871ba2e58e2431",
"Rev": "871ba2e58e24314d1fab4517a80410191ba5ad01"
"Comment": "v1.0.0-rc5-176-gf000fe11ece1b7",
"Rev": "f000fe11ece1b79f744edd9c8e1a53ba0f5e0f24"
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer/seccomp",
"Comment": "v1.0.0-rc5-46-g871ba2e58e2431",
"Rev": "871ba2e58e24314d1fab4517a80410191ba5ad01"
"Comment": "v1.0.0-rc5-176-gf000fe11ece1b7",
"Rev": "f000fe11ece1b79f744edd9c8e1a53ba0f5e0f24"
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer/stacktrace",
"Comment": "v1.0.0-rc5-46-g871ba2e58e2431",
"Rev": "871ba2e58e24314d1fab4517a80410191ba5ad01"
"Comment": "v1.0.0-rc5-176-gf000fe11ece1b7",
"Rev": "f000fe11ece1b79f744edd9c8e1a53ba0f5e0f24"
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer/system",
"Comment": "v1.0.0-rc5-46-g871ba2e58e2431",
"Rev": "871ba2e58e24314d1fab4517a80410191ba5ad01"
"Comment": "v1.0.0-rc5-176-gf000fe11ece1b7",
"Rev": "f000fe11ece1b79f744edd9c8e1a53ba0f5e0f24"
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer/user",
"Comment": "v1.0.0-rc5-46-g871ba2e58e2431",
"Rev": "871ba2e58e24314d1fab4517a80410191ba5ad01"
"Comment": "v1.0.0-rc5-176-gf000fe11ece1b7",
"Rev": "f000fe11ece1b79f744edd9c8e1a53ba0f5e0f24"
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer/utils",
"Comment": "v1.0.0-rc5-46-g871ba2e58e2431",
"Rev": "871ba2e58e24314d1fab4517a80410191ba5ad01"
"Comment": "v1.0.0-rc5-176-gf000fe11ece1b7",
"Rev": "f000fe11ece1b79f744edd9c8e1a53ba0f5e0f24"
},
{
"ImportPath": "github.com/opencontainers/runtime-spec/specs-go",

View File

@ -53,6 +53,7 @@ go_library(
"//vendor/github.com/opencontainers/runc/libcontainer/system:go_default_library",
"//vendor/github.com/opencontainers/runc/libcontainer/user:go_default_library",
"//vendor/github.com/opencontainers/selinux/go-selinux/label:go_default_library",
"//vendor/github.com/pkg/errors:go_default_library",
"//vendor/github.com/sirupsen/logrus:go_default_library",
"//vendor/github.com/syndtr/gocapability/capability:go_default_library",
"//vendor/github.com/vishvananda/netlink:go_default_library",

View File

@ -323,6 +323,7 @@ generated when building libcontainer with docker.
## Copyright and license
Code and documentation copyright 2014 Docker, inc. Code released under the Apache 2.0 license.
Docs released under Creative commons.
Code and documentation copyright 2014 Docker, inc.
The code and documentation are released under the [Apache 2.0 license](../LICENSE).
The documentation is also released under Creative Commons Attribution 4.0 International License.
You may obtain a copy of the license, titled CC-BY-4.0, at http://creativecommons.org/licenses/by/4.0/.

View File

@ -156,17 +156,21 @@ init process will block waiting for the parent to finish setup.
### IntelRdt
Intel platforms with new Xeon CPU support Intel Resource Director Technology
(RDT). Cache Allocation Technology (CAT) is a sub-feature of RDT, which
currently supports L3 cache resource allocation.
Intel platforms with new Xeon CPU support Resource Director Technology (RDT).
Cache Allocation Technology (CAT) and Memory Bandwidth Allocation (MBA) are
two sub-features of RDT.
This feature provides a way for the software to restrict cache allocation to a
defined 'subset' of L3 cache which may be overlapping with other 'subsets'.
The different subsets are identified by class of service (CLOS) and each CLOS
has a capacity bitmask (CBM).
Cache Allocation Technology (CAT) provides a way for the software to restrict
cache allocation to a defined 'subset' of L3 cache which may be overlapping
with other 'subsets'. The different subsets are identified by class of
service (CLOS) and each CLOS has a capacity bitmask (CBM).
It can be used to handle L3 cache resource allocation for containers if
hardware and kernel support Intel RDT/CAT.
Memory Bandwidth Allocation (MBA) provides indirect and approximate throttle
over memory bandwidth for the software. A user controls the resource by
indicating the percentage of maximum memory bandwidth.
It can be used to handle L3 cache and memory bandwidth resources allocation
for containers if hardware and kernel support Intel RDT CAT and MBA features.
In Linux 4.10 kernel or newer, the interface is defined and exposed via
"resource control" filesystem, which is a "cgroup-like" interface.
@ -175,6 +179,9 @@ Comparing with cgroups, it has similar process management lifecycle and
interfaces in a container. But unlike cgroups' hierarchy, it has single level
filesystem layout.
CAT and MBA features are introduced in Linux 4.10 and 4.12 kernel via
"resource control" filesystem.
Intel RDT "resource control" filesystem hierarchy:
```
mount -t resctrl resctrl /sys/fs/resctrl
@ -182,59 +189,85 @@ tree /sys/fs/resctrl
/sys/fs/resctrl/
|-- info
| |-- L3
| |-- cbm_mask
| |-- min_cbm_bits
| | |-- cbm_mask
| | |-- min_cbm_bits
| | |-- num_closids
| |-- MB
| |-- bandwidth_gran
| |-- delay_linear
| |-- min_bandwidth
| |-- num_closids
|-- cpus
|-- ...
|-- schemata
|-- tasks
|-- <container_id>
|-- cpus
|-- ...
|-- schemata
|-- tasks
```
For runc, we can make use of `tasks` and `schemata` configuration for L3 cache
resource constraints.
For runc, we can make use of `tasks` and `schemata` configuration for L3
cache and memory bandwidth resources constraints.
The file `tasks` has a list of tasks that belongs to this group (e.g.,
<container_id>" group). Tasks can be added to a group by writing the task ID
to the "tasks" file (which will automatically remove them from the previous
to the "tasks" file (which will automatically remove them from the previous
group to which they belonged). New tasks created by fork(2) and clone(2) are
added to the same group as their parent. If a pid is not in any sub group, it
is in root group.
added to the same group as their parent.
The file `schemata` has allocation masks/values for L3 cache on each socket,
which contains L3 cache id and capacity bitmask (CBM).
The file `schemata` has a list of all the resources available to this group.
Each resource (L3 cache, memory bandwidth) has its own line and format.
L3 cache schema:
It has allocation bitmasks/values for L3 cache on each socket, which
contains L3 cache id and capacity bitmask (CBM).
```
Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
```
For example, on a two-socket machine, L3's schema line could be `L3:0=ff;1=c0`
Which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0.
For example, on a two-socket machine, the schema line could be "L3:0=ff;1=c0"
which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0.
The valid L3 cache CBM is a *contiguous bits set* and number of bits that can
be set is less than the max bit. The max bits in the CBM is varied among
supported Intel Xeon platforms. In Intel RDT "resource control" filesystem
layout, the CBM in a group should be a subset of the CBM in root. Kernel will
check if it is valid when writing. e.g., 0xfffff in root indicates the max bits
of CBM is 20 bits, which mapping to entire L3 cache capacity. Some valid CBM
values to set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc.
supported Intel CPU models. Kernel will check if it is valid when writing.
e.g., default value 0xfffff in root indicates the max bits of CBM is 20
bits, which mapping to entire L3 cache capacity. Some valid CBM values to
set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc.
For more information about Intel RDT/CAT kernel interface:
Memory bandwidth schema:
It has allocation values for memory bandwidth on each socket, which contains
L3 cache id and memory bandwidth percentage.
```
Format: "MB:<cache_id0>=bandwidth0;<cache_id1>=bandwidth1;..."
```
For example, on a two-socket machine, the schema line could be "MB:0=20;1=70"
The minimum bandwidth percentage value for each CPU model is predefined and
can be looked up through "info/MB/min_bandwidth". The bandwidth granularity
that is allocated is also dependent on the CPU model and can be looked up at
"info/MB/bandwidth_gran". The available bandwidth control steps are:
min_bw + N * bw_gran. Intermediate values are rounded to the next control
step available on the hardware.
For more information about Intel RDT kernel interface:
https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt
An example for runc:
```
An example for runc:
Consider a two-socket machine with two L3 caches where the default CBM is
0xfffff and the max CBM length is 20 bits. With this configuration, tasks
inside the container only have access to the "upper" 80% of L3 cache id 0 and
the "lower" 50% L3 cache id 1:
0x7ff and the max CBM length is 11 bits, and minimum memory bandwidth of 10%
with a memory bandwidth granularity of 10%.
Tasks inside the container only have access to the "upper" 7/11 of L3 cache
on socket 0 and the "lower" 5/11 L3 cache on socket 1, and may use a
maximum memory bandwidth of 20% on socket 0 and 70% on socket 1.
"linux": {
"intelRdt": {
"l3CacheSchema": "L3:0=ffff0;1=3ff"
}
"intelRdt": {
"closID": "guaranteed_group",
"l3CacheSchema": "L3:0=7f0;1=1f",
"memBwSchema": "MB:0=20;1=70"
}
}
```

View File

@ -12,6 +12,7 @@ go_library(
"freezer.go",
"fs_unsupported.go",
"hugetlb.go",
"kmem.go",
"memory.go",
"name.go",
"net_cls.go",
@ -29,6 +30,7 @@ go_library(
"//vendor/github.com/opencontainers/runc/libcontainer/configs:go_default_library",
"//vendor/github.com/opencontainers/runc/libcontainer/system:go_default_library",
"//vendor/github.com/opencontainers/runc/libcontainer/utils:go_default_library",
"//vendor/github.com/pkg/errors:go_default_library",
"//vendor/golang.org/x/sys/unix:go_default_library",
],
"//conditions:default": [],

View File

@ -3,7 +3,6 @@
package fs
import (
"errors"
"fmt"
"io"
"io/ioutil"
@ -14,6 +13,8 @@ import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
var (
@ -35,7 +36,7 @@ var (
HugePageSizes, _ = cgroups.GetHugePageSize()
)
var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist")
var errSubsystemDoesNotExist = fmt.Errorf("cgroup: subsystem does not exist")
type subsystemSet []subsystem
@ -62,9 +63,10 @@ type subsystem interface {
}
type Manager struct {
mu sync.Mutex
Cgroups *configs.Cgroup
Paths map[string]string
mu sync.Mutex
Cgroups *configs.Cgroup
Rootless bool // ignore permission-related errors
Paths map[string]string
}
// The absolute path to the root of the cgroup hierarchies.
@ -100,6 +102,33 @@ type cgroupData struct {
pid int
}
// isIgnorableError returns whether err is a permission error (in the loose
// sense of the word). This includes EROFS (which for an unprivileged user is
// basically a permission error) and EACCES (for similar reasons) as well as
// the normal EPERM.
func isIgnorableError(rootless bool, err error) bool {
// We do not ignore errors if we are root.
if !rootless {
return false
}
// Is it an ordinary EPERM?
if os.IsPermission(errors.Cause(err)) {
return true
}
// Try to handle other errnos.
var errno error
switch err := errors.Cause(err).(type) {
case *os.PathError:
errno = err.Err
case *os.LinkError:
errno = err.Err
case *os.SyscallError:
errno = err.Err
}
return errno == unix.EROFS || errno == unix.EPERM || errno == unix.EACCES
}
func (m *Manager) Apply(pid int) (err error) {
if m.Cgroups == nil {
return nil
@ -145,11 +174,11 @@ func (m *Manager) Apply(pid int) (err error) {
m.Paths[sys.Name()] = p
if err := sys.Apply(d); err != nil {
if os.IsPermission(err) && m.Cgroups.Path == "" {
// If we didn't set a cgroup path, then let's defer the error here
// until we know whether we have set limits or not.
// If we hadn't set limits, then it's ok that we couldn't join this cgroup, because
// it will have the same limits as its parent.
// In the case of rootless (including euid=0 in userns), where an explicit cgroup path hasn't
// been set, we don't bail on error in case of permission problems.
// Cases where limits have been set (and we couldn't create our own
// cgroup) are handled by Set.
if isIgnorableError(m.Rootless, err) && m.Cgroups.Path == "" {
delete(m.Paths, sys.Name())
continue
}
@ -207,9 +236,16 @@ func (m *Manager) Set(container *configs.Config) error {
for _, sys := range subsystems {
path := paths[sys.Name()]
if err := sys.Set(path, container.Cgroups); err != nil {
if m.Rootless && sys.Name() == "devices" {
continue
}
// When m.Rootless is true, errors from the device subsystem are ignored because it is really not expected to work.
// However, errors from other subsystems are not ignored.
// see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error"
if path == "" {
// cgroup never applied
return fmt.Errorf("cannot set limits on the %s cgroup, as the container has not joined it", sys.Name())
// We never created a path for this cgroup, so we cannot set
// limits for it (though we have already tried at this point).
return fmt.Errorf("cannot set %s limit: container could not join or create cgroup", sys.Name())
}
return err
}

View File

@ -46,11 +46,7 @@ func (s *CpuGroup) ApplyDir(path string, cgroup *configs.Cgroup, pid int) error
}
// because we are not using d.join we need to place the pid into the procs file
// unlike the other subsystems
if err := cgroups.WriteCgroupProc(path, pid); err != nil {
return err
}
return nil
return cgroups.WriteCgroupProc(path, pid)
}
func (s *CpuGroup) SetRtSched(path string, cgroup *configs.Cgroup) error {
@ -83,11 +79,7 @@ func (s *CpuGroup) Set(path string, cgroup *configs.Cgroup) error {
return err
}
}
if err := s.SetRtSched(path, cgroup); err != nil {
return err
}
return nil
return s.SetRtSched(path, cgroup)
}
func (s *CpuGroup) Remove(d *cgroupData) error {

View File

@ -77,18 +77,14 @@ func (s *CpusetGroup) ApplyDir(dir string, cgroup *configs.Cgroup, pid int) erro
// The logic is, if user specified cpuset configs, use these
// specified configs, otherwise, inherit from parent. This makes
// cpuset configs work correctly with 'cpuset.cpu_exclusive', and
// keep backward compatbility.
// keep backward compatibility.
if err := s.ensureCpusAndMems(dir, cgroup); err != nil {
return err
}
// because we are not using d.join we need to place the pid into the procs file
// unlike the other subsystems
if err := cgroups.WriteCgroupProc(dir, pid); err != nil {
return err
}
return nil
return cgroups.WriteCgroupProc(dir, pid)
}
func (s *CpusetGroup) getSubsystemSettings(parent string) (cpus []byte, mems []byte, err error) {

View File

@ -0,0 +1,55 @@
// +build linux,!nokmem
package fs
import (
"fmt"
"io/ioutil"
"os"
"path/filepath"
"strconv"
"syscall" // for Errno type only
"github.com/opencontainers/runc/libcontainer/cgroups"
"golang.org/x/sys/unix"
)
const cgroupKernelMemoryLimit = "memory.kmem.limit_in_bytes"
func EnableKernelMemoryAccounting(path string) error {
// Check if kernel memory is enabled
// We have to limit the kernel memory here as it won't be accounted at all
// until a limit is set on the cgroup and limit cannot be set once the
// cgroup has children, or if there are already tasks in the cgroup.
for _, i := range []int64{1, -1} {
if err := setKernelMemory(path, i); err != nil {
return err
}
}
return nil
}
func setKernelMemory(path string, kernelMemoryLimit int64) error {
if path == "" {
return fmt.Errorf("no such directory for %s", cgroupKernelMemoryLimit)
}
if !cgroups.PathExists(filepath.Join(path, cgroupKernelMemoryLimit)) {
// kernel memory is not enabled on the system so we should do nothing
return nil
}
if err := ioutil.WriteFile(filepath.Join(path, cgroupKernelMemoryLimit), []byte(strconv.FormatInt(kernelMemoryLimit, 10)), 0700); err != nil {
// Check if the error number returned by the syscall is "EBUSY"
// The EBUSY signal is returned on attempts to write to the
// memory.kmem.limit_in_bytes file if the cgroup has children or
// once tasks have been attached to the cgroup
if pathErr, ok := err.(*os.PathError); ok {
if errNo, ok := pathErr.Err.(syscall.Errno); ok {
if errNo == unix.EBUSY {
return fmt.Errorf("failed to set %s, because either tasks have already joined this cgroup or it has children", cgroupKernelMemoryLimit)
}
}
}
return fmt.Errorf("failed to write %v to %v: %v", kernelMemoryLimit, cgroupKernelMemoryLimit, err)
}
return nil
}

View File

@ -0,0 +1,11 @@
// +build linux,nokmem
package fs
func EnableKernelMemoryAccounting(path string) error {
return nil
}
func setKernelMemory(path string, kernelMemoryLimit int64) error {
return nil
}

View File

@ -5,23 +5,18 @@ package fs
import (
"bufio"
"fmt"
"io/ioutil"
"os"
"path/filepath"
"strconv"
"strings"
"syscall" // only for Errno
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"golang.org/x/sys/unix"
)
const (
cgroupKernelMemoryLimit = "memory.kmem.limit_in_bytes"
cgroupMemorySwapLimit = "memory.memsw.limit_in_bytes"
cgroupMemoryLimit = "memory.limit_in_bytes"
cgroupMemorySwapLimit = "memory.memsw.limit_in_bytes"
cgroupMemoryLimit = "memory.limit_in_bytes"
)
type MemoryGroup struct {
@ -67,44 +62,6 @@ func (s *MemoryGroup) Apply(d *cgroupData) (err error) {
return nil
}
func EnableKernelMemoryAccounting(path string) error {
// Check if kernel memory is enabled
// We have to limit the kernel memory here as it won't be accounted at all
// until a limit is set on the cgroup and limit cannot be set once the
// cgroup has children, or if there are already tasks in the cgroup.
for _, i := range []int64{1, -1} {
if err := setKernelMemory(path, i); err != nil {
return err
}
}
return nil
}
func setKernelMemory(path string, kernelMemoryLimit int64) error {
if path == "" {
return fmt.Errorf("no such directory for %s", cgroupKernelMemoryLimit)
}
if !cgroups.PathExists(filepath.Join(path, cgroupKernelMemoryLimit)) {
// kernel memory is not enabled on the system so we should do nothing
return nil
}
if err := ioutil.WriteFile(filepath.Join(path, cgroupKernelMemoryLimit), []byte(strconv.FormatInt(kernelMemoryLimit, 10)), 0700); err != nil {
// Check if the error number returned by the syscall is "EBUSY"
// The EBUSY signal is returned on attempts to write to the
// memory.kmem.limit_in_bytes file if the cgroup has children or
// once tasks have been attached to the cgroup
if pathErr, ok := err.(*os.PathError); ok {
if errNo, ok := pathErr.Err.(syscall.Errno); ok {
if errNo == unix.EBUSY {
return fmt.Errorf("failed to set %s, because either tasks have already joined this cgroup or it has children", cgroupKernelMemoryLimit)
}
}
}
return fmt.Errorf("failed to write %v to %v: %v", kernelMemoryLimit, cgroupKernelMemoryLimit, err)
}
return nil
}
func setMemoryAndSwap(path string, cgroup *configs.Cgroup) error {
// If the memory update is set to -1 we should also
// set swap to -1, it means unlimited memory.

View File

@ -5,6 +5,7 @@ package systemd
import (
"errors"
"fmt"
"math"
"os"
"path/filepath"
"strings"
@ -295,13 +296,19 @@ func (m *Manager) Apply(pid int) error {
// cpu.cfs_quota_us and cpu.cfs_period_us are controlled by systemd.
if c.Resources.CpuQuota != 0 && c.Resources.CpuPeriod != 0 {
cpuQuotaPerSecUSec := uint64(c.Resources.CpuQuota*1000000) / c.Resources.CpuPeriod
// systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota
// (integer percentage of CPU) internally. This means that if a fractional percent of
// CPU is indicated by Resources.CpuQuota, we need to round up to the nearest
// 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect.
if cpuQuotaPerSecUSec%10000 != 0 {
cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000
// corresponds to USEC_INFINITY in systemd
// if USEC_INFINITY is provided, CPUQuota is left unbound by systemd
// always setting a property value ensures we can apply a quota and remove it later
cpuQuotaPerSecUSec := uint64(math.MaxUint64)
if c.Resources.CpuQuota > 0 {
// systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota
// (integer percentage of CPU) internally. This means that if a fractional percent of
// CPU is indicated by Resources.CpuQuota, we need to round up to the nearest
// 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect.
cpuQuotaPerSecUSec = uint64(c.Resources.CpuQuota*1000000) / c.Resources.CpuPeriod
if cpuQuotaPerSecUSec%10000 != 0 {
cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000
}
}
properties = append(properties,
newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec))
@ -312,6 +319,12 @@ func (m *Manager) Apply(pid int) error {
newProp("BlockIOWeight", uint64(c.Resources.BlkioWeight)))
}
if c.Resources.PidsLimit > 0 {
properties = append(properties,
newProp("TasksAccounting", true),
newProp("TasksMax", uint64(c.Resources.PidsLimit)))
}
// We have to set kernel memory here, as we can't change it once
// processes have been attached to the cgroup.
if c.Resources.KernelMemory != 0 {

View File

@ -13,7 +13,7 @@ import (
"strings"
"time"
"github.com/docker/go-units"
units "github.com/docker/go-units"
)
const (
@ -103,7 +103,7 @@ func FindCgroupMountpointDir() (string, error) {
}
if postSeparatorFields[0] == "cgroup" {
// Check that the mount is properly formated.
// Check that the mount is properly formatted.
if numPostFields < 3 {
return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text)
}
@ -151,19 +151,20 @@ func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount,
Root: fields[3],
}
for _, opt := range strings.Split(fields[len(fields)-1], ",") {
if !ss[opt] {
seen, known := ss[opt]
if !known || (!all && seen) {
continue
}
ss[opt] = true
if strings.HasPrefix(opt, cgroupNamePrefix) {
m.Subsystems = append(m.Subsystems, opt[len(cgroupNamePrefix):])
} else {
m.Subsystems = append(m.Subsystems, opt)
}
if !all {
numFound++
opt = opt[len(cgroupNamePrefix):]
}
m.Subsystems = append(m.Subsystems, opt)
numFound++
}
if len(m.Subsystems) > 0 || all {
res = append(res, m)
}
res = append(res, m)
}
if err := scanner.Err(); err != nil {
return nil, err
@ -187,7 +188,7 @@ func GetCgroupMounts(all bool) ([]Mount, error) {
allMap := make(map[string]bool)
for s := range allSubsystems {
allMap[s] = true
allMap[s] = false
}
return getCgroupMountsHelper(allMap, f, all)
}
@ -262,7 +263,7 @@ func getCgroupPathHelper(subsystem, cgroup string) (string, error) {
}
// This is needed for nested containers, because in /proc/self/cgroup we
// see pathes from host, which don't exist in container.
// see paths from host, which don't exist in container.
relCgroup, err := filepath.Rel(root, cgroup)
if err != nil {
return "", err

View File

@ -141,9 +141,10 @@ type Config struct {
// OomScoreAdj specifies the adjustment to be made by the kernel when calculating oom scores
// for a process. Valid values are between the range [-1000, '1000'], where processes with
// higher scores are preferred for being killed.
// higher scores are preferred for being killed. If it is unset then we don't touch the current
// value.
// More information about kernel oom score calculation here: https://lwn.net/Articles/317814/
OomScoreAdj int `json:"oom_score_adj"`
OomScoreAdj *int `json:"oom_score_adj,omitempty"`
// UidMappings is an array of User ID mappings for User Namespaces
UidMappings []IDMap `json:"uid_mappings"`
@ -185,12 +186,19 @@ type Config struct {
// callers keyring in this case.
NoNewKeyring bool `json:"no_new_keyring"`
// Rootless specifies whether the container is a rootless container.
Rootless bool `json:"rootless"`
// IntelRdt specifies settings for Intel RDT/CAT group that the container is placed into
// to limit the resources (e.g., L3 cache) the container has available
// IntelRdt specifies settings for Intel RDT group that the container is placed into
// to limit the resources (e.g., L3 cache, memory bandwidth) the container has available
IntelRdt *IntelRdt `json:"intel_rdt,omitempty"`
// RootlessEUID is set when the runc was launched with non-zero EUID.
// Note that RootlessEUID is set to false when launched with EUID=0 in userns.
// When RootlessEUID is set, runc creates a new userns for the container.
// (config.json needs to contain userns settings)
RootlessEUID bool `json:"rootless_euid,omitempty"`
// RootlessCgroups is set when unlikely to have the full access to cgroups.
// When RootlessCgroups is set, cgroups errors are ignored.
RootlessCgroups bool `json:"rootless_cgroups,omitempty"`
}
type Hooks struct {

View File

@ -4,4 +4,8 @@ type IntelRdt struct {
// The schema for L3 cache id and capacity bitmask (CBM)
// Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
L3CacheSchema string `json:"l3_cache_schema,omitempty"`
// The schema of memory bandwidth percentage per L3 cache id
// Format: "MB:<cache_id0>=bandwidth0;<cache_id1>=bandwidth1;..."
MemBwSchema string `json:"memBwSchema,omitempty"`
}

View File

@ -2,23 +2,18 @@ package validate
import (
"fmt"
"os"
"reflect"
"strings"
"github.com/opencontainers/runc/libcontainer/configs"
)
var (
geteuid = os.Geteuid
getegid = os.Getegid
)
func (v *ConfigValidator) rootless(config *configs.Config) error {
if err := rootlessMappings(config); err != nil {
// rootlessEUID makes sure that the config can be applied when runc
// is being executed as a non-root user (euid != 0) in the current user namespace.
func (v *ConfigValidator) rootlessEUID(config *configs.Config) error {
if err := rootlessEUIDMappings(config); err != nil {
return err
}
if err := rootlessMount(config); err != nil {
if err := rootlessEUIDMount(config); err != nil {
return err
}
@ -38,11 +33,9 @@ func hasIDMapping(id int, mappings []configs.IDMap) bool {
return false
}
func rootlessMappings(config *configs.Config) error {
if euid := geteuid(); euid != 0 {
if !config.Namespaces.Contains(configs.NEWUSER) {
return fmt.Errorf("rootless containers require user namespaces")
}
func rootlessEUIDMappings(config *configs.Config) error {
if !config.Namespaces.Contains(configs.NEWUSER) {
return fmt.Errorf("rootless container requires user namespaces")
}
if len(config.UidMappings) == 0 {
@ -51,34 +44,13 @@ func rootlessMappings(config *configs.Config) error {
if len(config.GidMappings) == 0 {
return fmt.Errorf("rootless containers requires at least one GID mapping")
}
return nil
}
// cgroup verifies that the user isn't trying to set any cgroup limits or paths.
func rootlessCgroup(config *configs.Config) error {
// Nothing set at all.
if config.Cgroups == nil || config.Cgroups.Resources == nil {
return nil
}
// Used for comparing to the zero value.
left := reflect.ValueOf(*config.Cgroups.Resources)
right := reflect.Zero(left.Type())
// This is all we need to do, since specconv won't add cgroup options in
// rootless mode.
if !reflect.DeepEqual(left.Interface(), right.Interface()) {
return fmt.Errorf("cannot specify resource limits in rootless container")
}
return nil
}
// mount verifies that the user isn't trying to set up any mounts they don't have
// the rights to do. In addition, it makes sure that no mount has a `uid=` or
// `gid=` option that doesn't resolve to root.
func rootlessMount(config *configs.Config) error {
func rootlessEUIDMount(config *configs.Config) error {
// XXX: We could whitelist allowed devices at this point, but I'm not
// convinced that's a good idea. The kernel is the best arbiter of
// access control.

View File

@ -44,8 +44,8 @@ func (v *ConfigValidator) Validate(config *configs.Config) error {
if err := v.intelrdt(config); err != nil {
return err
}
if config.Rootless {
if err := v.rootless(config); err != nil {
if config.RootlessEUID {
if err := v.rootlessEUID(config); err != nil {
return err
}
}
@ -151,6 +151,16 @@ func (v *ConfigValidator) sysctl(config *configs.Config) error {
return fmt.Errorf("sysctl %q is not allowed in the hosts network namespace", s)
}
}
if config.Namespaces.Contains(configs.NEWUTS) {
switch s {
case "kernel.domainname":
// This is namespaced and there's no explicit OCI field for it.
continue
case "kernel.hostname":
// This is namespaced but there's a conflicting (dedicated) OCI field for it.
return fmt.Errorf("sysctl %q is not allowed as it conflicts with the OCI %q field", s, "hostname")
}
}
return fmt.Errorf("sysctl %q is not in a separate kernel namespace", s)
}
@ -159,11 +169,22 @@ func (v *ConfigValidator) sysctl(config *configs.Config) error {
func (v *ConfigValidator) intelrdt(config *configs.Config) error {
if config.IntelRdt != nil {
if !intelrdt.IsEnabled() {
return fmt.Errorf("intelRdt is specified in config, but Intel RDT feature is not supported or enabled")
if !intelrdt.IsCatEnabled() && !intelrdt.IsMbaEnabled() {
return fmt.Errorf("intelRdt is specified in config, but Intel RDT is not supported or enabled")
}
if config.IntelRdt.L3CacheSchema == "" {
return fmt.Errorf("intelRdt is specified in config, but intelRdt.l3CacheSchema is empty")
if !intelrdt.IsCatEnabled() && config.IntelRdt.L3CacheSchema != "" {
return fmt.Errorf("intelRdt.l3CacheSchema is specified in config, but Intel RDT/CAT is not enabled")
}
if !intelrdt.IsMbaEnabled() && config.IntelRdt.MemBwSchema != "" {
return fmt.Errorf("intelRdt.memBwSchema is specified in config, but Intel RDT/MBA is not enabled")
}
if intelrdt.IsCatEnabled() && config.IntelRdt.L3CacheSchema == "" {
return fmt.Errorf("Intel RDT/CAT is enabled and intelRdt is specified in config, but intelRdt.l3CacheSchema is empty")
}
if intelrdt.IsMbaEnabled() && config.IntelRdt.MemBwSchema == "" {
return fmt.Errorf("Intel RDT/MBA is enabled and intelRdt is specified in config, but intelRdt.memBwSchema is empty")
}
}

View File

@ -28,7 +28,6 @@ import (
"github.com/golang/protobuf/proto"
"github.com/sirupsen/logrus"
"github.com/syndtr/gocapability/capability"
"github.com/vishvananda/netlink/nl"
"golang.org/x/sys/unix"
)
@ -60,7 +59,8 @@ type State struct {
// Platform specific fields below here
// Specifies if the container was started under the rootless mode.
// Specified if the container was started under the rootless mode.
// Set to true if BaseState.Config.RootlessEUID && BaseState.Config.RootlessCgroups
Rootless bool `json:"rootless"`
// Path to all the cgroups setup for a container. Key is cgroup subsystem name
@ -225,17 +225,13 @@ func (c *linuxContainer) Set(config configs.Config) error {
func (c *linuxContainer) Start(process *Process) error {
c.m.Lock()
defer c.m.Unlock()
status, err := c.currentStatus()
if err != nil {
return err
}
if status == Stopped {
if process.Init {
if err := c.createExecFifo(); err != nil {
return err
}
}
if err := c.start(process, status == Stopped); err != nil {
if status == Stopped {
if err := c.start(process); err != nil {
if process.Init {
c.deleteExecFifo()
}
return err
@ -244,17 +240,10 @@ func (c *linuxContainer) Start(process *Process) error {
}
func (c *linuxContainer) Run(process *Process) error {
c.m.Lock()
status, err := c.currentStatus()
if err != nil {
c.m.Unlock()
return err
}
c.m.Unlock()
if err := c.Start(process); err != nil {
return err
}
if status == Stopped {
if process.Init {
return c.exec()
}
return nil
@ -335,8 +324,8 @@ type openResult struct {
err error
}
func (c *linuxContainer) start(process *Process, isInit bool) error {
parent, err := c.newParentProcess(process, isInit)
func (c *linuxContainer) start(process *Process) error {
parent, err := c.newParentProcess(process)
if err != nil {
return newSystemErrorWithCause(err, "creating new parent process")
}
@ -349,7 +338,7 @@ func (c *linuxContainer) start(process *Process, isInit bool) error {
}
// generate a timestamp indicating when the container was started
c.created = time.Now().UTC()
if isInit {
if process.Init {
c.state = &createdState{
c: c,
}
@ -411,10 +400,7 @@ func (c *linuxContainer) createExecFifo() error {
return err
}
unix.Umask(oldMask)
if err := os.Chown(fifoName, rootuid, rootgid); err != nil {
return err
}
return nil
return os.Chown(fifoName, rootuid, rootgid)
}
func (c *linuxContainer) deleteExecFifo() {
@ -439,7 +425,7 @@ func (c *linuxContainer) includeExecFifo(cmd *exec.Cmd) error {
return nil
}
func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProcess, error) {
func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) {
parentPipe, childPipe, err := utils.NewSockPair("init")
if err != nil {
return nil, newSystemErrorWithCause(err, "creating new init pipe")
@ -448,7 +434,7 @@ func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProces
if err != nil {
return nil, newSystemErrorWithCause(err, "creating new command template")
}
if !doInit {
if !p.Init {
return c.newSetnsProcess(p, cmd, parentPipe, childPipe)
}
@ -473,6 +459,7 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.
if cmd.SysProcAttr == nil {
cmd.SysProcAttr = &syscall.SysProcAttr{}
}
cmd.Env = append(cmd.Env, fmt.Sprintf("GOMAXPROCS=%s", os.Getenv("GOMAXPROCS")))
cmd.ExtraFiles = append(cmd.ExtraFiles, p.ExtraFiles...)
if p.ConsoleSocket != nil {
cmd.ExtraFiles = append(cmd.ExtraFiles, p.ConsoleSocket)
@ -533,14 +520,15 @@ func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe,
return nil, err
}
return &setnsProcess{
cmd: cmd,
cgroupPaths: c.cgroupManager.GetPaths(),
intelRdtPath: state.IntelRdtPath,
childPipe: childPipe,
parentPipe: parentPipe,
config: c.newInitConfig(p),
process: p,
bootstrapData: data,
cmd: cmd,
cgroupPaths: c.cgroupManager.GetPaths(),
rootlessCgroups: c.config.RootlessCgroups,
intelRdtPath: state.IntelRdtPath,
childPipe: childPipe,
parentPipe: parentPipe,
config: c.newInitConfig(p),
process: p,
bootstrapData: data,
}, nil
}
@ -556,7 +544,8 @@ func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
PassedFilesCount: len(process.ExtraFiles),
ContainerId: c.ID(),
NoNewPrivileges: c.config.NoNewPrivileges,
Rootless: c.config.Rootless,
RootlessEUID: c.config.RootlessEUID,
RootlessCgroups: c.config.RootlessCgroups,
AppArmorProfile: c.config.AppArmorProfile,
ProcessLabel: c.config.ProcessLabel,
Rlimits: c.config.Rlimits,
@ -624,16 +613,16 @@ func (c *linuxContainer) Resume() error {
func (c *linuxContainer) NotifyOOM() (<-chan struct{}, error) {
// XXX(cyphar): This requires cgroups.
if c.config.Rootless {
return nil, fmt.Errorf("cannot get OOM notifications from rootless container")
if c.config.RootlessCgroups {
logrus.Warn("getting OOM notifications may fail if you don't have the full access to cgroups")
}
return notifyOnOOM(c.cgroupManager.GetPaths())
}
func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) {
// XXX(cyphar): This requires cgroups.
if c.config.Rootless {
return nil, fmt.Errorf("cannot get memory pressure notifications from rootless container")
if c.config.RootlessCgroups {
logrus.Warn("getting memory pressure notifications may fail if you don't have the full access to cgroups")
}
return notifyMemoryPressure(c.cgroupManager.GetPaths(), level)
}
@ -668,7 +657,7 @@ func (c *linuxContainer) checkCriuFeatures(criuOpts *CriuOpts, rpcOpts *criurpc.
Features: criuFeat,
}
err := c.criuSwrk(nil, req, criuOpts, false)
err := c.criuSwrk(nil, req, criuOpts, false, nil)
if err != nil {
logrus.Debugf("%s", err)
return fmt.Errorf("CRIU feature check failed")
@ -781,7 +770,7 @@ func (c *linuxContainer) checkCriuVersion(minVersion int) error {
Type: &t,
}
err := c.criuSwrk(nil, req, nil, false)
err := c.criuSwrk(nil, req, nil, false, nil)
if err != nil {
return fmt.Errorf("CRIU version check failed: %s", err)
}
@ -877,12 +866,11 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
c.m.Lock()
defer c.m.Unlock()
// Checkpoint is unlikely to work if os.Geteuid() != 0 || system.RunningInUserNS().
// (CLI prints a warning)
// TODO(avagin): Figure out how to make this work nicely. CRIU 2.0 has
// support for doing unprivileged dumps, but the setup of
// rootless containers might make this complicated.
if c.config.Rootless {
return fmt.Errorf("cannot checkpoint a rootless container")
}
// criu 1.5.2 => 10502
if err := c.checkCriuVersion(10502); err != nil {
@ -939,6 +927,33 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
LazyPages: proto.Bool(criuOpts.LazyPages),
}
// If the container is running in a network namespace and has
// a path to the network namespace configured, we will dump
// that network namespace as an external namespace and we
// will expect that the namespace exists during restore.
// This basically means that CRIU will ignore the namespace
// and expect to be setup correctly.
nsPath := c.config.Namespaces.PathOf(configs.NEWNET)
if nsPath != "" {
// For this to work we need at least criu 3.11.0 => 31100.
// As there was already a successful version check we will
// not error out if it fails. runc will just behave as it used
// to do and ignore external network namespaces.
err := c.checkCriuVersion(31100)
if err == nil {
// CRIU expects the information about an external namespace
// like this: --external net[<inode>]:<key>
// This <key> is always 'extRootNetNS'.
var netns syscall.Stat_t
err = syscall.Stat(nsPath, &netns)
if err != nil {
return err
}
criuExternal := fmt.Sprintf("net[%d]:extRootNetNS", netns.Ino)
rpcOpts.External = append(rpcOpts.External, criuExternal)
}
}
fcg := c.cgroupManager.GetPaths()["freezer"]
if fcg != "" {
rpcOpts.FreezeCgroup = proto.String(fcg)
@ -1043,7 +1058,7 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
}
}
err = c.criuSwrk(nil, req, criuOpts, false)
err = c.criuSwrk(nil, req, criuOpts, false, nil)
if err != nil {
return err
}
@ -1087,11 +1102,12 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
c.m.Lock()
defer c.m.Unlock()
var extraFiles []*os.File
// Restore is unlikely to work if os.Geteuid() != 0 || system.RunningInUserNS().
// (CLI prints a warning)
// TODO(avagin): Figure out how to make this work nicely. CRIU doesn't have
// support for unprivileged restore at the moment.
if c.config.Rootless {
return fmt.Errorf("cannot restore a rootless container")
}
// criu 1.5.2 => 10502
if err := c.checkCriuVersion(10502); err != nil {
@ -1161,6 +1177,38 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
},
}
// Same as during checkpointing. If the container has a specific network namespace
// assigned to it, this now expects that the checkpoint will be restored in a
// already created network namespace.
nsPath := c.config.Namespaces.PathOf(configs.NEWNET)
if nsPath != "" {
// For this to work we need at least criu 3.11.0 => 31100.
// As there was already a successful version check we will
// not error out if it fails. runc will just behave as it used
// to do and ignore external network namespaces.
err := c.checkCriuVersion(31100)
if err == nil {
// CRIU wants the information about an existing network namespace
// like this: --inherit-fd fd[<fd>]:<key>
// The <key> needs to be the same as during checkpointing.
// We are always using 'extRootNetNS' as the key in this.
netns, err := os.Open(nsPath)
defer netns.Close()
if err != nil {
logrus.Errorf("If a specific network namespace is defined it must exist: %s", err)
return fmt.Errorf("Requested network namespace %v does not exist", nsPath)
}
inheritFd := new(criurpc.InheritFd)
inheritFd.Key = proto.String("extRootNetNS")
// The offset of four is necessary because 0, 1, 2 and 3 is already
// used by stdin, stdout, stderr, 'criu swrk' socket.
inheritFd.Fd = proto.Int32(int32(4 + len(extraFiles)))
req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd)
// All open FDs need to be transferred to CRIU via extraFiles
extraFiles = append(extraFiles, netns)
}
}
for _, m := range c.config.Mounts {
switch m.Device {
case "bind":
@ -1219,7 +1267,7 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd)
}
}
return c.criuSwrk(process, req, criuOpts, true)
return c.criuSwrk(process, req, criuOpts, true, extraFiles)
}
func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error {
@ -1249,7 +1297,7 @@ func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error {
return nil
}
func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *CriuOpts, applyCgroups bool) error {
func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *CriuOpts, applyCgroups bool, extraFiles []*os.File) error {
fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_SEQPACKET|unix.SOCK_CLOEXEC, 0)
if err != nil {
return err
@ -1290,6 +1338,9 @@ func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *
cmd.Stderr = process.Stderr
}
cmd.ExtraFiles = append(cmd.ExtraFiles, criuServer)
if extraFiles != nil {
cmd.ExtraFiles = append(cmd.ExtraFiles, extraFiles...)
}
if err := cmd.Start(); err != nil {
return err
@ -1664,7 +1715,7 @@ func (c *linuxContainer) currentState() (*State, error) {
InitProcessStartTime: startTime,
Created: c.created,
},
Rootless: c.config.Rootless,
Rootless: c.config.RootlessEUID && c.config.RootlessCgroups,
CgroupPaths: c.cgroupManager.GetPaths(),
IntelRdtPath: intelRdtPath,
NamespacePaths: make(map[configs.NamespaceType]string),
@ -1765,7 +1816,7 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Na
if !joinExistingUser {
// write uid mappings
if len(c.config.UidMappings) > 0 {
if c.config.Rootless && c.newuidmapPath != "" {
if c.config.RootlessEUID && c.newuidmapPath != "" {
r.AddData(&Bytemsg{
Type: UidmapPathAttr,
Value: []byte(c.newuidmapPath),
@ -1791,38 +1842,33 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Na
Type: GidmapAttr,
Value: b,
})
if c.config.Rootless && c.newgidmapPath != "" {
if c.config.RootlessEUID && c.newgidmapPath != "" {
r.AddData(&Bytemsg{
Type: GidmapPathAttr,
Value: []byte(c.newgidmapPath),
})
}
if requiresRootOrMappingTool(c.config) {
// check if we have CAP_SETGID to setgroup properly
pid, err := capability.NewPid(0)
if err != nil {
return nil, err
}
if !pid.Get(capability.EFFECTIVE, capability.CAP_SETGID) {
r.AddData(&Boolmsg{
Type: SetgroupAttr,
Value: true,
})
}
r.AddData(&Boolmsg{
Type: SetgroupAttr,
Value: true,
})
}
}
}
// write oom_score_adj
r.AddData(&Bytemsg{
Type: OomScoreAdjAttr,
Value: []byte(fmt.Sprintf("%d", c.config.OomScoreAdj)),
})
if c.config.OomScoreAdj != nil {
// write oom_score_adj
r.AddData(&Bytemsg{
Type: OomScoreAdjAttr,
Value: []byte(fmt.Sprintf("%d", *c.config.OomScoreAdj)),
})
}
// write rootless
r.AddData(&Boolmsg{
Type: RootlessAttr,
Value: c.config.Rootless,
Type: RootlessEUIDAttr,
Value: c.config.RootlessEUID,
})
return bytes.NewReader(r.Serialize()), nil

View File

@ -11,6 +11,7 @@ import (
"runtime/debug"
"strconv"
"github.com/cyphar/filepath-securejoin"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fs"
"github.com/opencontainers/runc/libcontainer/cgroups/systemd"
@ -59,9 +60,9 @@ func SystemdCgroups(l *LinuxFactory) error {
return nil
}
// Cgroupfs is an options func to configure a LinuxFactory to return
// containers that use the native cgroups filesystem implementation to
// create and manage cgroups.
// Cgroupfs is an options func to configure a LinuxFactory to return containers
// that use the native cgroups filesystem implementation to create and manage
// cgroups.
func Cgroupfs(l *LinuxFactory) error {
l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager {
return &fs.Manager{
@ -72,9 +73,26 @@ func Cgroupfs(l *LinuxFactory) error {
return nil
}
// RootlessCgroupfs is an options func to configure a LinuxFactory to return
// containers that use the native cgroups filesystem implementation to create
// and manage cgroups. The difference between RootlessCgroupfs and Cgroupfs is
// that RootlessCgroupfs can transparently handle permission errors that occur
// during rootless container (including euid=0 in userns) setup (while still allowing cgroup usage if
// they've been set up properly).
func RootlessCgroupfs(l *LinuxFactory) error {
l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager {
return &fs.Manager{
Cgroups: config,
Rootless: true,
Paths: paths,
}
}
return nil
}
// IntelRdtfs is an options func to configure a LinuxFactory to return
// containers that use the Intel RDT "resource control" filesystem to
// create and manage Intel Xeon platform shared resources (e.g., L3 cache).
// create and manage Intel RDT resources (e.g., L3 cache, memory bandwidth).
func IntelRdtFs(l *LinuxFactory) error {
l.NewIntelRdtManager = func(config *configs.Config, id string, path string) intelrdt.Manager {
return &intelrdt.IntelRdtManager{
@ -178,7 +196,10 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err
if err := l.Validator.Validate(config); err != nil {
return nil, newGenericError(err, ConfigInvalid)
}
containerRoot := filepath.Join(l.Root, id)
containerRoot, err := securejoin.SecureJoin(l.Root, id)
if err != nil {
return nil, err
}
if _, err := os.Stat(containerRoot); err == nil {
return nil, newGenericError(fmt.Errorf("container with id exists: %v", id), IdInUse)
} else if !os.IsNotExist(err) {
@ -201,7 +222,7 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err
newgidmapPath: l.NewgidmapPath,
cgroupManager: l.NewCgroupsManager(config.Cgroups, nil),
}
if intelrdt.IsEnabled() {
if intelrdt.IsCatEnabled() || intelrdt.IsMbaEnabled() {
c.intelRdtManager = l.NewIntelRdtManager(config, id, "")
}
c.state = &stoppedState{c: c}
@ -212,7 +233,14 @@ func (l *LinuxFactory) Load(id string) (Container, error) {
if l.Root == "" {
return nil, newGenericError(fmt.Errorf("invalid root"), ConfigInvalid)
}
containerRoot := filepath.Join(l.Root, id)
//when load, we need to check id is valid or not.
if err := l.validateID(id); err != nil {
return nil, err
}
containerRoot, err := securejoin.SecureJoin(l.Root, id)
if err != nil {
return nil, err
}
state, err := l.loadState(containerRoot, id)
if err != nil {
return nil, err
@ -240,7 +268,7 @@ func (l *LinuxFactory) Load(id string) (Container, error) {
if err := c.refreshState(); err != nil {
return nil, err
}
if intelrdt.IsEnabled() {
if intelrdt.IsCatEnabled() || intelrdt.IsMbaEnabled() {
c.intelRdtManager = l.NewIntelRdtManager(&state.Config, id, state.IntelRdtPath)
}
return c, nil
@ -322,7 +350,11 @@ func (l *LinuxFactory) StartInitialization() (err error) {
}
func (l *LinuxFactory) loadState(root, id string) (*State, error) {
f, err := os.Open(filepath.Join(root, stateFilename))
stateFilePath, err := securejoin.SecureJoin(root, stateFilename)
if err != nil {
return nil, err
}
f, err := os.Open(stateFilePath)
if err != nil {
if os.IsNotExist(err) {
return nil, newGenericError(fmt.Errorf("container %q does not exist", id), ContainerNotExists)
@ -338,7 +370,7 @@ func (l *LinuxFactory) loadState(root, id string) (*State, error) {
}
func (l *LinuxFactory) validateID(id string) error {
if !idRegex.MatchString(id) {
if !idRegex.MatchString(id) || string(os.PathSeparator)+id != utils.CleanPath(string(os.PathSeparator)+id) {
return newGenericError(fmt.Errorf("invalid id format: %v", id), InvalidIdFormat)
}

View File

@ -6,6 +6,7 @@ import (
"encoding/json"
"fmt"
"io"
"io/ioutil"
"net"
"os"
"strings"
@ -20,6 +21,7 @@ import (
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/user"
"github.com/opencontainers/runc/libcontainer/utils"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
"github.com/vishvananda/netlink"
)
@ -64,7 +66,8 @@ type initConfig struct {
CreateConsole bool `json:"create_console"`
ConsoleWidth uint16 `json:"console_width"`
ConsoleHeight uint16 `json:"console_height"`
Rootless bool `json:"rootless"`
RootlessEUID bool `json:"rootless_euid,omitempty"`
RootlessCgroups bool `json:"rootless_cgroups,omitempty"`
}
type initer interface {
@ -121,7 +124,7 @@ func finalizeNamespace(config *initConfig) error {
// inherited are marked close-on-exec so they stay out of the
// container
if err := utils.CloseExecFrom(config.PassedFilesCount + 3); err != nil {
return err
return errors.Wrap(err, "close exec fds")
}
capabilities := &configs.Capabilities{}
@ -136,20 +139,20 @@ func finalizeNamespace(config *initConfig) error {
}
// drop capabilities in bounding set before changing user
if err := w.ApplyBoundingSet(); err != nil {
return err
return errors.Wrap(err, "apply bounding set")
}
// preserve existing capabilities while we change users
if err := system.SetKeepCaps(); err != nil {
return err
return errors.Wrap(err, "set keep caps")
}
if err := setupUser(config); err != nil {
return err
return errors.Wrap(err, "setup user")
}
if err := system.ClearKeepCaps(); err != nil {
return err
return errors.Wrap(err, "clear keep caps")
}
if err := w.ApplyCaps(); err != nil {
return err
return errors.Wrap(err, "apply caps")
}
if config.Cwd != "" {
if err := unix.Chdir(config.Cwd); err != nil {
@ -217,11 +220,7 @@ func syncParentReady(pipe io.ReadWriter) error {
}
// Wait for parent to give the all-clear.
if err := readSync(pipe, procRun); err != nil {
return err
}
return nil
return readSync(pipe, procRun)
}
// syncParentHooks sends to the given pipe a JSON payload which indicates that
@ -234,11 +233,7 @@ func syncParentHooks(pipe io.ReadWriter) error {
}
// Wait for parent to give the all-clear.
if err := readSync(pipe, procResume); err != nil {
return err
}
return nil
return readSync(pipe, procResume)
}
// setupUser changes the groups, gid, and uid for the user inside the container
@ -282,7 +277,7 @@ func setupUser(config *initConfig) error {
return fmt.Errorf("cannot set gid to unmapped user in user namespace")
}
if config.Rootless {
if config.RootlessEUID {
// We cannot set any additional groups in a rootless container and thus
// we bail if the user asked us to do so. TODO: We currently can't do
// this check earlier, but if libcontainer.Process.User was typesafe
@ -298,11 +293,18 @@ func setupUser(config *initConfig) error {
return err
}
setgroups, err := ioutil.ReadFile("/proc/self/setgroups")
if err != nil && !os.IsNotExist(err) {
return err
}
// This isn't allowed in an unprivileged user namespace since Linux 3.19.
// There's nothing we can do about /etc/group entries, so we silently
// ignore setting groups here (since the user didn't explicitly ask us to
// set the group).
if !config.Rootless {
allowSupGroups := !config.RootlessEUID && strings.TrimSpace(string(setgroups)) != "deny"
if allowSupGroups {
suppGroups := append(execUser.Sgids, addGroups...)
if err := unix.Setgroups(suppGroups); err != nil {
return err

View File

@ -16,20 +16,25 @@ import (
)
/*
* About Intel RDT/CAT feature:
* About Intel RDT features:
* Intel platforms with new Xeon CPU support Resource Director Technology (RDT).
* Intel Cache Allocation Technology (CAT) is a sub-feature of RDT. Currently L3
* Cache is the only resource that is supported in RDT.
* Cache Allocation Technology (CAT) and Memory Bandwidth Allocation (MBA) are
* two sub-features of RDT.
*
* This feature provides a way for the software to restrict cache allocation to a
* defined 'subset' of L3 cache which may be overlapping with other 'subsets'.
* The different subsets are identified by class of service (CLOS) and each CLOS
* has a capacity bitmask (CBM).
* Cache Allocation Technology (CAT) provides a way for the software to restrict
* cache allocation to a defined 'subset' of L3 cache which may be overlapping
* with other 'subsets'. The different subsets are identified by class of
* service (CLOS) and each CLOS has a capacity bitmask (CBM).
*
* For more information about Intel RDT/CAT can be found in the section 17.17
* of Intel Software Developer Manual.
* Memory Bandwidth Allocation (MBA) provides indirect and approximate throttle
* over memory bandwidth for the software. A user controls the resource by
* indicating the percentage of maximum memory bandwidth.
*
* About Intel RDT/CAT kernel interface:
* More details about Intel RDT CAT and MBA can be found in the section 17.18
* of Intel Software Developer Manual:
* https://software.intel.com/en-us/articles/intel-sdm
*
* About Intel RDT kernel interface:
* In Linux 4.10 kernel or newer, the interface is defined and exposed via
* "resource control" filesystem, which is a "cgroup-like" interface.
*
@ -37,59 +42,86 @@ import (
* interfaces in a container. But unlike cgroups' hierarchy, it has single level
* filesystem layout.
*
* CAT and MBA features are introduced in Linux 4.10 and 4.12 kernel via
* "resource control" filesystem.
*
* Intel RDT "resource control" filesystem hierarchy:
* mount -t resctrl resctrl /sys/fs/resctrl
* tree /sys/fs/resctrl
* /sys/fs/resctrl/
* |-- info
* | |-- L3
* | |-- cbm_mask
* | |-- min_cbm_bits
* | | |-- cbm_mask
* | | |-- min_cbm_bits
* | | |-- num_closids
* | |-- MB
* | |-- bandwidth_gran
* | |-- delay_linear
* | |-- min_bandwidth
* | |-- num_closids
* |-- cpus
* |-- ...
* |-- schemata
* |-- tasks
* |-- <container_id>
* |-- cpus
* |-- ...
* |-- schemata
* |-- tasks
*
* For runc, we can make use of `tasks` and `schemata` configuration for L3 cache
* resource constraints.
* For runc, we can make use of `tasks` and `schemata` configuration for L3
* cache and memory bandwidth resources constraints.
*
* The file `tasks` has a list of tasks that belongs to this group (e.g.,
* The file `tasks` has a list of tasks that belongs to this group (e.g.,
* <container_id>" group). Tasks can be added to a group by writing the task ID
* to the "tasks" file (which will automatically remove them from the previous
* to the "tasks" file (which will automatically remove them from the previous
* group to which they belonged). New tasks created by fork(2) and clone(2) are
* added to the same group as their parent. If a pid is not in any sub group, it is
* in root group.
* added to the same group as their parent.
*
* The file `schemata` has allocation bitmasks/values for L3 cache on each socket,
* which contains L3 cache id and capacity bitmask (CBM).
* The file `schemata` has a list of all the resources available to this group.
* Each resource (L3 cache, memory bandwidth) has its own line and format.
*
* L3 cache schema:
* It has allocation bitmasks/values for L3 cache on each socket, which
* contains L3 cache id and capacity bitmask (CBM).
* Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
* For example, on a two-socket machine, L3's schema line could be `L3:0=ff;1=c0`
* For example, on a two-socket machine, the schema line could be "L3:0=ff;1=c0"
* which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0.
*
* The valid L3 cache CBM is a *contiguous bits set* and number of bits that can
* be set is less than the max bit. The max bits in the CBM is varied among
* supported Intel Xeon platforms. In Intel RDT "resource control" filesystem
* layout, the CBM in a group should be a subset of the CBM in root. Kernel will
* check if it is valid when writing. e.g., 0xfffff in root indicates the max bits
* of CBM is 20 bits, which mapping to entire L3 cache capacity. Some valid CBM
* values to set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc.
* supported Intel CPU models. Kernel will check if it is valid when writing.
* e.g., default value 0xfffff in root indicates the max bits of CBM is 20
* bits, which mapping to entire L3 cache capacity. Some valid CBM values to
* set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc.
*
* For more information about Intel RDT/CAT kernel interface:
* Memory bandwidth schema:
* It has allocation values for memory bandwidth on each socket, which contains
* L3 cache id and memory bandwidth percentage.
* Format: "MB:<cache_id0>=bandwidth0;<cache_id1>=bandwidth1;..."
* For example, on a two-socket machine, the schema line could be "MB:0=20;1=70"
*
* The minimum bandwidth percentage value for each CPU model is predefined and
* can be looked up through "info/MB/min_bandwidth". The bandwidth granularity
* that is allocated is also dependent on the CPU model and can be looked up at
* "info/MB/bandwidth_gran". The available bandwidth control steps are:
* min_bw + N * bw_gran. Intermediate values are rounded to the next control
* step available on the hardware.
*
* For more information about Intel RDT kernel interface:
* https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt
*
* An example for runc:
* Consider a two-socket machine with two L3 caches where the default CBM is
* 0xfffff and the max CBM length is 20 bits. With this configuration, tasks
* inside the container only have access to the "upper" 80% of L3 cache id 0 and
* the "lower" 50% L3 cache id 1:
* 0x7ff and the max CBM length is 11 bits, and minimum memory bandwidth of 10%
* with a memory bandwidth granularity of 10%.
*
* Tasks inside the container only have access to the "upper" 7/11 of L3 cache
* on socket 0 and the "lower" 5/11 L3 cache on socket 1, and may use a
* maximum memory bandwidth of 20% on socket 0 and 70% on socket 1.
*
* "linux": {
* "intelRdt": {
* "l3CacheSchema": "L3:0=ffff0;1=3ff"
* "intelRdt": {
* "l3CacheSchema": "L3:0=7f0;1=1f",
* "memBwSchema": "MB:0=20;1=70"
* }
* }
*/
@ -129,8 +161,10 @@ var (
intelRdtRoot string
intelRdtRootLock sync.Mutex
// The flag to indicate if Intel RDT is supported
isEnabled bool
// The flag to indicate if Intel RDT/CAT is enabled
isCatEnabled bool
// The flag to indicate if Intel RDT/MBA is enabled
isMbaEnabled bool
)
type intelRdtData struct {
@ -139,19 +173,35 @@ type intelRdtData struct {
pid int
}
// Check if Intel RDT is enabled in init()
// Check if Intel RDT sub-features are enabled in init()
func init() {
// 1. Check if hardware and kernel support Intel RDT/CAT feature
// "cat_l3" flag is set if supported
isFlagSet, err := parseCpuInfoFile("/proc/cpuinfo")
if !isFlagSet || err != nil {
isEnabled = false
// 1. Check if hardware and kernel support Intel RDT sub-features
// "cat_l3" flag for CAT and "mba" flag for MBA
isCatFlagSet, isMbaFlagSet, err := parseCpuInfoFile("/proc/cpuinfo")
if err != nil {
return
}
// 2. Check if Intel RDT "resource control" filesystem is mounted
// The user guarantees to mount the filesystem
isEnabled = isIntelRdtMounted()
if !isIntelRdtMounted() {
return
}
// 3. Double check if Intel RDT sub-features are available in
// "resource control" filesystem. Intel RDT sub-features can be
// selectively disabled or enabled by kernel command line
// (e.g., rdt=!l3cat,mba) in 4.14 and newer kernel
if isCatFlagSet {
if _, err := os.Stat(filepath.Join(intelRdtRoot, "info", "L3")); err == nil {
isCatEnabled = true
}
}
if isMbaFlagSet {
if _, err := os.Stat(filepath.Join(intelRdtRoot, "info", "MB")); err == nil {
isMbaEnabled = true
}
}
}
// Return the mount point path of Intel RDT "resource control" filesysem
@ -177,7 +227,7 @@ func findIntelRdtMountpointDir() (string, error) {
}
if postSeparatorFields[0] == "resctrl" {
// Check that the mount is properly formated.
// Check that the mount is properly formatted.
if numPostFields < 3 {
return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text)
}
@ -223,30 +273,40 @@ func isIntelRdtMounted() bool {
return true
}
func parseCpuInfoFile(path string) (bool, error) {
func parseCpuInfoFile(path string) (bool, bool, error) {
isCatFlagSet := false
isMbaFlagSet := false
f, err := os.Open(path)
if err != nil {
return false, err
return false, false, err
}
defer f.Close()
s := bufio.NewScanner(f)
for s.Scan() {
if err := s.Err(); err != nil {
return false, err
return false, false, err
}
text := s.Text()
flags := strings.Split(text, " ")
line := s.Text()
// "cat_l3" flag is set if Intel RDT/CAT is supported
for _, flag := range flags {
if flag == "cat_l3" {
return true, nil
// Search "cat_l3" and "mba" flags in first "flags" line
if strings.Contains(line, "flags") {
flags := strings.Split(line, " ")
// "cat_l3" flag for CAT and "mba" flag for MBA
for _, flag := range flags {
switch flag {
case "cat_l3":
isCatFlagSet = true
case "mba":
isMbaFlagSet = true
}
}
return isCatFlagSet, isMbaFlagSet, nil
}
}
return false, nil
return isCatFlagSet, isMbaFlagSet, nil
}
func parseUint(s string, base, bitSize int) (uint64, error) {
@ -292,30 +352,6 @@ func getIntelRdtParamString(path, file string) (string, error) {
return strings.TrimSpace(string(contents)), nil
}
func readTasksFile(dir string) ([]int, error) {
f, err := os.Open(filepath.Join(dir, IntelRdtTasks))
if err != nil {
return nil, err
}
defer f.Close()
var (
s = bufio.NewScanner(f)
out = []int{}
)
for s.Scan() {
if t := s.Text(); t != "" {
pid, err := strconv.Atoi(t)
if err != nil {
return nil, err
}
out = append(out, pid)
}
}
return out, nil
}
func writeFile(dir, file, data string) error {
if dir == "" {
return fmt.Errorf("no such directory for %s", file)
@ -368,6 +404,57 @@ func getL3CacheInfo() (*L3CacheInfo, error) {
return l3CacheInfo, nil
}
// Get the read-only memory bandwidth information
func getMemBwInfo() (*MemBwInfo, error) {
memBwInfo := &MemBwInfo{}
rootPath, err := getIntelRdtRoot()
if err != nil {
return memBwInfo, err
}
path := filepath.Join(rootPath, "info", "MB")
bandwidthGran, err := getIntelRdtParamUint(path, "bandwidth_gran")
if err != nil {
return memBwInfo, err
}
delayLinear, err := getIntelRdtParamUint(path, "delay_linear")
if err != nil {
return memBwInfo, err
}
minBandwidth, err := getIntelRdtParamUint(path, "min_bandwidth")
if err != nil {
return memBwInfo, err
}
numClosids, err := getIntelRdtParamUint(path, "num_closids")
if err != nil {
return memBwInfo, err
}
memBwInfo.BandwidthGran = bandwidthGran
memBwInfo.DelayLinear = delayLinear
memBwInfo.MinBandwidth = minBandwidth
memBwInfo.NumClosids = numClosids
return memBwInfo, nil
}
// Get diagnostics for last filesystem operation error from file info/last_cmd_status
func getLastCmdStatus() (string, error) {
rootPath, err := getIntelRdtRoot()
if err != nil {
return "", err
}
path := filepath.Join(rootPath, "info")
lastCmdStatus, err := getIntelRdtParamString(path, "last_cmd_status")
if err != nil {
return "", err
}
return lastCmdStatus, nil
}
// WriteIntelRdtTasks writes the specified pid into the "tasks" file
func WriteIntelRdtTasks(dir string, pid int) error {
if dir == "" {
@ -383,9 +470,14 @@ func WriteIntelRdtTasks(dir string, pid int) error {
return nil
}
// Check if Intel RDT is enabled
func IsEnabled() bool {
return isEnabled
// Check if Intel RDT/CAT is enabled
func IsCatEnabled() bool {
return isCatEnabled
}
// Check if Intel RDT/MBA is enabled
func IsMbaEnabled() bool {
return isMbaEnabled
}
// Get the 'container_id' path in Intel RDT "resource control" filesystem
@ -452,65 +544,130 @@ func (m *IntelRdtManager) GetStats() (*Stats, error) {
defer m.mu.Unlock()
stats := NewStats()
// The read-only L3 cache information
l3CacheInfo, err := getL3CacheInfo()
if err != nil {
return nil, err
}
stats.L3CacheInfo = l3CacheInfo
// The read-only L3 cache schema in root
rootPath, err := getIntelRdtRoot()
if err != nil {
return nil, err
}
// The read-only L3 cache and memory bandwidth schemata in root
tmpRootStrings, err := getIntelRdtParamString(rootPath, "schemata")
if err != nil {
return nil, err
}
// L3 cache schema is in the first line
schemaRootStrings := strings.Split(tmpRootStrings, "\n")
stats.L3CacheSchemaRoot = schemaRootStrings[0]
// The L3 cache schema in 'container_id' group
// The L3 cache and memory bandwidth schemata in 'container_id' group
tmpStrings, err := getIntelRdtParamString(m.GetPath(), "schemata")
if err != nil {
return nil, err
}
// L3 cache schema is in the first line
schemaStrings := strings.Split(tmpStrings, "\n")
stats.L3CacheSchema = schemaStrings[0]
if IsCatEnabled() {
// The read-only L3 cache information
l3CacheInfo, err := getL3CacheInfo()
if err != nil {
return nil, err
}
stats.L3CacheInfo = l3CacheInfo
// The read-only L3 cache schema in root
for _, schemaRoot := range schemaRootStrings {
if strings.Contains(schemaRoot, "L3") {
stats.L3CacheSchemaRoot = strings.TrimSpace(schemaRoot)
}
}
// The L3 cache schema in 'container_id' group
for _, schema := range schemaStrings {
if strings.Contains(schema, "L3") {
stats.L3CacheSchema = strings.TrimSpace(schema)
}
}
}
if IsMbaEnabled() {
// The read-only memory bandwidth information
memBwInfo, err := getMemBwInfo()
if err != nil {
return nil, err
}
stats.MemBwInfo = memBwInfo
// The read-only memory bandwidth information
for _, schemaRoot := range schemaRootStrings {
if strings.Contains(schemaRoot, "MB") {
stats.MemBwSchemaRoot = strings.TrimSpace(schemaRoot)
}
}
// The memory bandwidth schema in 'container_id' group
for _, schema := range schemaStrings {
if strings.Contains(schema, "MB") {
stats.MemBwSchema = strings.TrimSpace(schema)
}
}
}
return stats, nil
}
// Set Intel RDT "resource control" filesystem as configured.
func (m *IntelRdtManager) Set(container *configs.Config) error {
path := m.GetPath()
// About L3 cache schema file:
// The schema has allocation masks/values for L3 cache on each socket,
// About L3 cache schema:
// It has allocation bitmasks/values for L3 cache on each socket,
// which contains L3 cache id and capacity bitmask (CBM).
// Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
// For example, on a two-socket machine, L3's schema line could be:
// L3:0=ff;1=c0
// Which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0.
// Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
// For example, on a two-socket machine, the schema line could be:
// L3:0=ff;1=c0
// which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM
// is 0xc0.
//
// About L3 cache CBM validity:
// The valid L3 cache CBM is a *contiguous bits set* and number of
// bits that can be set is less than the max bit. The max bits in the
// CBM is varied among supported Intel Xeon platforms. In Intel RDT
// "resource control" filesystem layout, the CBM in a group should
// be a subset of the CBM in root. Kernel will check if it is valid
// when writing.
// e.g., 0xfffff in root indicates the max bits of CBM is 20 bits,
// which mapping to entire L3 cache capacity. Some valid CBM values
// to set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc.
// CBM is varied among supported Intel CPU models. Kernel will check
// if it is valid when writing. e.g., default value 0xfffff in root
// indicates the max bits of CBM is 20 bits, which mapping to entire
// L3 cache capacity. Some valid CBM values to set in a group:
// 0xf, 0xf0, 0x3ff, 0x1f00 and etc.
//
//
// About memory bandwidth schema:
// It has allocation values for memory bandwidth on each socket, which
// contains L3 cache id and memory bandwidth percentage.
// Format: "MB:<cache_id0>=bandwidth0;<cache_id1>=bandwidth1;..."
// For example, on a two-socket machine, the schema line could be:
// "MB:0=20;1=70"
//
// The minimum bandwidth percentage value for each CPU model is
// predefined and can be looked up through "info/MB/min_bandwidth".
// The bandwidth granularity that is allocated is also dependent on
// the CPU model and can be looked up at "info/MB/bandwidth_gran".
// The available bandwidth control steps are: min_bw + N * bw_gran.
// Intermediate values are rounded to the next control step available
// on the hardware.
if container.IntelRdt != nil {
path := m.GetPath()
l3CacheSchema := container.IntelRdt.L3CacheSchema
if l3CacheSchema != "" {
memBwSchema := container.IntelRdt.MemBwSchema
// Write a single joint schema string to schemata file
if l3CacheSchema != "" && memBwSchema != "" {
if err := writeFile(path, "schemata", l3CacheSchema+"\n"+memBwSchema); err != nil {
return NewLastCmdError(err)
}
}
// Write only L3 cache schema string to schemata file
if l3CacheSchema != "" && memBwSchema == "" {
if err := writeFile(path, "schemata", l3CacheSchema); err != nil {
return err
return NewLastCmdError(err)
}
}
// Write only memory bandwidth schema string to schemata file
if l3CacheSchema == "" && memBwSchema != "" {
if err := writeFile(path, "schemata", memBwSchema); err != nil {
return NewLastCmdError(err)
}
}
}
@ -521,11 +678,11 @@ func (m *IntelRdtManager) Set(container *configs.Config) error {
func (raw *intelRdtData) join(id string) (string, error) {
path := filepath.Join(raw.root, id)
if err := os.MkdirAll(path, 0755); err != nil {
return "", err
return "", NewLastCmdError(err)
}
if err := WriteIntelRdtTasks(path, raw.pid); err != nil {
return "", err
return "", NewLastCmdError(err)
}
return path, nil
}
@ -551,3 +708,23 @@ func IsNotFound(err error) bool {
_, ok := err.(*NotFoundError)
return ok
}
type LastCmdError struct {
LastCmdStatus string
Err error
}
func (e *LastCmdError) Error() string {
return fmt.Sprintf(e.Err.Error() + ", last_cmd_status: " + e.LastCmdStatus)
}
func NewLastCmdError(err error) error {
lastCmdStatus, err1 := getLastCmdStatus()
if err1 == nil {
return &LastCmdError{
LastCmdStatus: lastCmdStatus,
Err: err,
}
}
return err
}

View File

@ -8,6 +8,13 @@ type L3CacheInfo struct {
NumClosids uint64 `json:"num_closids,omitempty"`
}
type MemBwInfo struct {
BandwidthGran uint64 `json:"bandwidth_gran,omitempty"`
DelayLinear uint64 `json:"delay_linear,omitempty"`
MinBandwidth uint64 `json:"min_bandwidth,omitempty"`
NumClosids uint64 `json:"num_closids,omitempty"`
}
type Stats struct {
// The read-only L3 cache information
L3CacheInfo *L3CacheInfo `json:"l3_cache_info,omitempty"`
@ -17,6 +24,15 @@ type Stats struct {
// The L3 cache schema in 'container_id' group
L3CacheSchema string `json:"l3_cache_schema,omitempty"`
// The read-only memory bandwidth information
MemBwInfo *MemBwInfo `json:"mem_bw_info,omitempty"`
// The read-only memory bandwidth schema in root
MemBwSchemaRoot string `json:"mem_bw_schema_root,omitempty"`
// The memory bandwidth schema in 'container_id' group
MemBwSchema string `json:"mem_bw_schema,omitempty"`
}
func NewStats() *Stats {

View File

@ -8,6 +8,7 @@ go_library(
visibility = ["//visibility:public"],
deps = select({
"@io_bazel_rules_go//go/platform:linux": [
"//vendor/github.com/pkg/errors:go_default_library",
"//vendor/golang.org/x/sys/unix:go_default_library",
],
"//conditions:default": [],

View File

@ -7,6 +7,8 @@ import (
"strconv"
"strings"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
@ -15,7 +17,7 @@ type KeySerial uint32
func JoinSessionKeyring(name string) (KeySerial, error) {
sessKeyId, err := unix.KeyctlJoinSessionKeyring(name)
if err != nil {
return 0, fmt.Errorf("could not create session key: %v", err)
return 0, errors.Wrap(err, "create session key")
}
return KeySerial(sessKeyId), nil
}
@ -42,9 +44,5 @@ func ModKeyringPerm(ringId KeySerial, mask, setbits uint32) error {
perm := (uint32(perm64) & mask) | setbits
if err := unix.KeyctlSetperm(int(ringId), perm); err != nil {
return err
}
return nil
return unix.KeyctlSetperm(int(ringId), perm)
}

View File

@ -10,16 +10,16 @@ import (
// list of known message types we want to send to bootstrap program
// The number is randomly chosen to not conflict with known netlink types
const (
InitMsg uint16 = 62000
CloneFlagsAttr uint16 = 27281
NsPathsAttr uint16 = 27282
UidmapAttr uint16 = 27283
GidmapAttr uint16 = 27284
SetgroupAttr uint16 = 27285
OomScoreAdjAttr uint16 = 27286
RootlessAttr uint16 = 27287
UidmapPathAttr uint16 = 27288
GidmapPathAttr uint16 = 27289
InitMsg uint16 = 62000
CloneFlagsAttr uint16 = 27281
NsPathsAttr uint16 = 27282
UidmapAttr uint16 = 27283
GidmapAttr uint16 = 27284
SetgroupAttr uint16 = 27285
OomScoreAdjAttr uint16 = 27286
RootlessEUIDAttr uint16 = 27287
UidmapPathAttr uint16 = 27288
GidmapPathAttr uint16 = 27289
)
type Int32msg struct {

View File

@ -5,18 +5,15 @@ package libcontainer
import (
"fmt"
"io/ioutil"
"net"
"path/filepath"
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/utils"
"github.com/vishvananda/netlink"
)
var strategies = map[string]networkStrategy{
"veth": &veth{},
"loopback": &loopback{},
}
@ -103,157 +100,3 @@ func (l *loopback) attach(n *configs.Network) (err error) {
func (l *loopback) detach(n *configs.Network) (err error) {
return nil
}
// veth is a network strategy that uses a bridge and creates
// a veth pair, one that is attached to the bridge on the host and the other
// is placed inside the container's namespace
type veth struct {
}
func (v *veth) detach(n *configs.Network) (err error) {
return netlink.LinkSetMaster(&netlink.Device{LinkAttrs: netlink.LinkAttrs{Name: n.HostInterfaceName}}, nil)
}
// attach a container network interface to an external network
func (v *veth) attach(n *configs.Network) (err error) {
brl, err := netlink.LinkByName(n.Bridge)
if err != nil {
return err
}
br, ok := brl.(*netlink.Bridge)
if !ok {
return fmt.Errorf("Wrong device type %T", brl)
}
host, err := netlink.LinkByName(n.HostInterfaceName)
if err != nil {
return err
}
if err := netlink.LinkSetMaster(host, br); err != nil {
return err
}
if err := netlink.LinkSetMTU(host, n.Mtu); err != nil {
return err
}
if n.HairpinMode {
if err := netlink.LinkSetHairpin(host, true); err != nil {
return err
}
}
if err := netlink.LinkSetUp(host); err != nil {
return err
}
return nil
}
func (v *veth) create(n *network, nspid int) (err error) {
tmpName, err := v.generateTempPeerName()
if err != nil {
return err
}
n.TempVethPeerName = tmpName
if n.Bridge == "" {
return fmt.Errorf("bridge is not specified")
}
veth := &netlink.Veth{
LinkAttrs: netlink.LinkAttrs{
Name: n.HostInterfaceName,
TxQLen: n.TxQueueLen,
},
PeerName: n.TempVethPeerName,
}
if err := netlink.LinkAdd(veth); err != nil {
return err
}
defer func() {
if err != nil {
netlink.LinkDel(veth)
}
}()
if err := v.attach(&n.Network); err != nil {
return err
}
child, err := netlink.LinkByName(n.TempVethPeerName)
if err != nil {
return err
}
return netlink.LinkSetNsPid(child, nspid)
}
func (v *veth) generateTempPeerName() (string, error) {
return utils.GenerateRandomName("veth", 7)
}
func (v *veth) initialize(config *network) error {
peer := config.TempVethPeerName
if peer == "" {
return fmt.Errorf("peer is not specified")
}
child, err := netlink.LinkByName(peer)
if err != nil {
return err
}
if err := netlink.LinkSetDown(child); err != nil {
return err
}
if err := netlink.LinkSetName(child, config.Name); err != nil {
return err
}
// get the interface again after we changed the name as the index also changes.
if child, err = netlink.LinkByName(config.Name); err != nil {
return err
}
if config.MacAddress != "" {
mac, err := net.ParseMAC(config.MacAddress)
if err != nil {
return err
}
if err := netlink.LinkSetHardwareAddr(child, mac); err != nil {
return err
}
}
ip, err := netlink.ParseAddr(config.Address)
if err != nil {
return err
}
if err := netlink.AddrAdd(child, ip); err != nil {
return err
}
if config.IPv6Address != "" {
ip6, err := netlink.ParseAddr(config.IPv6Address)
if err != nil {
return err
}
if err := netlink.AddrAdd(child, ip6); err != nil {
return err
}
}
if err := netlink.LinkSetMTU(child, config.Mtu); err != nil {
return err
}
if err := netlink.LinkSetUp(child); err != nil {
return err
}
if config.Gateway != "" {
gw := net.ParseIP(config.Gateway)
if err := netlink.RouteAdd(&netlink.Route{
Scope: netlink.SCOPE_UNIVERSE,
LinkIndex: child.Attrs().Index,
Gw: gw,
}); err != nil {
return err
}
}
if config.IPv6Gateway != "" {
gw := net.ParseIP(config.IPv6Gateway)
if err := netlink.RouteAdd(&netlink.Route{
Scope: netlink.SCOPE_UNIVERSE,
LinkIndex: child.Attrs().Index,
Gw: gw,
}); err != nil {
return err
}
}
return nil
}

View File

@ -72,6 +72,9 @@ type Process struct {
// ConsoleSocket provides the masterfd console.
ConsoleSocket *os.File
// Init specifies whether the process is the first process in the container.
Init bool
ops processOperations
}

View File

@ -46,15 +46,16 @@ type parentProcess interface {
}
type setnsProcess struct {
cmd *exec.Cmd
parentPipe *os.File
childPipe *os.File
cgroupPaths map[string]string
intelRdtPath string
config *initConfig
fds []string
process *Process
bootstrapData io.Reader
cmd *exec.Cmd
parentPipe *os.File
childPipe *os.File
cgroupPaths map[string]string
rootlessCgroups bool
intelRdtPath string
config *initConfig
fds []string
process *Process
bootstrapData io.Reader
}
func (p *setnsProcess) startTime() (uint64, error) {
@ -86,7 +87,7 @@ func (p *setnsProcess) start() (err error) {
return newSystemErrorWithCause(err, "executing setns process")
}
if len(p.cgroupPaths) > 0 {
if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil {
if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil && !p.rootlessCgroups {
return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid())
}
}
@ -537,7 +538,7 @@ func (p *Process) InitializeIO(rootuid, rootgid int) (i *IO, err error) {
}
fds = append(fds, r.Fd(), w.Fd())
p.Stderr, i.Stderr = w, r
// change ownership of the pipes incase we are in a user namespace
// change ownership of the pipes in case we are in a user namespace
for _, fd := range fds {
if err := unix.Fchown(int(fd), rootuid, rootgid); err != nil {
return nil, err

View File

@ -46,6 +46,7 @@ func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig) (err error) {
return newSystemErrorWithCause(err, "preparing rootfs")
}
setupDev := needsSetupDev(config)
for _, m := range config.Mounts {
for _, precmd := range m.PremountCmds {
if err := mountCmd(precmd); err != nil {
@ -64,8 +65,6 @@ func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig) (err error) {
}
}
setupDev := needsSetupDev(config)
if setupDev {
if err := createDevices(config); err != nil {
return newSystemErrorWithCause(err, "creating device nodes")
@ -153,6 +152,26 @@ func finalizeRootfs(config *configs.Config) (err error) {
return nil
}
// /tmp has to be mounted as private to allow MS_MOVE to work in all situations
func prepareTmp(topTmpDir string) (string, error) {
tmpdir, err := ioutil.TempDir(topTmpDir, "runctop")
if err != nil {
return "", err
}
if err := unix.Mount(tmpdir, tmpdir, "bind", unix.MS_BIND, ""); err != nil {
return "", err
}
if err := unix.Mount("", tmpdir, "", uintptr(unix.MS_PRIVATE), ""); err != nil {
return "", err
}
return tmpdir, nil
}
func cleanupTmp(tmpdir string) error {
unix.Unmount(tmpdir, 0)
return os.RemoveAll(tmpdir)
}
func mountCmd(cmd configs.Command) error {
command := exec.Command(cmd.Path, cmd.Args[:]...)
command.Env = cmd.Env
@ -200,7 +219,12 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error {
}
}
if copyUp {
tmpDir, err = ioutil.TempDir("/tmp", "runctmpdir")
tmpdir, err := prepareTmp("/tmp")
if err != nil {
return newSystemErrorWithCause(err, "tmpcopyup: failed to setup tmpdir")
}
defer cleanupTmp(tmpdir)
tmpDir, err = ioutil.TempDir(tmpdir, "runctmpdir")
if err != nil {
return newSystemErrorWithCause(err, "tmpcopyup: failed to create tmpdir")
}
@ -397,6 +421,7 @@ func checkMountDestination(rootfs, dest string) error {
"/proc/stat",
"/proc/swaps",
"/proc/uptime",
"/proc/loadavg",
"/proc/net/dev",
}
for _, valid := range validDestinations {
@ -413,7 +438,7 @@ func checkMountDestination(rootfs, dest string) error {
if err != nil {
return err
}
if path == "." || !strings.HasPrefix(path, "..") {
if path != "." && !strings.HasPrefix(path, "..") {
return fmt.Errorf("%q cannot be mounted because it is located inside %q", dest, invalid)
}
}
@ -803,10 +828,7 @@ func remount(m *configs.Mount, rootfs string) error {
if !strings.HasPrefix(dest, rootfs) {
dest = filepath.Join(rootfs, dest)
}
if err := unix.Mount(m.Source, dest, m.Device, uintptr(m.Flags|unix.MS_REMOUNT), ""); err != nil {
return err
}
return nil
return unix.Mount(m.Source, dest, m.Device, uintptr(m.Flags|unix.MS_REMOUNT), "")
}
// Do the mount operation followed by additional mounts required to take care

View File

@ -5,12 +5,14 @@ package libcontainer
import (
"fmt"
"os"
"runtime"
"github.com/opencontainers/runc/libcontainer/apparmor"
"github.com/opencontainers/runc/libcontainer/keys"
"github.com/opencontainers/runc/libcontainer/seccomp"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/selinux/go-selinux/label"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
@ -28,10 +30,19 @@ func (l *linuxSetnsInit) getSessionRingName() string {
}
func (l *linuxSetnsInit) Init() error {
runtime.LockOSThread()
defer runtime.UnlockOSThread()
if !l.config.Config.NoNewKeyring {
// do not inherit the parent's session keyring
// Do not inherit the parent's session keyring.
if _, err := keys.JoinSessionKeyring(l.getSessionRingName()); err != nil {
return err
// Same justification as in standart_init_linux.go as to why we
// don't bail on ENOSYS.
//
// TODO(cyphar): And we should have logging here too.
if errors.Cause(err) != unix.ENOSYS {
return errors.Wrap(err, "join session keyring")
}
}
}
if l.config.CreateConsole {
@ -47,6 +58,10 @@ func (l *linuxSetnsInit) Init() error {
return err
}
}
if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
return err
}
defer label.SetProcessLabel("")
// Without NoNewPrivileges seccomp is a privileged operation, so we need to
// do this before dropping capabilities; otherwise do it as late as possible
// just before execve so as few syscalls take place after it as possible.
@ -61,9 +76,6 @@ func (l *linuxSetnsInit) Init() error {
if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
return err
}
if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
return err
}
// Set seccomp as close to execve as possible, so as few syscalls take
// place afterward (reducing the amount of syscalls that users need to
// enable in their seccomp profiles).

View File

@ -6,6 +6,7 @@ import (
"fmt"
"os"
"os/exec"
"runtime"
"syscall" //only for Exec
"github.com/opencontainers/runc/libcontainer/apparmor"
@ -14,6 +15,7 @@ import (
"github.com/opencontainers/runc/libcontainer/seccomp"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/selinux/go-selinux/label"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
@ -43,17 +45,31 @@ func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) {
}
func (l *linuxStandardInit) Init() error {
runtime.LockOSThread()
defer runtime.UnlockOSThread()
if !l.config.Config.NoNewKeyring {
ringname, keepperms, newperms := l.getSessionRingParams()
// Do not inherit the parent's session keyring.
sessKeyId, err := keys.JoinSessionKeyring(ringname)
if err != nil {
return err
}
// Make session keyring searcheable.
if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil {
return err
if sessKeyId, err := keys.JoinSessionKeyring(ringname); err != nil {
// If keyrings aren't supported then it is likely we are on an
// older kernel (or inside an LXC container). While we could bail,
// the security feature we are using here is best-effort (it only
// really provides marginal protection since VFS credentials are
// the only significant protection of keyrings).
//
// TODO(cyphar): Log this so people know what's going on, once we
// have proper logging in 'runc init'.
if errors.Cause(err) != unix.ENOSYS {
return errors.Wrap(err, "join session keyring")
}
} else {
// Make session keyring searcheable. If we've gotten this far we
// bail on any error -- we don't want to have a keyring with bad
// permissions.
if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil {
return errors.Wrap(err, "mod keyring permissions")
}
}
}
@ -76,7 +92,7 @@ func (l *linuxStandardInit) Init() error {
return err
}
if err := system.Setctty(); err != nil {
return err
return errors.Wrap(err, "setctty")
}
}
@ -89,46 +105,47 @@ func (l *linuxStandardInit) Init() error {
if hostname := l.config.Config.Hostname; hostname != "" {
if err := unix.Sethostname([]byte(hostname)); err != nil {
return err
return errors.Wrap(err, "sethostname")
}
}
if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
return err
}
if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
return err
return errors.Wrap(err, "apply apparmor profile")
}
for key, value := range l.config.Config.Sysctl {
if err := writeSystemProperty(key, value); err != nil {
return err
return errors.Wrapf(err, "write sysctl key %s", key)
}
}
for _, path := range l.config.Config.ReadonlyPaths {
if err := readonlyPath(path); err != nil {
return err
return errors.Wrapf(err, "readonly path %s", path)
}
}
for _, path := range l.config.Config.MaskPaths {
if err := maskPath(path, l.config.Config.MountLabel); err != nil {
return err
return errors.Wrapf(err, "mask path %s", path)
}
}
pdeath, err := system.GetParentDeathSignal()
if err != nil {
return err
return errors.Wrap(err, "get pdeath signal")
}
if l.config.NoNewPrivileges {
if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
return err
return errors.Wrap(err, "set nonewprivileges")
}
}
// Tell our parent that we're ready to Execv. This must be done before the
// Seccomp rules have been applied, because we need to be able to read and
// write to a socket.
if err := syncParentReady(l.pipe); err != nil {
return err
return errors.Wrap(err, "sync ready")
}
if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
return errors.Wrap(err, "set process label")
}
defer label.SetProcessLabel("")
// Without NoNewPrivileges seccomp is a privileged operation, so we need to
// do this before dropping capabilities; otherwise do it as late as possible
// just before execve so as few syscalls take place after it as possible.
@ -143,7 +160,7 @@ func (l *linuxStandardInit) Init() error {
// finalizeNamespace can change user/group which clears the parent death
// signal, so we restore it here.
if err := pdeath.Restore(); err != nil {
return err
return errors.Wrap(err, "restore pdeath signal")
}
// Compare the parent from the initial start of the init process and make
// sure that it did not change. if the parent changes that means it died

View File

@ -41,10 +41,7 @@ type syncT struct {
// writeSync is used to write to a synchronisation pipe. An error is returned
// if there was a problem writing the payload.
func writeSync(pipe io.Writer, sync syncType) error {
if err := utils.WriteJSON(pipe, syncT{sync}); err != nil {
return err
}
return nil
return utils.WriteJSON(pipe, syncT{sync})
}
// readSync is used to read from a synchronisation pipe. An error is returned

View File

@ -17,6 +17,41 @@ go_library(
importpath = "github.com/opencontainers/runc/libcontainer/system",
visibility = ["//visibility:public"],
deps = select({
"@io_bazel_rules_go//go/platform:android": [
"//vendor/github.com/opencontainers/runc/libcontainer/user:go_default_library",
],
"@io_bazel_rules_go//go/platform:darwin": [
"//vendor/github.com/opencontainers/runc/libcontainer/user:go_default_library",
],
"@io_bazel_rules_go//go/platform:dragonfly": [
"//vendor/github.com/opencontainers/runc/libcontainer/user:go_default_library",
],
"@io_bazel_rules_go//go/platform:freebsd": [
"//vendor/github.com/opencontainers/runc/libcontainer/user:go_default_library",
],
"@io_bazel_rules_go//go/platform:linux": [
"//vendor/github.com/opencontainers/runc/libcontainer/user:go_default_library",
],
"@io_bazel_rules_go//go/platform:nacl": [
"//vendor/github.com/opencontainers/runc/libcontainer/user:go_default_library",
],
"@io_bazel_rules_go//go/platform:netbsd": [
"//vendor/github.com/opencontainers/runc/libcontainer/user:go_default_library",
],
"@io_bazel_rules_go//go/platform:openbsd": [
"//vendor/github.com/opencontainers/runc/libcontainer/user:go_default_library",
],
"@io_bazel_rules_go//go/platform:plan9": [
"//vendor/github.com/opencontainers/runc/libcontainer/user:go_default_library",
],
"@io_bazel_rules_go//go/platform:solaris": [
"//vendor/github.com/opencontainers/runc/libcontainer/user:go_default_library",
],
"@io_bazel_rules_go//go/platform:windows": [
"//vendor/github.com/opencontainers/runc/libcontainer/user:go_default_library",
],
"//conditions:default": [],
}) + select({
"@io_bazel_rules_go//go/platform:linux_386": [
"//vendor/golang.org/x/sys/unix:go_default_library",
],

View File

@ -3,13 +3,12 @@
package system
import (
"bufio"
"fmt"
"os"
"os/exec"
"syscall" // only for exec
"unsafe"
"github.com/opencontainers/runc/libcontainer/user"
"golang.org/x/sys/unix"
)
@ -102,34 +101,43 @@ func Setctty() error {
}
// RunningInUserNS detects whether we are currently running in a user namespace.
// Copied from github.com/lxc/lxd/shared/util.go
// Originally copied from github.com/lxc/lxd/shared/util.go
func RunningInUserNS() bool {
file, err := os.Open("/proc/self/uid_map")
uidmap, err := user.CurrentProcessUIDMap()
if err != nil {
// This kernel-provided file only exists if user namespaces are supported
return false
}
defer file.Close()
return UIDMapInUserNS(uidmap)
}
buf := bufio.NewReader(file)
l, _, err := buf.ReadLine()
if err != nil {
return false
}
line := string(l)
var a, b, c int64
fmt.Sscanf(line, "%d %d %d", &a, &b, &c)
func UIDMapInUserNS(uidmap []user.IDMap) bool {
/*
* We assume we are in the initial user namespace if we have a full
* range - 4294967295 uids starting at uid 0.
*/
if a == 0 && b == 0 && c == 4294967295 {
if len(uidmap) == 1 && uidmap[0].ID == 0 && uidmap[0].ParentID == 0 && uidmap[0].Count == 4294967295 {
return false
}
return true
}
// GetParentNSeuid returns the euid within the parent user namespace
func GetParentNSeuid() int64 {
euid := int64(os.Geteuid())
uidmap, err := user.CurrentProcessUIDMap()
if err != nil {
// This kernel-provided file only exists if user namespaces are supported
return euid
}
for _, um := range uidmap {
if um.ID <= euid && euid <= um.ID+um.Count-1 {
return um.ParentID + euid - um.ID
}
}
return euid
}
// SetSubreaper sets the value i as the subreaper setting for the calling process
func SetSubreaper(i int) error {
return unix.Prctl(PR_SET_CHILD_SUBREAPER, uintptr(i), 0, 0, 0)

View File

@ -2,8 +2,26 @@
package system
import (
"os"
"github.com/opencontainers/runc/libcontainer/user"
)
// RunningInUserNS is a stub for non-Linux systems
// Always returns false
func RunningInUserNS() bool {
return false
}
// UIDMapInUserNS is a stub for non-Linux systems
// Always returns false
func UIDMapInUserNS(uidmap []user.IDMap) bool {
return false
}
// GetParentNSeuid returns the euid within the parent user namespace
// Always returns os.Geteuid on non-linux
func GetParentNSeuid() int {
return os.Geteuid()
}

View File

@ -5,6 +5,7 @@ package user
import (
"io"
"os"
"strconv"
"golang.org/x/sys/unix"
)
@ -114,3 +115,30 @@ func CurrentUser() (User, error) {
func CurrentGroup() (Group, error) {
return LookupGid(unix.Getgid())
}
func currentUserSubIDs(fileName string) ([]SubID, error) {
u, err := CurrentUser()
if err != nil {
return nil, err
}
filter := func(entry SubID) bool {
return entry.Name == u.Name || entry.Name == strconv.Itoa(u.Uid)
}
return ParseSubIDFileFilter(fileName, filter)
}
func CurrentUserSubUIDs() ([]SubID, error) {
return currentUserSubIDs("/etc/subuid")
}
func CurrentUserSubGIDs() ([]SubID, error) {
return currentUserSubIDs("/etc/subgid")
}
func CurrentProcessUIDMap() ([]IDMap, error) {
return ParseIDMapFile("/proc/self/uid_map")
}
func CurrentProcessGIDMap() ([]IDMap, error) {
return ParseIDMapFile("/proc/self/gid_map")
}

View File

@ -75,12 +75,29 @@ func groupFromOS(g *user.Group) (Group, error) {
return newGroup, nil
}
// SubID represents an entry in /etc/sub{u,g}id
type SubID struct {
Name string
SubID int64
Count int64
}
// IDMap represents an entry in /proc/PID/{u,g}id_map
type IDMap struct {
ID int64
ParentID int64
Count int64
}
func parseLine(line string, v ...interface{}) {
if line == "" {
parseParts(strings.Split(line, ":"), v...)
}
func parseParts(parts []string, v ...interface{}) {
if len(parts) == 0 {
return
}
parts := strings.Split(line, ":")
for i, p := range parts {
// Ignore cases where we don't have enough fields to populate the arguments.
// Some configuration files like to misbehave.
@ -96,6 +113,8 @@ func parseLine(line string, v ...interface{}) {
case *int:
// "numbers", with conversion errors ignored because of some misbehaving configuration files.
*e, _ = strconv.Atoi(p)
case *int64:
*e, _ = strconv.ParseInt(p, 10, 64)
case *[]string:
// Comma-separated lists.
if p != "" {
@ -105,7 +124,7 @@ func parseLine(line string, v ...interface{}) {
}
default:
// Someone goof'd when writing code using this function. Scream so they can hear us.
panic(fmt.Sprintf("parseLine only accepts {*string, *int, *[]string} as arguments! %#v is not a pointer!", e))
panic(fmt.Sprintf("parseLine only accepts {*string, *int, *int64, *[]string} as arguments! %#v is not a pointer!", e))
}
}
}
@ -479,3 +498,111 @@ func GetAdditionalGroupsPath(additionalGroups []string, groupPath string) ([]int
}
return GetAdditionalGroups(additionalGroups, group)
}
func ParseSubIDFile(path string) ([]SubID, error) {
subid, err := os.Open(path)
if err != nil {
return nil, err
}
defer subid.Close()
return ParseSubID(subid)
}
func ParseSubID(subid io.Reader) ([]SubID, error) {
return ParseSubIDFilter(subid, nil)
}
func ParseSubIDFileFilter(path string, filter func(SubID) bool) ([]SubID, error) {
subid, err := os.Open(path)
if err != nil {
return nil, err
}
defer subid.Close()
return ParseSubIDFilter(subid, filter)
}
func ParseSubIDFilter(r io.Reader, filter func(SubID) bool) ([]SubID, error) {
if r == nil {
return nil, fmt.Errorf("nil source for subid-formatted data")
}
var (
s = bufio.NewScanner(r)
out = []SubID{}
)
for s.Scan() {
if err := s.Err(); err != nil {
return nil, err
}
line := strings.TrimSpace(s.Text())
if line == "" {
continue
}
// see: man 5 subuid
p := SubID{}
parseLine(line, &p.Name, &p.SubID, &p.Count)
if filter == nil || filter(p) {
out = append(out, p)
}
}
return out, nil
}
func ParseIDMapFile(path string) ([]IDMap, error) {
r, err := os.Open(path)
if err != nil {
return nil, err
}
defer r.Close()
return ParseIDMap(r)
}
func ParseIDMap(r io.Reader) ([]IDMap, error) {
return ParseIDMapFilter(r, nil)
}
func ParseIDMapFileFilter(path string, filter func(IDMap) bool) ([]IDMap, error) {
r, err := os.Open(path)
if err != nil {
return nil, err
}
defer r.Close()
return ParseIDMapFilter(r, filter)
}
func ParseIDMapFilter(r io.Reader, filter func(IDMap) bool) ([]IDMap, error) {
if r == nil {
return nil, fmt.Errorf("nil source for idmap-formatted data")
}
var (
s = bufio.NewScanner(r)
out = []IDMap{}
)
for s.Scan() {
if err := s.Err(); err != nil {
return nil, err
}
line := strings.TrimSpace(s.Text())
if line == "" {
continue
}
// see: man 7 user_namespaces
p := IDMap{}
parseParts(strings.Fields(line), &p.ID, &p.ParentID, &p.Count)
if filter == nil || filter(p) {
out = append(out, p)
}
}
return out, nil
}

View File

@ -1,8 +1,6 @@
package utils
import (
"crypto/rand"
"encoding/hex"
"encoding/json"
"io"
"os"
@ -17,19 +15,6 @@ const (
exitSignalOffset = 128
)
// GenerateRandomName returns a new name joined with a prefix. This size
// specified is used to truncate the randomly generated value
func GenerateRandomName(prefix string, size int) (string, error) {
id := make([]byte, 32)
if _, err := io.ReadFull(rand.Reader, id); err != nil {
return "", err
}
if size > 64 {
size = 64
}
return prefix + hex.EncodeToString(id)[:size], nil
}
// ResolveRootfs ensures that the current working directory is
// not a symlink and returns the absolute path to the rootfs
func ResolveRootfs(uncleanRootfs string) (string, error) {