2019-01-12 04:58:27 +00:00
package configs
import (
"bytes"
"encoding/json"
"fmt"
"os/exec"
"time"
2021-04-14 18:11:13 +00:00
"github.com/opencontainers/runc/libcontainer/devices"
2019-01-12 04:58:27 +00:00
"github.com/opencontainers/runtime-spec/specs-go"
2020-08-10 17:43:49 +00:00
"github.com/pkg/errors"
2019-01-12 04:58:27 +00:00
"github.com/sirupsen/logrus"
)
// Rlimit describes a single resource limit. Config.Rlimits holds the limits
// that are set in the container.
type Rlimit struct {
// Type identifies which resource the limit applies to.
// NOTE(review): presumably an RLIMIT_* resource number — confirm against the caller.
Type int ` json:"type" `
// Hard is the hard limit value for the resource.
Hard uint64 ` json:"hard" `
// Soft is the soft limit value for the resource.
Soft uint64 ` json:"soft" `
}
// IDMap represents UID/GID Mappings for User Namespaces.
// Config.UidMappings and Config.GidMappings are slices of IDMap.
type IDMap struct {
// ContainerID is the ID as seen inside the container.
// NOTE(review): presumably the first ID of the mapped range — confirm.
ContainerID int ` json:"container_id" `
// HostID is the corresponding ID on the host.
// NOTE(review): presumably the first host ID of the mapped range — confirm.
HostID int ` json:"host_id" `
// Size is the size of the mapping.
// NOTE(review): presumably the number of consecutive IDs covered — confirm.
Size int ` json:"size" `
}
// Seccomp represents syscall restrictions
// By default, only the native architecture of the kernel is allowed to be used
// for syscalls. Additional architectures can be added by specifying them in
// Architectures.
type Seccomp struct {
// DefaultAction is the action taken when no rule in Syscalls matches.
DefaultAction Action ` json:"default_action" `
// Architectures lists additional architectures allowed for syscalls, beyond
// the kernel's native one.
Architectures [ ] string ` json:"architectures" `
// Syscalls holds the per-syscall rules, each carrying its own action.
Syscalls [ ] * Syscall ` json:"syscalls" `
}
// Action is taken upon rule match in Seccomp
type Action int

// Seccomp actions. Numbering starts at 1 (iota + 1), so the zero value of
// Action is not equal to any of the named actions.
const (
	Kill Action = iota + 1
	Errno
	Trap
	Allow
	Trace
	Log
)
// Operator is a comparison operator to be used when matching syscall arguments in Seccomp
type Operator int
// Comparison operators. Numbering starts at 1 (iota + 1), so the zero value
// of Operator does not correspond to any named operator.
const (
EqualTo Operator = iota + 1
NotEqualTo
GreaterThan
GreaterThanOrEqualTo
LessThan
LessThanOrEqualTo
// NOTE(review): MaskEqualTo presumably compares the masked argument against a
// value, using both Arg.Value and Arg.ValueTwo — confirm against the consumer.
MaskEqualTo
)
// Arg is a rule to match a specific syscall argument in Seccomp
type Arg struct {
// Index selects which argument of the syscall the rule applies to.
// NOTE(review): presumably zero-based — confirm.
Index uint ` json:"index" `
// Value is the operand the argument is compared against using Op.
Value uint64 ` json:"value" `
// ValueTwo is a second operand.
// NOTE(review): presumably only used by operators that need two values
// (e.g. MaskEqualTo) — confirm.
ValueTwo uint64 ` json:"value_two" `
// Op is the comparison operator applied to the argument.
Op Operator ` json:"op" `
}
// Syscall is a rule to match a syscall in Seccomp
type Syscall struct {
2020-08-10 17:43:49 +00:00
Name string ` json:"name" `
Action Action ` json:"action" `
ErrnoRet * uint ` json:"errnoRet" `
Args [ ] * Arg ` json:"args" `
2019-01-12 04:58:27 +00:00
}
// TODO Windows. Many of these fields should be factored out into those parts
// which are common across platforms, and those which are platform specific.
// Config defines configuration options for executing a process inside a contained environment.
type Config struct {
// NoPivotRoot will use MS_MOVE and a chroot to jail the process into the container's rootfs
// This is a common option when the container is running in ramdisk
NoPivotRoot bool ` json:"no_pivot_root" `
// ParentDeathSignal specifies the signal that is sent to the container's process in the case
// that the parent process dies.
ParentDeathSignal int ` json:"parent_death_signal" `
// Path to a directory containing the container's root filesystem.
Rootfs string ` json:"rootfs" `
2021-04-14 18:11:13 +00:00
// Umask is the umask to use inside of the container.
Umask * uint32 ` json:"umask" `
2019-01-12 04:58:27 +00:00
// Readonlyfs will remount the container's rootfs as readonly where only externally mounted
// bind mounts are writtable.
Readonlyfs bool ` json:"readonlyfs" `
// Specifies the mount propagation flags to be applied to /.
RootPropagation int ` json:"rootPropagation" `
// Mounts specify additional source and destination paths that will be mounted inside the container's
// rootfs and mount namespace if specified
Mounts [ ] * Mount ` json:"mounts" `
// The device nodes that should be automatically created within the container upon container start. Note, make sure that the node is marked as allowed in the cgroup as well!
2021-04-14 18:11:13 +00:00
Devices [ ] * devices . Device ` json:"devices" `
2019-01-12 04:58:27 +00:00
MountLabel string ` json:"mount_label" `
// Hostname optionally sets the container's hostname if provided
Hostname string ` json:"hostname" `
// Namespaces specifies the container's namespaces that it should setup when cloning the init process
// If a namespace is not provided that namespace is shared from the container's parent process
Namespaces Namespaces ` json:"namespaces" `
// Capabilities specify the capabilities to keep when executing the process inside the container
// All capabilities not specified will be dropped from the processes capability mask
Capabilities * Capabilities ` json:"capabilities" `
// Networks specifies the container's network setup to be created
Networks [ ] * Network ` json:"networks" `
// Routes can be specified to create entries in the route table as the container is started
Routes [ ] * Route ` json:"routes" `
// Cgroups specifies specific cgroup settings for the various subsystems that the container is
// placed into to limit the resources the container has available
Cgroups * Cgroup ` json:"cgroups" `
// AppArmorProfile specifies the profile to apply to the process running in the container and is
// change at the time the process is execed
AppArmorProfile string ` json:"apparmor_profile,omitempty" `
// ProcessLabel specifies the label to apply to the process running in the container. It is
// commonly used by selinux
ProcessLabel string ` json:"process_label,omitempty" `
// Rlimits specifies the resource limits, such as max open files, to set in the container
// If Rlimits are not set, the container will inherit rlimits from the parent process
Rlimits [ ] Rlimit ` json:"rlimits,omitempty" `
// OomScoreAdj specifies the adjustment to be made by the kernel when calculating oom scores
// for a process. Valid values are between the range [-1000, '1000'], where processes with
// higher scores are preferred for being killed. If it is unset then we don't touch the current
// value.
// More information about kernel oom score calculation here: https://lwn.net/Articles/317814/
OomScoreAdj * int ` json:"oom_score_adj,omitempty" `
// UidMappings is an array of User ID mappings for User Namespaces
UidMappings [ ] IDMap ` json:"uid_mappings" `
// GidMappings is an array of Group ID mappings for User Namespaces
GidMappings [ ] IDMap ` json:"gid_mappings" `
// MaskPaths specifies paths within the container's rootfs to mask over with a bind
// mount pointing to /dev/null as to prevent reads of the file.
MaskPaths [ ] string ` json:"mask_paths" `
// ReadonlyPaths specifies paths within the container's rootfs to remount as read-only
// so that these files prevent any writes.
ReadonlyPaths [ ] string ` json:"readonly_paths" `
// Sysctl is a map of properties and their values. It is the equivalent of using
// sysctl -w my.property.name value in Linux.
Sysctl map [ string ] string ` json:"sysctl" `
// Seccomp allows actions to be taken whenever a syscall is made within the container.
// A number of rules are given, each having an action to be taken if a syscall matches it.
// A default action to be taken if no rules match is also given.
Seccomp * Seccomp ` json:"seccomp" `
// NoNewPrivileges controls whether processes in the container can gain additional privileges.
NoNewPrivileges bool ` json:"no_new_privileges,omitempty" `
// Hooks are a collection of actions to perform at various container lifecycle events.
// CommandHooks are serialized to JSON, but other hooks are not.
2020-08-10 17:43:49 +00:00
Hooks Hooks
2019-01-12 04:58:27 +00:00
// Version is the version of opencontainer specification that is supported.
Version string ` json:"version" `
// Labels are user defined metadata that is stored in the config and populated on the state
Labels [ ] string ` json:"labels" `
// NoNewKeyring will not allocated a new session keyring for the container. It will use the
// callers keyring in this case.
NoNewKeyring bool ` json:"no_new_keyring" `
// IntelRdt specifies settings for Intel RDT group that the container is placed into
// to limit the resources (e.g., L3 cache, memory bandwidth) the container has available
IntelRdt * IntelRdt ` json:"intel_rdt,omitempty" `
// RootlessEUID is set when the runc was launched with non-zero EUID.
// Note that RootlessEUID is set to false when launched with EUID=0 in userns.
// When RootlessEUID is set, runc creates a new userns for the container.
// (config.json needs to contain userns settings)
RootlessEUID bool ` json:"rootless_euid,omitempty" `
// RootlessCgroups is set when unlikely to have the full access to cgroups.
// When RootlessCgroups is set, cgroups errors are ignored.
RootlessCgroups bool ` json:"rootless_cgroups,omitempty" `
}
2020-08-10 17:43:49 +00:00
// HookName identifies a container lifecycle hook point (for example
// "prestart" or "poststop").
type HookName string
// HookList is an ordered list of hooks; RunHooks runs them in slice order.
type HookList [ ] Hook
// Hooks maps each hook point to the list of hooks registered for it.
type Hooks map [ HookName ] HookList
const (
2019-01-12 04:58:27 +00:00
// Prestart commands are executed after the container namespaces are created,
// but before the user supplied command is executed from init.
2020-08-10 17:43:49 +00:00
// Note: This hook is now deprecated
// Prestart commands are called in the Runtime namespace.
Prestart HookName = "prestart"
// CreateRuntime commands MUST be called as part of the create operation after
// the runtime environment has been created but before the pivot_root has been executed.
// CreateRuntime is called immediately after the deprecated Prestart hook.
// CreateRuntime commands are called in the Runtime Namespace.
2021-04-14 18:11:13 +00:00
CreateRuntime HookName = "createRuntime"
2020-08-10 17:43:49 +00:00
// CreateContainer commands MUST be called as part of the create operation after
// the runtime environment has been created but before the pivot_root has been executed.
// CreateContainer commands are called in the Container namespace.
2021-04-14 18:11:13 +00:00
CreateContainer HookName = "createContainer"
2020-08-10 17:43:49 +00:00
// StartContainer commands MUST be called as part of the start operation and before
// the container process is started.
// StartContainer commands are called in the Container namespace.
2021-04-14 18:11:13 +00:00
StartContainer HookName = "startContainer"
2019-01-12 04:58:27 +00:00
// Poststart commands are executed after the container init process starts.
2020-08-10 17:43:49 +00:00
// Poststart commands are called in the Runtime Namespace.
2021-04-14 18:11:13 +00:00
Poststart HookName = "poststart"
2019-01-12 04:58:27 +00:00
// Poststop commands are executed after the container init process exits.
2020-08-10 17:43:49 +00:00
// Poststop commands are called in the Runtime Namespace.
2021-04-14 18:11:13 +00:00
Poststop HookName = "poststop"
2020-08-10 17:43:49 +00:00
)
2019-01-12 04:58:27 +00:00
// Capabilities lists, per capability set, the capabilities it contains (as
// strings). Config.Capabilities uses it to describe which capabilities are
// kept for the process executed inside the container.
type Capabilities struct {
// Bounding is the set of capabilities checked by the kernel.
// NOTE(review): this description is identical to Effective's and may be a
// copy-paste slip — confirm the intended wording for the bounding set.
Bounding [ ] string
// Effective is the set of capabilities checked by the kernel.
Effective [ ] string
// Inheritable is the capabilities preserved across execve.
Inheritable [ ] string
// Permitted is the limiting superset for effective capabilities.
Permitted [ ] string
// Ambient is the ambient set of capabilities that are kept.
Ambient [ ] string
}
2020-08-10 17:43:49 +00:00
func ( hooks HookList ) RunHooks ( state * specs . State ) error {
for i , h := range hooks {
if err := h . Run ( state ) ; err != nil {
return errors . Wrapf ( err , "Running hook #%d:" , i )
}
2019-01-12 04:58:27 +00:00
}
2020-08-10 17:43:49 +00:00
return nil
}
func ( hooks * Hooks ) UnmarshalJSON ( b [ ] byte ) error {
var state map [ HookName ] [ ] CommandHook
2019-01-12 04:58:27 +00:00
if err := json . Unmarshal ( b , & state ) ; err != nil {
return err
}
2020-08-10 17:43:49 +00:00
* hooks = Hooks { }
for n , commandHooks := range state {
if len ( commandHooks ) == 0 {
continue
2019-01-12 04:58:27 +00:00
}
2020-08-10 17:43:49 +00:00
( * hooks ) [ n ] = HookList { }
for _ , h := range commandHooks {
( * hooks ) [ n ] = append ( ( * hooks ) [ n ] , h )
}
2019-01-12 04:58:27 +00:00
}
return nil
}
2020-08-10 17:43:49 +00:00
func ( hooks * Hooks ) MarshalJSON ( ) ( [ ] byte , error ) {
2019-01-12 04:58:27 +00:00
serialize := func ( hooks [ ] Hook ) ( serializableHooks [ ] CommandHook ) {
for _ , hook := range hooks {
switch chook := hook . ( type ) {
case CommandHook :
serializableHooks = append ( serializableHooks , chook )
default :
logrus . Warnf ( "cannot serialize hook of type %T, skipping" , hook )
}
}
return serializableHooks
}
return json . Marshal ( map [ string ] interface { } {
2020-08-10 17:43:49 +00:00
"prestart" : serialize ( ( * hooks ) [ Prestart ] ) ,
"createRuntime" : serialize ( ( * hooks ) [ CreateRuntime ] ) ,
"createContainer" : serialize ( ( * hooks ) [ CreateContainer ] ) ,
"startContainer" : serialize ( ( * hooks ) [ StartContainer ] ) ,
"poststart" : serialize ( ( * hooks ) [ Poststart ] ) ,
"poststop" : serialize ( ( * hooks ) [ Poststop ] ) ,
2019-01-12 04:58:27 +00:00
} )
}
type Hook interface {
// Run executes the hook with the provided state.
2019-09-27 21:51:53 +00:00
Run ( * specs . State ) error
2019-01-12 04:58:27 +00:00
}
// NewFunctionHook will call the provided function when the hook is run.
2019-09-27 21:51:53 +00:00
func NewFunctionHook ( f func ( * specs . State ) error ) FuncHook {
2019-01-12 04:58:27 +00:00
return FuncHook {
run : f ,
}
}
type FuncHook struct {
2019-09-27 21:51:53 +00:00
run func ( * specs . State ) error
2019-01-12 04:58:27 +00:00
}
2019-09-27 21:51:53 +00:00
func ( f FuncHook ) Run ( s * specs . State ) error {
2019-01-12 04:58:27 +00:00
return f . run ( s )
}
// Command describes an external program that is executed as a hook (see
// Command.Run).
type Command struct {
// Path is the path of the program to execute; Run passes it to exec.Cmd.Path.
Path string ` json:"path" `
// Args is passed unchanged to exec.Cmd.Args.
// NOTE(review): per os/exec, Args holds the command name as Args[0] —
// confirm callers populate it that way.
Args [ ] string ` json:"args" `
// Env is the environment of the hook process (exec.Cmd.Env).
Env [ ] string ` json:"env" `
// Dir is serialized with the command.
// NOTE(review): Run does not pass Dir to exec.Cmd, so it currently has no
// effect on the hook's working directory — confirm whether that is intended.
Dir string ` json:"dir" `
// Timeout, when non-nil, bounds how long Run waits for the hook before
// killing it; nil means no timeout.
Timeout * time . Duration ` json:"timeout" `
}
// NewCommandHook wraps cmd in a CommandHook; running the hook executes the
// command.
func NewCommandHook(cmd Command) CommandHook {
	var hook CommandHook
	hook.Command = cmd
	return hook
}
// CommandHook is a Hook backed by an external command. It embeds Command and
// thereby gets its Run method. Hooks.MarshalJSON serializes only hooks of
// this type.
type CommandHook struct {
Command
}
2019-09-27 21:51:53 +00:00
func ( c Command ) Run ( s * specs . State ) error {
2019-01-12 04:58:27 +00:00
b , err := json . Marshal ( s )
if err != nil {
return err
}
var stdout , stderr bytes . Buffer
cmd := exec . Cmd {
Path : c . Path ,
Args : c . Args ,
Env : c . Env ,
Stdin : bytes . NewReader ( b ) ,
Stdout : & stdout ,
Stderr : & stderr ,
}
if err := cmd . Start ( ) ; err != nil {
return err
}
errC := make ( chan error , 1 )
go func ( ) {
err := cmd . Wait ( )
if err != nil {
err = fmt . Errorf ( "error running hook: %v, stdout: %s, stderr: %s" , err , stdout . String ( ) , stderr . String ( ) )
}
errC <- err
} ( )
var timerCh <- chan time . Time
if c . Timeout != nil {
timer := time . NewTimer ( * c . Timeout )
defer timer . Stop ( )
timerCh = timer . C
}
select {
case err := <- errC :
return err
case <- timerCh :
cmd . Process . Kill ( )
2021-04-14 18:11:13 +00:00
<- errC
2019-01-12 04:58:27 +00:00
return fmt . Errorf ( "hook ran past specified timeout of %.1fs" , c . Timeout . Seconds ( ) )
}
}