Merge pull request #41583 from verb/sharedpid

Automatic merge from submit-queue (batch tested with PRs 41583, 45117, 45123)

Implement shared PID namespace in the dockershim

**What this PR does / why we need it**: Defaults the Docker CRI to using a shared PID namespace for pods. Implements proposal in https://github.com/kubernetes/community/pull/207 tracked by #1615.

//cc @dchen1107 @vishh @timstclair 

**Special notes for your reviewer**: none

**Release note**:
```release-note
Some container runtimes share a process (PID) namespace for all containers in a pod. This will become the default for Docker in a future release of Kubernetes. You can preview this functionality when running with the CRI and Docker 1.13.1 or higher by enabling the --experimental-docker-enable-shared-pid kubelet flag.
```
pull/6/head
Kubernetes Submit Queue 2017-04-28 20:15:03 -07:00 committed by GitHub
commit e2042bb81b
12 changed files with 114 additions and 3 deletions

View File

@ -309,6 +309,7 @@ func (c *kubeletConfiguration) addFlags(fs *pflag.FlagSet) {
fs.StringVar(&c.RemoteRuntimeEndpoint, "container-runtime-endpoint", c.RemoteRuntimeEndpoint, "[Experimental] The unix socket endpoint of remote runtime service. The endpoint is used only when CRI integration is enabled (--enable-cri)")
fs.StringVar(&c.RemoteImageEndpoint, "image-service-endpoint", c.RemoteImageEndpoint, "[Experimental] The unix socket endpoint of remote image service. If not specified, it will be the same with container-runtime-endpoint by default. The endpoint is used only when CRI integration is enabled (--enable-cri)")
fs.BoolVar(&c.DockerEnableSharedPID, "experimental-docker-enable-shared-pid", c.DockerEnableSharedPID, "[Experimental] The Container Runtime Interface (CRI) will eventually default to using a shared PID namespace for containers in a pod. Setting this flag allows previewing this behavior when running with the CRI enabled and Docker version 1.13.1 or higher.")
fs.BoolVar(&c.ExperimentalCheckNodeCapabilitiesBeforeMount, "experimental-check-node-capabilities-before-mount", c.ExperimentalCheckNodeCapabilitiesBeforeMount, "[Experimental] if set true, the kubelet will check the underlying node for required componenets (binaries, etc.) before performing the mount")

View File

@ -976,7 +976,8 @@ func RunDockershim(c *componentconfig.KubeletConfiguration, dockershimRootDir st
}
ds, err := dockershim.NewDockerService(dockerClient, c.SeccompProfileRoot, c.PodInfraContainerImage,
streamingConfig, &pluginSettings, c.RuntimeCgroups, c.CgroupDriver, dockerExecHandler, dockershimRootDir)
streamingConfig, &pluginSettings, c.RuntimeCgroups, c.CgroupDriver, dockerExecHandler, dockershimRootDir,
!c.DockerEnableSharedPID)
if err != nil {
return err
}

View File

@ -245,6 +245,7 @@ experimental-check-node-capabilities-before-mount
experimental-cri
experimental-dockershim
experimental-dockershim-root-directory
experimental-docker-enable-shared-pid
experimental-fail-swap-on
experimental-kernel-memcg-notification
experimental-keystone-ca-file

View File

@ -481,6 +481,11 @@ type KubeletConfiguration struct {
// This flag, if set, instructs the kubelet to keep volumes from terminated pods mounted to the node.
// This can be useful for debugging volume related issues.
KeepTerminatedPodVolumes bool
// This flag, if set, enables use of a shared PID namespace for pods running in the docker CRI runtime.
// A shared PID namespace is the only option in non-docker runtimes and is required by the CRI. The ability to
// disable it for docker will be removed unless a compelling use case is discovered with widespread use.
// TODO: Remove once we no longer support disabling shared PID namespace (https://issues.k8s.io/41938)
DockerEnableSharedPID bool
/* following flags are meant for Node Allocatable */

View File

@ -536,6 +536,8 @@ type KubeletConfiguration struct {
// This flag, if set, instructs the kubelet to keep volumes from terminated pods mounted to the node.
// This can be useful for debugging volume related issues.
KeepTerminatedPodVolumes bool `json:"keepTerminatedPodVolumes,omitempty"`
// This flag, if set, enables use of a shared PID namespace for pods run by the docker CRI runtime.
DockerEnableSharedPID *bool `json:"dockerEnableSharedPID,omitempty"`
/* following flags are meant for Node Allocatable */

View File

@ -436,6 +436,9 @@ func autoConvert_v1alpha1_KubeletConfiguration_To_componentconfig_KubeletConfigu
out.ExperimentalFailSwapOn = in.ExperimentalFailSwapOn
out.ExperimentalCheckNodeCapabilitiesBeforeMount = in.ExperimentalCheckNodeCapabilitiesBeforeMount
out.KeepTerminatedPodVolumes = in.KeepTerminatedPodVolumes
if err := v1.Convert_Pointer_bool_To_bool(&in.DockerEnableSharedPID, &out.DockerEnableSharedPID, s); err != nil {
return err
}
out.SystemReserved = *(*componentconfig.ConfigurationMap)(unsafe.Pointer(&in.SystemReserved))
out.KubeReserved = *(*componentconfig.ConfigurationMap)(unsafe.Pointer(&in.KubeReserved))
out.SystemReservedCgroup = in.SystemReservedCgroup
@ -637,6 +640,9 @@ func autoConvert_componentconfig_KubeletConfiguration_To_v1alpha1_KubeletConfigu
out.ExperimentalFailSwapOn = in.ExperimentalFailSwapOn
out.ExperimentalCheckNodeCapabilitiesBeforeMount = in.ExperimentalCheckNodeCapabilitiesBeforeMount
out.KeepTerminatedPodVolumes = in.KeepTerminatedPodVolumes
if err := v1.Convert_bool_To_Pointer_bool(&in.DockerEnableSharedPID, &out.DockerEnableSharedPID, s); err != nil {
return err
}
out.SystemReserved = *(*map[string]string)(unsafe.Pointer(&in.SystemReserved))
out.KubeReserved = *(*map[string]string)(unsafe.Pointer(&in.KubeReserved))
out.SystemReservedCgroup = in.SystemReservedCgroup

View File

@ -303,6 +303,11 @@ func DeepCopy_v1alpha1_KubeletConfiguration(in interface{}, out interface{}, c *
*out = new(bool)
**out = **in
}
if in.DockerEnableSharedPID != nil {
in, out := &in.DockerEnableSharedPID, &out.DockerEnableSharedPID
*out = new(bool)
**out = **in
}
if in.SystemReserved != nil {
in, out := &in.SystemReserved, &out.SystemReserved
*out = make(map[string]string)

View File

@ -163,6 +163,7 @@ func (ds *dockerService) CreateContainer(podSandboxID string, config *runtimeapi
// Apply security context.
applyContainerSecurityContext(lc, podSandboxID, createConfig.Config, hc, securityOptSep)
modifyPIDNamespaceOverrides(ds.disableSharedPID, apiVersion, hc)
}
// Apply cgroupsParent derived from the sandbox config.

View File

@ -147,7 +147,7 @@ var internalLabelKeys []string = []string{containerTypeLabelKey, containerLogPat
// NOTE: Anything passed to DockerService should be eventually handled in another way when we switch to running the shim as a different process.
func NewDockerService(client dockertools.DockerInterface, seccompProfileRoot string, podSandboxImage string, streamingConfig *streaming.Config,
pluginSettings *NetworkPluginSettings, cgroupsName string, kubeCgroupDriver string, execHandler dockertools.ExecHandler, dockershimRootDir string) (DockerService, error) {
pluginSettings *NetworkPluginSettings, cgroupsName string, kubeCgroupDriver string, execHandler dockertools.ExecHandler, dockershimRootDir string, disableSharedPID bool) (DockerService, error) {
c := dockertools.NewInstrumentedDockerInterface(client)
checkpointHandler, err := NewPersistentCheckpointHandler(dockershimRootDir)
if err != nil {
@ -164,6 +164,7 @@ func NewDockerService(client dockertools.DockerInterface, seccompProfileRoot str
},
containerManager: cm.NewContainerManager(cgroupsName, client),
checkpointHandler: checkpointHandler,
disableSharedPID: disableSharedPID,
}
// check docker version compatibility.
@ -249,6 +250,11 @@ type dockerService struct {
// version checking for some operations. Use this cache to avoid querying
// the docker daemon every time we need to do such checks.
versionCache *cache.ObjectCache
// This option provides an escape hatch to override the new default behavior for Docker under
// the CRI to use a shared PID namespace for all pods. It is temporary and will be removed.
// See proposals/pod-pid-namespace.md for details.
// TODO: Remove once the escape hatch is no longer used (https://issues.k8s.io/41938)
disableSharedPID bool
}
// Version returns the runtime name, runtime version and runtime API version

View File

@ -19,7 +19,9 @@ package dockershim
import (
"fmt"
"strconv"
"strings"
"github.com/blang/semver"
dockercontainer "github.com/docker/engine-api/types/container"
"k8s.io/kubernetes/pkg/api/v1"
@ -123,6 +125,7 @@ func modifyContainerNamespaceOptions(nsOpts *runtimeapi.NamespaceOption, sandbox
if nsOpts != nil {
hostNetwork = nsOpts.HostNetwork
}
hostConfig.PidMode = dockercontainer.PidMode(fmt.Sprintf("container:%v", sandboxID))
modifyCommonNamespaceOptions(nsOpts, hostConfig)
modifyHostNetworkOptionForContainer(hostNetwork, sandboxID, hostConfig)
}
@ -172,3 +175,19 @@ func modifyHostNetworkOptionForContainer(hostNetwork bool, sandboxID string, hc
hc.UTSMode = namespaceModeHost
}
}
// modifyPIDNamespaceOverrides implements two temporary overrides for the default PID namespace sharing for Docker:
//  1. Docker engine prior to API Version 1.24 doesn't support attaching to another container's
//     PID namespace, and it didn't stabilize until 1.26. This check can be removed when Kubernetes'
//     minimum Docker version is at least 1.13.1 (API version 1.26).
//  2. The administrator has overridden the default behavior by means of a kubelet flag. This is an
//     "escape hatch" to return to previous behavior of isolated namespaces and should be removed once
//     no longer needed.
//
// Parameters:
//   - disableSharedPID: when true, a "container:..." PID mode is always cleared.
//   - version: the Docker API version; a nil value is treated like a version older than 1.26.
//   - hc: the host config to adjust in place; a nil value makes the call a no-op.
//
// Only a PidMode beginning with "container:" is ever modified. Any other value (for example
// "host" or the empty string) is left untouched regardless of the flag or the version.
func modifyPIDNamespaceOverrides(disableSharedPID bool, version *semver.Version, hc *dockercontainer.HostConfig) {
	// Nothing to override without a host config, or when the container is not
	// joining another container's PID namespace.
	if hc == nil || !strings.HasPrefix(string(hc.PidMode), "container:") {
		return
	}
	// Guard against a nil version before calling LT so an unknown API version falls
	// back to an isolated PID namespace instead of panicking.
	if disableSharedPID || version == nil || version.LT(semver.Version{Major: 1, Minor: 26}) {
		hc.PidMode = ""
	}
}

View File

@ -21,6 +21,7 @@ import (
"strconv"
"testing"
"github.com/blang/semver"
dockercontainer "github.com/docker/engine-api/types/container"
"github.com/stretchr/testify/assert"
@ -172,12 +173,14 @@ func TestModifyHostConfigAndNamespaceOptionsForContainer(t *testing.T) {
Privileged: true,
IpcMode: dockercontainer.IpcMode(sandboxNSMode),
NetworkMode: dockercontainer.NetworkMode(sandboxNSMode),
PidMode: dockercontainer.PidMode(sandboxNSMode),
}
setCapsHC := &dockercontainer.HostConfig{
CapAdd: []string{"addCapA", "addCapB"},
CapDrop: []string{"dropCapA", "dropCapB"},
IpcMode: dockercontainer.IpcMode(sandboxNSMode),
NetworkMode: dockercontainer.NetworkMode(sandboxNSMode),
PidMode: dockercontainer.PidMode(sandboxNSMode),
}
setSELinuxHC := &dockercontainer.HostConfig{
SecurityOpt: []string{
@ -188,6 +191,7 @@ func TestModifyHostConfigAndNamespaceOptionsForContainer(t *testing.T) {
},
IpcMode: dockercontainer.IpcMode(sandboxNSMode),
NetworkMode: dockercontainer.NetworkMode(sandboxNSMode),
PidMode: dockercontainer.PidMode(sandboxNSMode),
}
cases := []struct {
@ -286,6 +290,7 @@ func TestModifyContainerNamespaceOptions(t *testing.T) {
NetworkMode: dockercontainer.NetworkMode(sandboxNSMode),
IpcMode: dockercontainer.IpcMode(sandboxNSMode),
UTSMode: namespaceModeHost,
PidMode: dockercontainer.PidMode(sandboxNSMode),
},
},
{
@ -296,6 +301,7 @@ func TestModifyContainerNamespaceOptions(t *testing.T) {
expected: &dockercontainer.HostConfig{
NetworkMode: dockercontainer.NetworkMode(sandboxNSMode),
IpcMode: dockercontainer.IpcMode(sandboxNSMode),
PidMode: dockercontainer.PidMode(sandboxNSMode),
},
},
{
@ -317,6 +323,63 @@ func TestModifyContainerNamespaceOptions(t *testing.T) {
}
}
// TestModifyContainerNamespacePIDOverride runs modifyPIDNamespaceOverrides over a table of
// (disable flag, Docker API version, initial PidMode) combinations and checks the PidMode
// left on the host config afterwards.
func TestModifyContainerNamespacePIDOverride(t *testing.T) {
	// pidOverrideCase describes one invocation and the PidMode expected after it.
	type pidOverrideCase struct {
		name     string
		disable  bool
		version  *semver.Version
		input    dockercontainer.PidMode
		expected dockercontainer.PidMode
	}

	// PID mode used by cases where the container joins the sandbox's namespace.
	const sharedWithSandbox dockercontainer.PidMode = "container:sandbox"

	table := []pidOverrideCase{
		{
			name:     "SharedPID.Enable",
			disable:  false,
			version:  &semver.Version{Major: 1, Minor: 26},
			input:    sharedWithSandbox,
			expected: sharedWithSandbox,
		},
		{
			name:     "SharedPID.Disable",
			disable:  true,
			version:  &semver.Version{Major: 1, Minor: 26},
			input:    sharedWithSandbox,
			expected: "",
		},
		{
			name:     "SharedPID.OldDocker",
			disable:  false,
			version:  &semver.Version{Major: 1, Minor: 25},
			input:    sharedWithSandbox,
			expected: "",
		},
		{
			name:     "SharedPID.HostPid",
			disable:  true,
			version:  &semver.Version{Major: 1, Minor: 27},
			input:    "host",
			expected: "host",
		},
		{
			name:     "SharedPID.DistantFuture",
			disable:  false,
			version:  &semver.Version{Major: 2, Minor: 10},
			input:    sharedWithSandbox,
			expected: sharedWithSandbox,
		},
		{
			name:     "SharedPID.EmptyPidMode",
			disable:  true,
			version:  &semver.Version{Major: 1, Minor: 25},
			input:    "",
			expected: "",
		},
	}

	for i := range table {
		c := &table[i]
		// Each case gets a fresh host config seeded only with the input PidMode.
		hostCfg := &dockercontainer.HostConfig{PidMode: c.input}
		modifyPIDNamespaceOverrides(c.disable, c.version, hostCfg)
		assert.Equal(t, c.expected, hostCfg.PidMode, "[Test case %q]", c.name)
	}
}
func fullValidSecurityContext() *runtimeapi.LinuxContainerSecurityContext {
return &runtimeapi.LinuxContainerSecurityContext{
Privileged: true,

View File

@ -553,7 +553,8 @@ func NewMainKubelet(kubeCfg *componentconfig.KubeletConfiguration, kubeDeps *Kub
// Create and start the CRI shim running as a grpc server.
streamingConfig := getStreamingConfig(kubeCfg, kubeDeps)
ds, err := dockershim.NewDockerService(klet.dockerClient, kubeCfg.SeccompProfileRoot, kubeCfg.PodInfraContainerImage,
streamingConfig, &pluginSettings, kubeCfg.RuntimeCgroups, kubeCfg.CgroupDriver, dockerExecHandler, dockershimRootDir)
streamingConfig, &pluginSettings, kubeCfg.RuntimeCgroups, kubeCfg.CgroupDriver, dockerExecHandler, dockershimRootDir,
!kubeCfg.DockerEnableSharedPID)
if err != nil {
return nil, err
}