mirror of https://github.com/k3s-io/k3s
Merge pull request #41583 from verb/sharedpid
Automatic merge from submit-queue (batch tested with PRs 41583, 45117, 45123) Implement shared PID namespace in the dockershim **What this PR does / why we need it**: Defaults the Docker CRI to using a shared PID namespace for pods. Implements proposal in https://github.com/kubernetes/community/pull/207 tracked by #1615. //cc @dchen1107 @vishh @timstclair **Special notes for your reviewer**: none **Release note**: ```release-note Some container runtimes share a process (PID) namespace for all containers in a pod. This will become the default for Docker in a future release of Kubernetes. You can preview this functionality if running with the CRI and Docker 1.13.1 by enabling the --experimental-docker-enable-shared-pid kubelet flag. ```pull/6/head
commit
e2042bb81b
|
@ -309,6 +309,7 @@ func (c *kubeletConfiguration) addFlags(fs *pflag.FlagSet) {
|
|||
|
||||
fs.StringVar(&c.RemoteRuntimeEndpoint, "container-runtime-endpoint", c.RemoteRuntimeEndpoint, "[Experimental] The unix socket endpoint of remote runtime service. The endpoint is used only when CRI integration is enabled (--enable-cri)")
|
||||
fs.StringVar(&c.RemoteImageEndpoint, "image-service-endpoint", c.RemoteImageEndpoint, "[Experimental] The unix socket endpoint of remote image service. If not specified, it will be the same with container-runtime-endpoint by default. The endpoint is used only when CRI integration is enabled (--enable-cri)")
|
||||
fs.BoolVar(&c.DockerEnableSharedPID, "experimental-docker-enable-shared-pid", c.DockerEnableSharedPID, "[Experimental] The Container Runtime Interface (CRI) will eventually default to using a shared PID namespace for containers in a pod. Setting this flag allows previewing this behavior when running with the CRI enabled and Docker version 1.13.1 or higher.")
|
||||
|
||||
fs.BoolVar(&c.ExperimentalCheckNodeCapabilitiesBeforeMount, "experimental-check-node-capabilities-before-mount", c.ExperimentalCheckNodeCapabilitiesBeforeMount, "[Experimental] if set true, the kubelet will check the underlying node for required componenets (binaries, etc.) before performing the mount")
|
||||
|
||||
|
|
|
@ -976,7 +976,8 @@ func RunDockershim(c *componentconfig.KubeletConfiguration, dockershimRootDir st
|
|||
}
|
||||
|
||||
ds, err := dockershim.NewDockerService(dockerClient, c.SeccompProfileRoot, c.PodInfraContainerImage,
|
||||
streamingConfig, &pluginSettings, c.RuntimeCgroups, c.CgroupDriver, dockerExecHandler, dockershimRootDir)
|
||||
streamingConfig, &pluginSettings, c.RuntimeCgroups, c.CgroupDriver, dockerExecHandler, dockershimRootDir,
|
||||
!c.DockerEnableSharedPID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
|
|
@ -245,6 +245,7 @@ experimental-check-node-capabilities-before-mount
|
|||
experimental-cri
|
||||
experimental-dockershim
|
||||
experimental-dockershim-root-directory
|
||||
experimental-docker-enable-shared-pid
|
||||
experimental-fail-swap-on
|
||||
experimental-kernel-memcg-notification
|
||||
experimental-keystone-ca-file
|
||||
|
|
|
@ -481,6 +481,11 @@ type KubeletConfiguration struct {
|
|||
// This flag, if set, instructs the kubelet to keep volumes from terminated pods mounted to the node.
|
||||
// This can be useful for debugging volume related issues.
|
||||
KeepTerminatedPodVolumes bool
|
||||
// This flag, if set, enables use of a shared PID namespace for pods running in the docker CRI runtime.
|
||||
// A shared PID namespace is the only option in non-docker runtimes and is required by the CRI. The ability to
|
||||
// disable it for docker will be removed unless a compelling use case is discovered with widespread use.
|
||||
// TODO: Remove once we no longer support disabling shared PID namespace (https://issues.k8s.io/41938)
|
||||
DockerEnableSharedPID bool
|
||||
|
||||
/* following flags are meant for Node Allocatable */
|
||||
|
||||
|
|
|
@ -536,6 +536,8 @@ type KubeletConfiguration struct {
|
|||
// This flag, if set, instructs the kubelet to keep volumes from terminated pods mounted to the node.
|
||||
// This can be useful for debugging volume related issues.
|
||||
KeepTerminatedPodVolumes bool `json:"keepTerminatedPodVolumes,omitempty"`
|
||||
// This flag, if set, enables use of a shared PID namespace for pods run by the docker CRI runtime.
|
||||
DockerEnableSharedPID *bool `json:"dockerEnableSharedPID,omitempty"`
|
||||
|
||||
/* following flags are meant for Node Allocatable */
|
||||
|
||||
|
|
|
@ -436,6 +436,9 @@ func autoConvert_v1alpha1_KubeletConfiguration_To_componentconfig_KubeletConfigu
|
|||
out.ExperimentalFailSwapOn = in.ExperimentalFailSwapOn
|
||||
out.ExperimentalCheckNodeCapabilitiesBeforeMount = in.ExperimentalCheckNodeCapabilitiesBeforeMount
|
||||
out.KeepTerminatedPodVolumes = in.KeepTerminatedPodVolumes
|
||||
if err := v1.Convert_Pointer_bool_To_bool(&in.DockerEnableSharedPID, &out.DockerEnableSharedPID, s); err != nil {
|
||||
return err
|
||||
}
|
||||
out.SystemReserved = *(*componentconfig.ConfigurationMap)(unsafe.Pointer(&in.SystemReserved))
|
||||
out.KubeReserved = *(*componentconfig.ConfigurationMap)(unsafe.Pointer(&in.KubeReserved))
|
||||
out.SystemReservedCgroup = in.SystemReservedCgroup
|
||||
|
@ -637,6 +640,9 @@ func autoConvert_componentconfig_KubeletConfiguration_To_v1alpha1_KubeletConfigu
|
|||
out.ExperimentalFailSwapOn = in.ExperimentalFailSwapOn
|
||||
out.ExperimentalCheckNodeCapabilitiesBeforeMount = in.ExperimentalCheckNodeCapabilitiesBeforeMount
|
||||
out.KeepTerminatedPodVolumes = in.KeepTerminatedPodVolumes
|
||||
if err := v1.Convert_bool_To_Pointer_bool(&in.DockerEnableSharedPID, &out.DockerEnableSharedPID, s); err != nil {
|
||||
return err
|
||||
}
|
||||
out.SystemReserved = *(*map[string]string)(unsafe.Pointer(&in.SystemReserved))
|
||||
out.KubeReserved = *(*map[string]string)(unsafe.Pointer(&in.KubeReserved))
|
||||
out.SystemReservedCgroup = in.SystemReservedCgroup
|
||||
|
|
|
@ -303,6 +303,11 @@ func DeepCopy_v1alpha1_KubeletConfiguration(in interface{}, out interface{}, c *
|
|||
*out = new(bool)
|
||||
**out = **in
|
||||
}
|
||||
if in.DockerEnableSharedPID != nil {
|
||||
in, out := &in.DockerEnableSharedPID, &out.DockerEnableSharedPID
|
||||
*out = new(bool)
|
||||
**out = **in
|
||||
}
|
||||
if in.SystemReserved != nil {
|
||||
in, out := &in.SystemReserved, &out.SystemReserved
|
||||
*out = make(map[string]string)
|
||||
|
|
|
@ -163,6 +163,7 @@ func (ds *dockerService) CreateContainer(podSandboxID string, config *runtimeapi
|
|||
|
||||
// Apply security context.
|
||||
applyContainerSecurityContext(lc, podSandboxID, createConfig.Config, hc, securityOptSep)
|
||||
modifyPIDNamespaceOverrides(ds.disableSharedPID, apiVersion, hc)
|
||||
}
|
||||
|
||||
// Apply cgroupsParent derived from the sandbox config.
|
||||
|
|
|
@ -147,7 +147,7 @@ var internalLabelKeys []string = []string{containerTypeLabelKey, containerLogPat
|
|||
|
||||
// NOTE: Anything passed to DockerService should be eventually handled in another way when we switch to running the shim as a different process.
|
||||
func NewDockerService(client dockertools.DockerInterface, seccompProfileRoot string, podSandboxImage string, streamingConfig *streaming.Config,
|
||||
pluginSettings *NetworkPluginSettings, cgroupsName string, kubeCgroupDriver string, execHandler dockertools.ExecHandler, dockershimRootDir string) (DockerService, error) {
|
||||
pluginSettings *NetworkPluginSettings, cgroupsName string, kubeCgroupDriver string, execHandler dockertools.ExecHandler, dockershimRootDir string, disableSharedPID bool) (DockerService, error) {
|
||||
c := dockertools.NewInstrumentedDockerInterface(client)
|
||||
checkpointHandler, err := NewPersistentCheckpointHandler(dockershimRootDir)
|
||||
if err != nil {
|
||||
|
@ -164,6 +164,7 @@ func NewDockerService(client dockertools.DockerInterface, seccompProfileRoot str
|
|||
},
|
||||
containerManager: cm.NewContainerManager(cgroupsName, client),
|
||||
checkpointHandler: checkpointHandler,
|
||||
disableSharedPID: disableSharedPID,
|
||||
}
|
||||
|
||||
// check docker version compatibility.
|
||||
|
@ -249,6 +250,11 @@ type dockerService struct {
|
|||
// version checking for some operations. Use this cache to avoid querying
|
||||
// the docker daemon every time we need to do such checks.
|
||||
versionCache *cache.ObjectCache
|
||||
// This option provides an escape hatch to override the new default behavior for Docker under
|
||||
// the CRI to use a shared PID namespace for all pods. It is temporary and will be removed.
|
||||
// See proposals/pod-pid-namespace.md for details.
|
||||
// TODO: Remove once the escape hatch is no longer used (https://issues.k8s.io/41938)
|
||||
disableSharedPID bool
|
||||
}
|
||||
|
||||
// Version returns the runtime name, runtime version and runtime API version
|
||||
|
|
|
@ -19,7 +19,9 @@ package dockershim
|
|||
import (
|
||||
"fmt"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/blang/semver"
|
||||
dockercontainer "github.com/docker/engine-api/types/container"
|
||||
|
||||
"k8s.io/kubernetes/pkg/api/v1"
|
||||
|
@ -123,6 +125,7 @@ func modifyContainerNamespaceOptions(nsOpts *runtimeapi.NamespaceOption, sandbox
|
|||
if nsOpts != nil {
|
||||
hostNetwork = nsOpts.HostNetwork
|
||||
}
|
||||
hostConfig.PidMode = dockercontainer.PidMode(fmt.Sprintf("container:%v", sandboxID))
|
||||
modifyCommonNamespaceOptions(nsOpts, hostConfig)
|
||||
modifyHostNetworkOptionForContainer(hostNetwork, sandboxID, hostConfig)
|
||||
}
|
||||
|
@ -172,3 +175,19 @@ func modifyHostNetworkOptionForContainer(hostNetwork bool, sandboxID string, hc
|
|||
hc.UTSMode = namespaceModeHost
|
||||
}
|
||||
}
|
||||
|
||||
// modifyPIDNamespaceOverrides implements two temporary overrides for the default PID namespace sharing for Docker:
|
||||
// 1. Docker engine prior to API Version 1.24 doesn't support attaching to another container's
|
||||
// PID namespace, and it didn't stabilize until 1.26. This check can be removed when Kubernetes'
|
||||
// minimum Docker version is at least 1.13.1 (API version 1.26).
|
||||
// 2. The administrator has overridden the default behavior by means of a kubelet flag. This is an
|
||||
// "escape hatch" to return to previous behavior of isolated namespaces and should be removed once
|
||||
// no longer needed.
|
||||
func modifyPIDNamespaceOverrides(disableSharedPID bool, version *semver.Version, hc *dockercontainer.HostConfig) {
|
||||
if !strings.HasPrefix(string(hc.PidMode), "container:") {
|
||||
return
|
||||
}
|
||||
if disableSharedPID || version.LT(semver.Version{Major: 1, Minor: 26}) {
|
||||
hc.PidMode = ""
|
||||
}
|
||||
}
|
||||
|
|
|
@ -21,6 +21,7 @@ import (
|
|||
"strconv"
|
||||
"testing"
|
||||
|
||||
"github.com/blang/semver"
|
||||
dockercontainer "github.com/docker/engine-api/types/container"
|
||||
"github.com/stretchr/testify/assert"
|
||||
|
||||
|
@ -172,12 +173,14 @@ func TestModifyHostConfigAndNamespaceOptionsForContainer(t *testing.T) {
|
|||
Privileged: true,
|
||||
IpcMode: dockercontainer.IpcMode(sandboxNSMode),
|
||||
NetworkMode: dockercontainer.NetworkMode(sandboxNSMode),
|
||||
PidMode: dockercontainer.PidMode(sandboxNSMode),
|
||||
}
|
||||
setCapsHC := &dockercontainer.HostConfig{
|
||||
CapAdd: []string{"addCapA", "addCapB"},
|
||||
CapDrop: []string{"dropCapA", "dropCapB"},
|
||||
IpcMode: dockercontainer.IpcMode(sandboxNSMode),
|
||||
NetworkMode: dockercontainer.NetworkMode(sandboxNSMode),
|
||||
PidMode: dockercontainer.PidMode(sandboxNSMode),
|
||||
}
|
||||
setSELinuxHC := &dockercontainer.HostConfig{
|
||||
SecurityOpt: []string{
|
||||
|
@ -188,6 +191,7 @@ func TestModifyHostConfigAndNamespaceOptionsForContainer(t *testing.T) {
|
|||
},
|
||||
IpcMode: dockercontainer.IpcMode(sandboxNSMode),
|
||||
NetworkMode: dockercontainer.NetworkMode(sandboxNSMode),
|
||||
PidMode: dockercontainer.PidMode(sandboxNSMode),
|
||||
}
|
||||
|
||||
cases := []struct {
|
||||
|
@ -286,6 +290,7 @@ func TestModifyContainerNamespaceOptions(t *testing.T) {
|
|||
NetworkMode: dockercontainer.NetworkMode(sandboxNSMode),
|
||||
IpcMode: dockercontainer.IpcMode(sandboxNSMode),
|
||||
UTSMode: namespaceModeHost,
|
||||
PidMode: dockercontainer.PidMode(sandboxNSMode),
|
||||
},
|
||||
},
|
||||
{
|
||||
|
@ -296,6 +301,7 @@ func TestModifyContainerNamespaceOptions(t *testing.T) {
|
|||
expected: &dockercontainer.HostConfig{
|
||||
NetworkMode: dockercontainer.NetworkMode(sandboxNSMode),
|
||||
IpcMode: dockercontainer.IpcMode(sandboxNSMode),
|
||||
PidMode: dockercontainer.PidMode(sandboxNSMode),
|
||||
},
|
||||
},
|
||||
{
|
||||
|
@ -317,6 +323,63 @@ func TestModifyContainerNamespaceOptions(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestModifyContainerNamespacePIDOverride(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
disable bool
|
||||
version *semver.Version
|
||||
input, expected dockercontainer.PidMode
|
||||
}{
|
||||
{
|
||||
name: "SharedPID.Enable",
|
||||
disable: false,
|
||||
version: &semver.Version{Major: 1, Minor: 26},
|
||||
input: "container:sandbox",
|
||||
expected: "container:sandbox",
|
||||
},
|
||||
{
|
||||
name: "SharedPID.Disable",
|
||||
disable: true,
|
||||
version: &semver.Version{Major: 1, Minor: 26},
|
||||
input: "container:sandbox",
|
||||
expected: "",
|
||||
},
|
||||
{
|
||||
name: "SharedPID.OldDocker",
|
||||
disable: false,
|
||||
version: &semver.Version{Major: 1, Minor: 25},
|
||||
input: "container:sandbox",
|
||||
expected: "",
|
||||
},
|
||||
{
|
||||
name: "SharedPID.HostPid",
|
||||
disable: true,
|
||||
version: &semver.Version{Major: 1, Minor: 27},
|
||||
input: "host",
|
||||
expected: "host",
|
||||
},
|
||||
{
|
||||
name: "SharedPID.DistantFuture",
|
||||
disable: false,
|
||||
version: &semver.Version{Major: 2, Minor: 10},
|
||||
input: "container:sandbox",
|
||||
expected: "container:sandbox",
|
||||
},
|
||||
{
|
||||
name: "SharedPID.EmptyPidMode",
|
||||
disable: true,
|
||||
version: &semver.Version{Major: 1, Minor: 25},
|
||||
input: "",
|
||||
expected: "",
|
||||
},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
dockerCfg := &dockercontainer.HostConfig{PidMode: tc.input}
|
||||
modifyPIDNamespaceOverrides(tc.disable, tc.version, dockerCfg)
|
||||
assert.Equal(t, tc.expected, dockerCfg.PidMode, "[Test case %q]", tc.name)
|
||||
}
|
||||
}
|
||||
|
||||
func fullValidSecurityContext() *runtimeapi.LinuxContainerSecurityContext {
|
||||
return &runtimeapi.LinuxContainerSecurityContext{
|
||||
Privileged: true,
|
||||
|
|
|
@ -553,7 +553,8 @@ func NewMainKubelet(kubeCfg *componentconfig.KubeletConfiguration, kubeDeps *Kub
|
|||
// Create and start the CRI shim running as a grpc server.
|
||||
streamingConfig := getStreamingConfig(kubeCfg, kubeDeps)
|
||||
ds, err := dockershim.NewDockerService(klet.dockerClient, kubeCfg.SeccompProfileRoot, kubeCfg.PodInfraContainerImage,
|
||||
streamingConfig, &pluginSettings, kubeCfg.RuntimeCgroups, kubeCfg.CgroupDriver, dockerExecHandler, dockershimRootDir)
|
||||
streamingConfig, &pluginSettings, kubeCfg.RuntimeCgroups, kubeCfg.CgroupDriver, dockerExecHandler, dockershimRootDir,
|
||||
!kubeCfg.DockerEnableSharedPID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue