mirror of https://github.com/k3s-io/k3s
Merge pull request #31169 from pweil-/userns-experimental
Automatic merge from submit-queue Default host user namespace via experimental flag @vishh @ncdc @pmorie @smarterclayton @thockin Initial thought on the implementation https://github.com/kubernetes/kubernetes/pull/30684#issuecomment-241523425 wasn't quite right. Since we need to dereference a PVC in some cases the defaulting code didn't fit nicely in the docker manager code (would've coupled it with a kube client and would've been messy). I think passing this in via the runtime config turned out cleaner. PTALpull/6/head
commit
c76fe8dcda
|
@ -434,6 +434,12 @@ type RunContainerOptions struct {
|
|||
ReadOnly bool
|
||||
// hostname for pod containers
|
||||
Hostname string
|
||||
// EnableHostUserNamespace sets userns=host when users request host namespaces (pid, ipc, net),
|
||||
// are using non-namespaced capabilities (mknod, sys_time, sys_module), the pod contains a privileged container,
|
||||
// or using host path volumes.
|
||||
// This should only be enabled when the container runtime is performing user remapping AND if the
|
||||
// experimental behavior is desired.
|
||||
EnableHostUserNamespace bool
|
||||
}
|
||||
|
||||
// VolumeInfo contains information about the volume.
|
||||
|
|
|
@ -57,6 +57,13 @@ const (
|
|||
containerTypeLabelContainer = "container"
|
||||
containerLogPathLabelKey = "io.kubernetes.container.logpath"
|
||||
sandboxIDLabelKey = "io.kubernetes.sandbox.id"
|
||||
|
||||
// TODO: https://github.com/kubernetes/kubernetes/pull/31169 provides experimental
|
||||
// defaulting of host user namespace that may be enabled when the docker daemon
|
||||
// is using remapped UIDs.
|
||||
// Dockershim should provide detection support for a remapping environment.
|
||||
// This should be included in the feature proposal. Defaulting may still occur according
|
||||
// to kubelet behavior and system settings in addition to any API flags that may be introduced.
|
||||
)
|
||||
|
||||
// NetworkPluginSettings is the subset of kubelet runtime args we pass
|
||||
|
|
|
@ -681,12 +681,18 @@ func (dm *DockerManager) runContainer(
|
|||
}
|
||||
}
|
||||
|
||||
userNsMode := ""
|
||||
if opts.EnableHostUserNamespace {
|
||||
userNsMode = "host"
|
||||
}
|
||||
|
||||
hc := &dockercontainer.HostConfig{
|
||||
Binds: binds,
|
||||
NetworkMode: dockercontainer.NetworkMode(netMode),
|
||||
IpcMode: dockercontainer.IpcMode(ipcMode),
|
||||
UTSMode: dockercontainer.UTSMode(utsMode),
|
||||
PidMode: dockercontainer.PidMode(pidMode),
|
||||
UsernsMode: dockercontainer.UsernsMode(userNsMode),
|
||||
ReadonlyRootfs: readOnlyRootFilesystem(container),
|
||||
Resources: dockercontainer.Resources{
|
||||
Memory: memoryLimit,
|
||||
|
|
|
@ -450,15 +450,20 @@ func NewMainKubelet(kubeCfg *componentconfig.KubeletConfiguration, kubeDeps *Kub
|
|||
containerManager: kubeDeps.ContainerManager,
|
||||
nodeIP: net.ParseIP(kubeCfg.NodeIP),
|
||||
clock: clock.RealClock{},
|
||||
outOfDiskTransitionFrequency: kubeCfg.OutOfDiskTransitionFrequency.Duration,
|
||||
reservation: *reservation,
|
||||
enableCustomMetrics: kubeCfg.EnableCustomMetrics,
|
||||
babysitDaemons: kubeCfg.BabysitDaemons,
|
||||
enableControllerAttachDetach: kubeCfg.EnableControllerAttachDetach,
|
||||
iptClient: utilipt.New(utilexec.New(), utildbus.New(), utilipt.ProtocolIpv4),
|
||||
makeIPTablesUtilChains: kubeCfg.MakeIPTablesUtilChains,
|
||||
iptablesMasqueradeBit: int(kubeCfg.IPTablesMasqueradeBit),
|
||||
iptablesDropBit: int(kubeCfg.IPTablesDropBit),
|
||||
outOfDiskTransitionFrequency: kubeCfg.OutOfDiskTransitionFrequency.Duration,
|
||||
reservation: *reservation,
|
||||
enableCustomMetrics: kubeCfg.EnableCustomMetrics,
|
||||
babysitDaemons: kubeCfg.BabysitDaemons,
|
||||
enableControllerAttachDetach: kubeCfg.EnableControllerAttachDetach,
|
||||
iptClient: utilipt.New(utilexec.New(), utildbus.New(), utilipt.ProtocolIpv4),
|
||||
makeIPTablesUtilChains: kubeCfg.MakeIPTablesUtilChains,
|
||||
iptablesMasqueradeBit: int(kubeCfg.IPTablesMasqueradeBit),
|
||||
iptablesDropBit: int(kubeCfg.IPTablesDropBit),
|
||||
experimentalHostUserNamespaceDefaulting: utilconfig.DefaultFeatureGate.ExperimentalHostUserNamespaceDefaulting(),
|
||||
}
|
||||
|
||||
if klet.experimentalHostUserNamespaceDefaulting {
|
||||
glog.Infof("Experimental host user namespace defaulting is enabled.")
|
||||
}
|
||||
|
||||
if mode, err := effectiveHairpinMode(componentconfig.HairpinMode(kubeCfg.HairpinMode), kubeCfg.ContainerRuntime, kubeCfg.NetworkPluginName); err != nil {
|
||||
|
@ -1087,6 +1092,13 @@ type Kubelet struct {
|
|||
|
||||
// The handler serving CRI streaming calls (exec/attach/port-forward).
|
||||
criHandler http.Handler
|
||||
|
||||
// experimentalHostUserNamespaceDefaulting sets userns=host when users request host namespaces (pid, ipc, net),
|
||||
// are using non-namespaced capabilities (mknod, sys_time, sys_module), the pod contains a privileged container,
|
||||
// or using host path volumes.
|
||||
// This should only be enabled when the container runtime is performing user remapping AND if the
|
||||
// experimental behavior is desired.
|
||||
experimentalHostUserNamespaceDefaulting bool
|
||||
}
|
||||
|
||||
// setupDataDirs creates:
|
||||
|
|
|
@ -315,6 +315,11 @@ func (kl *Kubelet) GenerateRunContainerOptions(pod *api.Pod, container *api.Cont
|
|||
return nil, err
|
||||
}
|
||||
|
||||
// only do this check if the experimental behavior is enabled, otherwise allow it to default to false
|
||||
if kl.experimentalHostUserNamespaceDefaulting {
|
||||
opts.EnableHostUserNamespace = kl.enableHostUserNamespace(pod)
|
||||
}
|
||||
|
||||
return opts, nil
|
||||
}
|
||||
|
||||
|
@ -1397,3 +1402,87 @@ func (kl *Kubelet) cleanupOrphanedPodCgroups(
|
|||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// enableHostUserNamespace determines if the host user namespace should be used by the container runtime.
|
||||
// Returns true if the pod is using a host pid, pic, or network namespace, the pod is using a non-namespaced
|
||||
// capability, the pod contains a privileged container, or the pod has a host path volume.
|
||||
//
|
||||
// NOTE: when if a container shares any namespace with another container it must also share the user namespace
|
||||
// or it will not have the correct capabilities in the namespace. This means that host user namespace
|
||||
// is enabled per pod, not per container.
|
||||
func (kl *Kubelet) enableHostUserNamespace(pod *api.Pod) bool {
|
||||
if hasPrivilegedContainer(pod) || hasHostNamespace(pod) ||
|
||||
hasHostVolume(pod) || hasNonNamespacedCapability(pod) || kl.hasHostMountPVC(pod) {
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// hasPrivilegedContainer returns true if any of the containers in the pod are privileged.
|
||||
func hasPrivilegedContainer(pod *api.Pod) bool {
|
||||
for _, c := range pod.Spec.Containers {
|
||||
if c.SecurityContext != nil &&
|
||||
c.SecurityContext.Privileged != nil &&
|
||||
*c.SecurityContext.Privileged {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// hasNonNamespacedCapability returns true if MKNOD, SYS_TIME, or SYS_MODULE is requested for any container.
|
||||
func hasNonNamespacedCapability(pod *api.Pod) bool {
|
||||
for _, c := range pod.Spec.Containers {
|
||||
if c.SecurityContext != nil && c.SecurityContext.Capabilities != nil {
|
||||
for _, cap := range c.SecurityContext.Capabilities.Add {
|
||||
if cap == "MKNOD" || cap == "SYS_TIME" || cap == "SYS_MODULE" {
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
// hasHostVolume returns true if the pod spec has a HostPath volume.
|
||||
func hasHostVolume(pod *api.Pod) bool {
|
||||
for _, v := range pod.Spec.Volumes {
|
||||
if v.HostPath != nil {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// hasHostNamespace returns true if hostIPC, hostNetwork, or hostPID are set to true.
|
||||
func hasHostNamespace(pod *api.Pod) bool {
|
||||
if pod.Spec.SecurityContext == nil {
|
||||
return false
|
||||
}
|
||||
return pod.Spec.SecurityContext.HostIPC || pod.Spec.SecurityContext.HostNetwork || pod.Spec.SecurityContext.HostPID
|
||||
}
|
||||
|
||||
// hasHostMountPVC returns true if a PVC is referencing a HostPath volume.
|
||||
func (kl *Kubelet) hasHostMountPVC(pod *api.Pod) bool {
|
||||
for _, volume := range pod.Spec.Volumes {
|
||||
if volume.PersistentVolumeClaim != nil {
|
||||
pvc, err := kl.kubeClient.Core().PersistentVolumeClaims(pod.Namespace).Get(volume.PersistentVolumeClaim.ClaimName)
|
||||
if err != nil {
|
||||
glog.Warningf("unable to retrieve pvc %s:%s - %v", pod.Namespace, volume.PersistentVolumeClaim.ClaimName, err)
|
||||
continue
|
||||
}
|
||||
if pvc != nil {
|
||||
referencedVolume, err := kl.kubeClient.Core().PersistentVolumes().Get(pvc.Spec.VolumeName)
|
||||
if err != nil {
|
||||
glog.Warningf("unable to retrieve pvc %s - %v", pvc.Spec.VolumeName, err)
|
||||
continue
|
||||
}
|
||||
if referencedVolume != nil && referencedVolume.Spec.HostPath != nil {
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
|
|
@ -19,6 +19,7 @@ package kubelet
|
|||
import (
|
||||
"bytes"
|
||||
"errors"
|
||||
"fmt"
|
||||
"net"
|
||||
"sort"
|
||||
"testing"
|
||||
|
@ -28,10 +29,12 @@ import (
|
|||
"k8s.io/kubernetes/pkg/api"
|
||||
"k8s.io/kubernetes/pkg/api/resource"
|
||||
"k8s.io/kubernetes/pkg/apimachinery/registered"
|
||||
"k8s.io/kubernetes/pkg/client/testing/core"
|
||||
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
|
||||
containertest "k8s.io/kubernetes/pkg/kubelet/container/testing"
|
||||
"k8s.io/kubernetes/pkg/kubelet/server/remotecommand"
|
||||
"k8s.io/kubernetes/pkg/labels"
|
||||
"k8s.io/kubernetes/pkg/runtime"
|
||||
"k8s.io/kubernetes/pkg/types"
|
||||
)
|
||||
|
||||
|
@ -1264,3 +1267,230 @@ func TestMakeDevices(t *testing.T) {
|
|||
assert.Equal(t, test.devices, makeDevices(test.container), "[test %q]", test.test)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHasPrivilegedContainer(t *testing.T) {
|
||||
newBoolPtr := func(b bool) *bool {
|
||||
return &b
|
||||
}
|
||||
tests := map[string]struct {
|
||||
securityContext *api.SecurityContext
|
||||
expected bool
|
||||
}{
|
||||
"nil sc": {
|
||||
securityContext: nil,
|
||||
expected: false,
|
||||
},
|
||||
"nil privleged": {
|
||||
securityContext: &api.SecurityContext{},
|
||||
expected: false,
|
||||
},
|
||||
"false privleged": {
|
||||
securityContext: &api.SecurityContext{Privileged: newBoolPtr(false)},
|
||||
expected: false,
|
||||
},
|
||||
"true privleged": {
|
||||
securityContext: &api.SecurityContext{Privileged: newBoolPtr(true)},
|
||||
expected: true,
|
||||
},
|
||||
}
|
||||
|
||||
for k, v := range tests {
|
||||
pod := &api.Pod{
|
||||
Spec: api.PodSpec{
|
||||
Containers: []api.Container{
|
||||
{SecurityContext: v.securityContext},
|
||||
},
|
||||
},
|
||||
}
|
||||
actual := hasPrivilegedContainer(pod)
|
||||
if actual != v.expected {
|
||||
t.Errorf("%s expected %t but got %t", k, v.expected, actual)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestHasHostMountPVC(t *testing.T) {
|
||||
tests := map[string]struct {
|
||||
pvError error
|
||||
pvcError error
|
||||
expected bool
|
||||
podHasPVC bool
|
||||
pvcIsHostPath bool
|
||||
}{
|
||||
"no pvc": {podHasPVC: false, expected: false},
|
||||
"error fetching pvc": {
|
||||
podHasPVC: true,
|
||||
pvcError: fmt.Errorf("foo"),
|
||||
expected: false,
|
||||
},
|
||||
"error fetching pv": {
|
||||
podHasPVC: true,
|
||||
pvError: fmt.Errorf("foo"),
|
||||
expected: false,
|
||||
},
|
||||
"host path pvc": {
|
||||
podHasPVC: true,
|
||||
pvcIsHostPath: true,
|
||||
expected: true,
|
||||
},
|
||||
"non host path pvc": {
|
||||
podHasPVC: true,
|
||||
pvcIsHostPath: false,
|
||||
expected: false,
|
||||
},
|
||||
}
|
||||
|
||||
for k, v := range tests {
|
||||
testKubelet := newTestKubelet(t, false)
|
||||
pod := &api.Pod{
|
||||
Spec: api.PodSpec{},
|
||||
}
|
||||
|
||||
volumeToReturn := &api.PersistentVolume{
|
||||
Spec: api.PersistentVolumeSpec{},
|
||||
}
|
||||
|
||||
if v.podHasPVC {
|
||||
pod.Spec.Volumes = []api.Volume{
|
||||
{
|
||||
VolumeSource: api.VolumeSource{
|
||||
PersistentVolumeClaim: &api.PersistentVolumeClaimVolumeSource{},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
if v.pvcIsHostPath {
|
||||
volumeToReturn.Spec.PersistentVolumeSource = api.PersistentVolumeSource{
|
||||
HostPath: &api.HostPathVolumeSource{},
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
testKubelet.fakeKubeClient.AddReactor("get", "persistentvolumeclaims", func(action core.Action) (bool, runtime.Object, error) {
|
||||
return true, &api.PersistentVolumeClaim{
|
||||
Spec: api.PersistentVolumeClaimSpec{
|
||||
VolumeName: "foo",
|
||||
},
|
||||
}, v.pvcError
|
||||
})
|
||||
testKubelet.fakeKubeClient.AddReactor("get", "persistentvolumes", func(action core.Action) (bool, runtime.Object, error) {
|
||||
return true, volumeToReturn, v.pvError
|
||||
})
|
||||
|
||||
actual := testKubelet.kubelet.hasHostMountPVC(pod)
|
||||
if actual != v.expected {
|
||||
t.Errorf("%s expected %t but got %t", k, v.expected, actual)
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
func TestHasNonNamespacedCapability(t *testing.T) {
|
||||
createPodWithCap := func(caps []api.Capability) *api.Pod {
|
||||
pod := &api.Pod{
|
||||
Spec: api.PodSpec{
|
||||
Containers: []api.Container{{}},
|
||||
},
|
||||
}
|
||||
|
||||
if len(caps) > 0 {
|
||||
pod.Spec.Containers[0].SecurityContext = &api.SecurityContext{
|
||||
Capabilities: &api.Capabilities{
|
||||
Add: caps,
|
||||
},
|
||||
}
|
||||
}
|
||||
return pod
|
||||
}
|
||||
|
||||
nilCaps := createPodWithCap([]api.Capability{api.Capability("foo")})
|
||||
nilCaps.Spec.Containers[0].SecurityContext = nil
|
||||
|
||||
tests := map[string]struct {
|
||||
pod *api.Pod
|
||||
expected bool
|
||||
}{
|
||||
"nil security contxt": {createPodWithCap(nil), false},
|
||||
"nil caps": {nilCaps, false},
|
||||
"namespaced cap": {createPodWithCap([]api.Capability{api.Capability("foo")}), false},
|
||||
"non-namespaced cap MKNOD": {createPodWithCap([]api.Capability{api.Capability("MKNOD")}), true},
|
||||
"non-namespaced cap SYS_TIME": {createPodWithCap([]api.Capability{api.Capability("SYS_TIME")}), true},
|
||||
"non-namespaced cap SYS_MODULE": {createPodWithCap([]api.Capability{api.Capability("SYS_MODULE")}), true},
|
||||
}
|
||||
|
||||
for k, v := range tests {
|
||||
actual := hasNonNamespacedCapability(v.pod)
|
||||
if actual != v.expected {
|
||||
t.Errorf("%s failed, expected %t but got %t", k, v.expected, actual)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestHasHostVolume(t *testing.T) {
|
||||
pod := &api.Pod{
|
||||
Spec: api.PodSpec{
|
||||
Volumes: []api.Volume{
|
||||
{
|
||||
VolumeSource: api.VolumeSource{
|
||||
HostPath: &api.HostPathVolumeSource{},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
result := hasHostVolume(pod)
|
||||
if !result {
|
||||
t.Errorf("expected host volume to enable host user namespace")
|
||||
}
|
||||
|
||||
pod.Spec.Volumes[0].VolumeSource.HostPath = nil
|
||||
result = hasHostVolume(pod)
|
||||
if result {
|
||||
t.Errorf("expected nil host volume to not enable host user namespace")
|
||||
}
|
||||
}
|
||||
|
||||
func TestHasHostNamespace(t *testing.T) {
|
||||
tests := map[string]struct {
|
||||
psc *api.PodSecurityContext
|
||||
expected bool
|
||||
}{
|
||||
"nil psc": {psc: nil, expected: false},
|
||||
"host pid true": {
|
||||
psc: &api.PodSecurityContext{
|
||||
HostPID: true,
|
||||
},
|
||||
expected: true,
|
||||
},
|
||||
"host ipc true": {
|
||||
psc: &api.PodSecurityContext{
|
||||
HostIPC: true,
|
||||
},
|
||||
expected: true,
|
||||
},
|
||||
"host net true": {
|
||||
psc: &api.PodSecurityContext{
|
||||
HostNetwork: true,
|
||||
},
|
||||
expected: true,
|
||||
},
|
||||
"no host ns": {
|
||||
psc: &api.PodSecurityContext{},
|
||||
expected: false,
|
||||
},
|
||||
}
|
||||
|
||||
for k, v := range tests {
|
||||
pod := &api.Pod{
|
||||
Spec: api.PodSpec{
|
||||
SecurityContext: v.psc,
|
||||
},
|
||||
}
|
||||
actual := hasHostNamespace(pod)
|
||||
if actual != v.expected {
|
||||
t.Errorf("%s failed, expected %t but got %t", k, v.expected, actual)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -43,18 +43,26 @@ const (
|
|||
dynamicKubeletConfig = "DynamicKubeletConfig"
|
||||
dynamicVolumeProvisioning = "DynamicVolumeProvisioning"
|
||||
streamingProxyRedirects = "StreamingProxyRedirects"
|
||||
|
||||
// experimentalHostUserNamespaceDefaulting Default userns=host for containers
|
||||
// that are using other host namespaces, host mounts, the pod contains a privileged container,
|
||||
// or specific non-namespaced capabilities
|
||||
// (MKNOD, SYS_MODULE, SYS_TIME). This should only be enabled if user namespace remapping is enabled
|
||||
// in the docker daemon.
|
||||
experimentalHostUserNamespaceDefaultingGate = "ExperimentalHostUserNamespaceDefaulting"
|
||||
)
|
||||
|
||||
var (
|
||||
// Default values for recorded features. Every new feature gate should be
|
||||
// represented here.
|
||||
knownFeatures = map[string]featureSpec{
|
||||
allAlphaGate: {false, alpha},
|
||||
externalTrafficLocalOnly: {true, beta},
|
||||
appArmor: {true, beta},
|
||||
dynamicKubeletConfig: {false, alpha},
|
||||
dynamicVolumeProvisioning: {true, alpha},
|
||||
streamingProxyRedirects: {false, alpha},
|
||||
allAlphaGate: {false, alpha},
|
||||
externalTrafficLocalOnly: {true, beta},
|
||||
appArmor: {true, beta},
|
||||
dynamicKubeletConfig: {false, alpha},
|
||||
dynamicVolumeProvisioning: {true, alpha},
|
||||
streamingProxyRedirects: {false, alpha},
|
||||
experimentalHostUserNamespaceDefaultingGate: {false, alpha},
|
||||
}
|
||||
|
||||
// Special handling for a few gates.
|
||||
|
@ -115,6 +123,10 @@ type FeatureGate interface {
|
|||
// owner: timstclair
|
||||
// alpha: v1.5
|
||||
StreamingProxyRedirects() bool
|
||||
|
||||
// owner: @pweil-
|
||||
// alpha: v1.5
|
||||
ExperimentalHostUserNamespaceDefaulting() bool
|
||||
}
|
||||
|
||||
// featureGate implements FeatureGate as well as pflag.Value for flag parsing.
|
||||
|
@ -209,6 +221,11 @@ func (f *featureGate) StreamingProxyRedirects() bool {
|
|||
return f.lookup(streamingProxyRedirects)
|
||||
}
|
||||
|
||||
// ExperimentalHostUserNamespaceDefaulting returns value for experimentalHostUserNamespaceDefaulting
|
||||
func (f *featureGate) ExperimentalHostUserNamespaceDefaulting() bool {
|
||||
return f.lookup(experimentalHostUserNamespaceDefaultingGate)
|
||||
}
|
||||
|
||||
func (f *featureGate) lookup(key string) bool {
|
||||
defaultValue := f.known[key].enabled
|
||||
if f.enabled != nil {
|
||||
|
|
Loading…
Reference in New Issue