Merge pull request #31169 from pweil-/userns-experimental

Automatic merge from submit-queue

Default host user namespace via experimental flag

@vishh @ncdc @pmorie @smarterclayton @thockin 

The initial thought on the implementation (https://github.com/kubernetes/kubernetes/pull/30684#issuecomment-241523425) wasn't quite right. Since we need to dereference a PVC in some cases, the defaulting code didn't fit nicely in the docker manager code (it would have coupled it with a kube client and would have been messy). I think passing this in via the runtime config turned out cleaner. PTAL
pull/6/head
Kubernetes Submit Queue 2016-11-14 11:04:48 -08:00 committed by GitHub
commit c76fe8dcda
7 changed files with 382 additions and 15 deletions

View File

@ -434,6 +434,12 @@ type RunContainerOptions struct {
ReadOnly bool
// hostname for pod containers
Hostname string
// EnableHostUserNamespace sets userns=host when the pod requests a host namespace (pid, ipc, net),
// uses a non-namespaced capability (mknod, sys_time, sys_module), contains a privileged container,
// or uses a host path volume.
// This should only be enabled when the container runtime is performing user remapping AND
// the experimental behavior is desired.
EnableHostUserNamespace bool
}
// VolumeInfo contains information about the volume.

View File

@ -57,6 +57,13 @@ const (
containerTypeLabelContainer = "container"
containerLogPathLabelKey = "io.kubernetes.container.logpath"
sandboxIDLabelKey = "io.kubernetes.sandbox.id"
// TODO: https://github.com/kubernetes/kubernetes/pull/31169 provides experimental
// defaulting of host user namespace that may be enabled when the docker daemon
// is using remapped UIDs.
// Dockershim should provide detection support for a remapping environment.
// This should be included in the feature proposal. Defaulting may still occur according
// to kubelet behavior and system settings in addition to any API flags that may be introduced.
)
// NetworkPluginSettings is the subset of kubelet runtime args we pass

View File

@ -681,12 +681,18 @@ func (dm *DockerManager) runContainer(
}
}
userNsMode := ""
if opts.EnableHostUserNamespace {
userNsMode = "host"
}
hc := &dockercontainer.HostConfig{
Binds: binds,
NetworkMode: dockercontainer.NetworkMode(netMode),
IpcMode: dockercontainer.IpcMode(ipcMode),
UTSMode: dockercontainer.UTSMode(utsMode),
PidMode: dockercontainer.PidMode(pidMode),
UsernsMode: dockercontainer.UsernsMode(userNsMode),
ReadonlyRootfs: readOnlyRootFilesystem(container),
Resources: dockercontainer.Resources{
Memory: memoryLimit,

View File

@ -450,15 +450,20 @@ func NewMainKubelet(kubeCfg *componentconfig.KubeletConfiguration, kubeDeps *Kub
containerManager: kubeDeps.ContainerManager,
nodeIP: net.ParseIP(kubeCfg.NodeIP),
clock: clock.RealClock{},
outOfDiskTransitionFrequency: kubeCfg.OutOfDiskTransitionFrequency.Duration,
reservation: *reservation,
enableCustomMetrics: kubeCfg.EnableCustomMetrics,
babysitDaemons: kubeCfg.BabysitDaemons,
enableControllerAttachDetach: kubeCfg.EnableControllerAttachDetach,
iptClient: utilipt.New(utilexec.New(), utildbus.New(), utilipt.ProtocolIpv4),
makeIPTablesUtilChains: kubeCfg.MakeIPTablesUtilChains,
iptablesMasqueradeBit: int(kubeCfg.IPTablesMasqueradeBit),
iptablesDropBit: int(kubeCfg.IPTablesDropBit),
outOfDiskTransitionFrequency: kubeCfg.OutOfDiskTransitionFrequency.Duration,
reservation: *reservation,
enableCustomMetrics: kubeCfg.EnableCustomMetrics,
babysitDaemons: kubeCfg.BabysitDaemons,
enableControllerAttachDetach: kubeCfg.EnableControllerAttachDetach,
iptClient: utilipt.New(utilexec.New(), utildbus.New(), utilipt.ProtocolIpv4),
makeIPTablesUtilChains: kubeCfg.MakeIPTablesUtilChains,
iptablesMasqueradeBit: int(kubeCfg.IPTablesMasqueradeBit),
iptablesDropBit: int(kubeCfg.IPTablesDropBit),
experimentalHostUserNamespaceDefaulting: utilconfig.DefaultFeatureGate.ExperimentalHostUserNamespaceDefaulting(),
}
if klet.experimentalHostUserNamespaceDefaulting {
glog.Infof("Experimental host user namespace defaulting is enabled.")
}
if mode, err := effectiveHairpinMode(componentconfig.HairpinMode(kubeCfg.HairpinMode), kubeCfg.ContainerRuntime, kubeCfg.NetworkPluginName); err != nil {
@ -1087,6 +1092,13 @@ type Kubelet struct {
// The handler serving CRI streaming calls (exec/attach/port-forward).
criHandler http.Handler
// experimentalHostUserNamespaceDefaulting sets userns=host when the pod requests a host namespace (pid, ipc, net),
// uses a non-namespaced capability (mknod, sys_time, sys_module), contains a privileged container,
// or uses a host path volume.
// This should only be enabled when the container runtime is performing user remapping AND
// the experimental behavior is desired.
experimentalHostUserNamespaceDefaulting bool
}
// setupDataDirs creates:

View File

@ -315,6 +315,11 @@ func (kl *Kubelet) GenerateRunContainerOptions(pod *api.Pod, container *api.Cont
return nil, err
}
// only do this check if the experimental behavior is enabled, otherwise allow it to default to false
if kl.experimentalHostUserNamespaceDefaulting {
opts.EnableHostUserNamespace = kl.enableHostUserNamespace(pod)
}
return opts, nil
}
@ -1397,3 +1402,87 @@ func (kl *Kubelet) cleanupOrphanedPodCgroups(
}
return nil
}
// enableHostUserNamespace reports whether the container runtime should run the pod in the
// host user namespace. That is the case when the pod uses a host pid, ipc, or network
// namespace, requests a non-namespaced capability, contains a privileged container, has a
// host path volume, or references a PVC that resolves to a host path volume.
//
// NOTE: if a container shares any namespace with another container it must also share the
// user namespace, or it will not have the correct capabilities in the namespace. This means
// that the host user namespace is enabled per pod, not per container.
func (kl *Kubelet) enableHostUserNamespace(pod *api.Pod) bool {
	// The PVC check stays last: it is the only one that goes through kl.kubeClient, and
	// short-circuit evaluation skips it whenever a check on the pod spec already matched.
	return hasPrivilegedContainer(pod) ||
		hasHostNamespace(pod) ||
		hasHostVolume(pod) ||
		hasNonNamespacedCapability(pod) ||
		kl.hasHostMountPVC(pod)
}
// hasPrivilegedContainer returns true if any container in the pod, including its init
// containers, is privileged.
//
// Init containers are inspected as well because the host user namespace decision is made
// once for the whole pod, not per container.
func hasPrivilegedContainer(pod *api.Pod) bool {
	// isPrivileged treats a missing security context or an unset Privileged value as
	// "not privileged".
	isPrivileged := func(c *api.Container) bool {
		sc := c.SecurityContext
		return sc != nil && sc.Privileged != nil && *sc.Privileged
	}

	// Walk the two slices separately instead of appending one onto the other, so the
	// backing arrays owned by the pod spec are never written to.
	for i := range pod.Spec.Containers {
		if isPrivileged(&pod.Spec.Containers[i]) {
			return true
		}
	}
	for i := range pod.Spec.InitContainers {
		if isPrivileged(&pod.Spec.InitContainers[i]) {
			return true
		}
	}
	return false
}
// hasNonNamespacedCapability reports whether any container in pod.Spec.Containers adds one
// of the capabilities MKNOD, SYS_TIME, or SYS_MODULE.
func hasNonNamespacedCapability(pod *api.Pod) bool {
	for _, container := range pod.Spec.Containers {
		sc := container.SecurityContext
		if sc == nil || sc.Capabilities == nil {
			// Nothing was added for this container.
			continue
		}
		for _, added := range sc.Capabilities.Add {
			switch added {
			case "MKNOD", "SYS_TIME", "SYS_MODULE":
				return true
			}
		}
	}
	return false
}
// hasHostVolume reports whether at least one volume declared in the pod spec is a
// HostPath volume.
func hasHostVolume(pod *api.Pod) bool {
	volumes := pod.Spec.Volumes
	for i := range volumes {
		if volumes[i].HostPath != nil {
			return true
		}
	}
	return false
}
// hasHostNamespace reports whether the pod-level security context sets hostIPC,
// hostNetwork, or hostPID. A pod without a security context requests none of them.
func hasHostNamespace(pod *api.Pod) bool {
	psc := pod.Spec.SecurityContext
	return psc != nil && (psc.HostIPC || psc.HostNetwork || psc.HostPID)
}
// hasHostMountPVC returns true if any PVC referenced by the pod resolves to a
// PersistentVolume that is backed by a HostPath.
//
// The check is best effort: when a claim or its volume cannot be retrieved, a warning is
// logged and that claim is skipped, so lookup failures yield false rather than an error.
func (kl *Kubelet) hasHostMountPVC(pod *api.Pod) bool {
	// NOTE(review): kubeClient looks like it can be unset when the kubelet runs without an
	// API server - confirm. Without a client no claim can be resolved, so report false
	// instead of dereferencing a nil client.
	if kl.kubeClient == nil {
		return false
	}

	for _, volume := range pod.Spec.Volumes {
		claim := volume.PersistentVolumeClaim
		if claim == nil {
			continue
		}

		pvc, err := kl.kubeClient.Core().PersistentVolumeClaims(pod.Namespace).Get(claim.ClaimName)
		if err != nil {
			glog.Warningf("unable to retrieve pvc %s:%s - %v", pod.Namespace, claim.ClaimName, err)
			continue
		}
		if pvc == nil {
			continue
		}

		referencedVolume, err := kl.kubeClient.Core().PersistentVolumes().Get(pvc.Spec.VolumeName)
		if err != nil {
			// This lookup is for the PersistentVolume, so name it as such in the log.
			glog.Warningf("unable to retrieve pv %s - %v", pvc.Spec.VolumeName, err)
			continue
		}
		if referencedVolume != nil && referencedVolume.Spec.HostPath != nil {
			return true
		}
	}
	return false
}

View File

@ -19,6 +19,7 @@ package kubelet
import (
"bytes"
"errors"
"fmt"
"net"
"sort"
"testing"
@ -28,10 +29,12 @@ import (
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/api/resource"
"k8s.io/kubernetes/pkg/apimachinery/registered"
"k8s.io/kubernetes/pkg/client/testing/core"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
containertest "k8s.io/kubernetes/pkg/kubelet/container/testing"
"k8s.io/kubernetes/pkg/kubelet/server/remotecommand"
"k8s.io/kubernetes/pkg/labels"
"k8s.io/kubernetes/pkg/runtime"
"k8s.io/kubernetes/pkg/types"
)
@ -1264,3 +1267,230 @@ func TestMakeDevices(t *testing.T) {
assert.Equal(t, test.devices, makeDevices(test.container), "[test %q]", test.test)
}
}
// TestHasPrivilegedContainer checks hasPrivilegedContainer against a single-container pod
// whose security context is nil, has no Privileged value, or has Privileged set to false
// or true.
func TestHasPrivilegedContainer(t *testing.T) {
	yes, no := true, false

	type testCase struct {
		securityContext *api.SecurityContext
		expected        bool
	}
	cases := map[string]testCase{
		"nil sc":          {securityContext: nil, expected: false},
		"nil privleged":   {securityContext: &api.SecurityContext{}, expected: false},
		"false privleged": {securityContext: &api.SecurityContext{Privileged: &no}, expected: false},
		"true privleged":  {securityContext: &api.SecurityContext{Privileged: &yes}, expected: true},
	}

	for name, tc := range cases {
		container := api.Container{SecurityContext: tc.securityContext}
		pod := &api.Pod{
			Spec: api.PodSpec{Containers: []api.Container{container}},
		}
		if got := hasPrivilegedContainer(pod); got != tc.expected {
			t.Errorf("%s expected %t but got %t", name, tc.expected, got)
		}
	}
}
// TestHasHostMountPVC drives Kubelet.hasHostMountPVC through a fake kube client whose "get"
// reactors for persistentvolumeclaims and persistentvolumes return canned objects and,
// optionally, errors. It covers a pod without a PVC, failures fetching the claim or the
// volume, and claims whose volume is or is not backed by a host path.
func TestHasHostMountPVC(t *testing.T) {
	type testCase struct {
		pvError       error
		pvcError      error
		expected      bool
		podHasPVC     bool
		pvcIsHostPath bool
	}
	cases := map[string]testCase{
		"no pvc":             {podHasPVC: false, expected: false},
		"error fetching pvc": {podHasPVC: true, pvcError: fmt.Errorf("foo"), expected: false},
		"error fetching pv":  {podHasPVC: true, pvError: fmt.Errorf("foo"), expected: false},
		"host path pvc":      {podHasPVC: true, pvcIsHostPath: true, expected: true},
		"non host path pvc":  {podHasPVC: true, pvcIsHostPath: false, expected: false},
	}

	for name, tc := range cases {
		// A fresh kubelet per case keeps the reactors of one case from leaking into the next.
		testKubelet := newTestKubelet(t, false)

		pod := &api.Pod{Spec: api.PodSpec{}}
		pv := &api.PersistentVolume{Spec: api.PersistentVolumeSpec{}}
		if tc.podHasPVC {
			claimSource := api.VolumeSource{
				PersistentVolumeClaim: &api.PersistentVolumeClaimVolumeSource{},
			}
			pod.Spec.Volumes = []api.Volume{{VolumeSource: claimSource}}
			if tc.pvcIsHostPath {
				pv.Spec.PersistentVolumeSource = api.PersistentVolumeSource{
					HostPath: &api.HostPathVolumeSource{},
				}
			}
		}

		// Copy the per-case errors so the reactors do not depend on the loop variable.
		pvcErr, pvErr := tc.pvcError, tc.pvError
		testKubelet.fakeKubeClient.AddReactor("get", "persistentvolumeclaims", func(action core.Action) (bool, runtime.Object, error) {
			claim := &api.PersistentVolumeClaim{
				Spec: api.PersistentVolumeClaimSpec{VolumeName: "foo"},
			}
			return true, claim, pvcErr
		})
		testKubelet.fakeKubeClient.AddReactor("get", "persistentvolumes", func(action core.Action) (bool, runtime.Object, error) {
			return true, pv, pvErr
		})

		if got := testKubelet.kubelet.hasHostMountPVC(pod); got != tc.expected {
			t.Errorf("%s expected %t but got %t", name, tc.expected, got)
		}
	}
}
// TestHasNonNamespacedCapability checks hasNonNamespacedCapability for pods with no
// security context, a security context without capabilities, an added capability outside
// the non-namespaced set, and each of MKNOD, SYS_TIME and SYS_MODULE.
func TestHasNonNamespacedCapability(t *testing.T) {
	// createPodWithCap builds a single-container pod; a security context is only attached
	// when at least one capability is supplied.
	createPodWithCap := func(caps []api.Capability) *api.Pod {
		pod := &api.Pod{
			Spec: api.PodSpec{
				Containers: []api.Container{{}},
			},
		}
		if len(caps) > 0 {
			pod.Spec.Containers[0].SecurityContext = &api.SecurityContext{
				Capabilities: &api.Capabilities{
					Add: caps,
				},
			}
		}
		return pod
	}

	// Keep the security context but drop its capabilities, so this pod exercises the
	// nil-Capabilities branch instead of repeating the nil-security-context case.
	nilCaps := createPodWithCap([]api.Capability{api.Capability("foo")})
	nilCaps.Spec.Containers[0].SecurityContext.Capabilities = nil

	tests := map[string]struct {
		pod      *api.Pod
		expected bool
	}{
		"nil security contxt": {
			pod:      createPodWithCap(nil),
			expected: false,
		},
		"nil caps": {
			pod:      nilCaps,
			expected: false,
		},
		"namespaced cap": {
			pod:      createPodWithCap([]api.Capability{api.Capability("foo")}),
			expected: false,
		},
		"non-namespaced cap MKNOD": {
			pod:      createPodWithCap([]api.Capability{api.Capability("MKNOD")}),
			expected: true,
		},
		"non-namespaced cap SYS_TIME": {
			pod:      createPodWithCap([]api.Capability{api.Capability("SYS_TIME")}),
			expected: true,
		},
		"non-namespaced cap SYS_MODULE": {
			pod:      createPodWithCap([]api.Capability{api.Capability("SYS_MODULE")}),
			expected: true,
		},
	}

	for k, v := range tests {
		actual := hasNonNamespacedCapability(v.pod)
		if actual != v.expected {
			t.Errorf("%s failed, expected %t but got %t", k, v.expected, actual)
		}
	}
}
// TestHasHostVolume checks that hasHostVolume is true for a pod with a HostPath volume and
// becomes false once that volume's HostPath source is cleared.
func TestHasHostVolume(t *testing.T) {
	hostPathVolume := api.Volume{
		VolumeSource: api.VolumeSource{HostPath: &api.HostPathVolumeSource{}},
	}
	pod := &api.Pod{
		Spec: api.PodSpec{Volumes: []api.Volume{hostPathVolume}},
	}

	if !hasHostVolume(pod) {
		t.Errorf("expected host volume to enable host user namespace")
	}

	// The same pod is reused; only the HostPath source of its single volume is removed.
	pod.Spec.Volumes[0].VolumeSource.HostPath = nil
	if hasHostVolume(pod) {
		t.Errorf("expected nil host volume to not enable host user namespace")
	}
}
// TestHasHostNamespace checks hasHostNamespace for a pod without a security context, a pod
// with each of HostPID, HostIPC and HostNetwork set on its own, and a pod whose security
// context sets none of them.
func TestHasHostNamespace(t *testing.T) {
	type testCase struct {
		psc      *api.PodSecurityContext
		expected bool
	}
	cases := map[string]testCase{
		"nil psc":       {psc: nil, expected: false},
		"host pid true": {psc: &api.PodSecurityContext{HostPID: true}, expected: true},
		"host ipc true": {psc: &api.PodSecurityContext{HostIPC: true}, expected: true},
		"host net true": {psc: &api.PodSecurityContext{HostNetwork: true}, expected: true},
		"no host ns":    {psc: &api.PodSecurityContext{}, expected: false},
	}

	for name, tc := range cases {
		pod := &api.Pod{
			Spec: api.PodSpec{SecurityContext: tc.psc},
		}
		if got := hasHostNamespace(pod); got != tc.expected {
			t.Errorf("%s failed, expected %t but got %t", name, tc.expected, got)
		}
	}
}

View File

@ -43,18 +43,26 @@ const (
dynamicKubeletConfig = "DynamicKubeletConfig"
dynamicVolumeProvisioning = "DynamicVolumeProvisioning"
streamingProxyRedirects = "StreamingProxyRedirects"
// experimentalHostUserNamespaceDefaultingGate defaults userns=host for pods that
// use other host namespaces or host mounts, contain a privileged container,
// or request specific non-namespaced capabilities
// (MKNOD, SYS_MODULE, SYS_TIME). This should only be enabled if user namespace remapping is enabled
// in the docker daemon.
experimentalHostUserNamespaceDefaultingGate = "ExperimentalHostUserNamespaceDefaulting"
)
var (
// Default values for recorded features. Every new feature gate should be
// represented here.
knownFeatures = map[string]featureSpec{
allAlphaGate: {false, alpha},
externalTrafficLocalOnly: {true, beta},
appArmor: {true, beta},
dynamicKubeletConfig: {false, alpha},
dynamicVolumeProvisioning: {true, alpha},
streamingProxyRedirects: {false, alpha},
allAlphaGate: {false, alpha},
externalTrafficLocalOnly: {true, beta},
appArmor: {true, beta},
dynamicKubeletConfig: {false, alpha},
dynamicVolumeProvisioning: {true, alpha},
streamingProxyRedirects: {false, alpha},
experimentalHostUserNamespaceDefaultingGate: {false, alpha},
}
// Special handling for a few gates.
@ -115,6 +123,10 @@ type FeatureGate interface {
// owner: timstclair
// alpha: v1.5
StreamingProxyRedirects() bool
// owner: @pweil-
// alpha: v1.5
ExperimentalHostUserNamespaceDefaulting() bool
}
// featureGate implements FeatureGate as well as pflag.Value for flag parsing.
@ -209,6 +221,11 @@ func (f *featureGate) StreamingProxyRedirects() bool {
return f.lookup(streamingProxyRedirects)
}
// ExperimentalHostUserNamespaceDefaulting reports the value that f.lookup resolves for the
// ExperimentalHostUserNamespaceDefaulting feature gate.
func (f *featureGate) ExperimentalHostUserNamespaceDefaulting() bool {
	enabled := f.lookup(experimentalHostUserNamespaceDefaultingGate)
	return enabled
}
func (f *featureGate) lookup(key string) bool {
defaultValue := f.known[key].enabled
if f.enabled != nil {