diff --git a/pkg/api/pod/util.go b/pkg/api/pod/util.go index 32d185727f..b9f875e57a 100644 --- a/pkg/api/pod/util.go +++ b/pkg/api/pod/util.go @@ -262,6 +262,8 @@ func DropDisabledAlphaFields(podSpec *api.PodSpec) { if !utilfeature.DefaultFeatureGate.Enabled(features.RuntimeClass) && podSpec.RuntimeClassName != nil { podSpec.RuntimeClassName = nil } + + DropDisabledProcMountField(podSpec) } // DropDisabledRunAsGroupField removes disabled fields from PodSpec related @@ -284,6 +286,24 @@ func DropDisabledRunAsGroupField(podSpec *api.PodSpec) { } } +// DropDisabledProcMountField removes disabled fields from PodSpec related +// to ProcMount +func DropDisabledProcMountField(podSpec *api.PodSpec) { + if !utilfeature.DefaultFeatureGate.Enabled(features.ProcMountType) { + defProcMount := api.DefaultProcMount + for i := range podSpec.Containers { + if podSpec.Containers[i].SecurityContext != nil { + podSpec.Containers[i].SecurityContext.ProcMount = &defProcMount + } + } + for i := range podSpec.InitContainers { + if podSpec.InitContainers[i].SecurityContext != nil { + podSpec.InitContainers[i].SecurityContext.ProcMount = &defProcMount + } + } + } +} + // DropDisabledVolumeMountsAlphaFields removes disabled fields from []VolumeMount. // This should be called from PrepareForCreate/PrepareForUpdate for all resources containing a VolumeMount func DropDisabledVolumeMountsAlphaFields(volumeMounts []api.VolumeMount) { diff --git a/pkg/apis/core/types.go b/pkg/apis/core/types.go index ca3bfacec5..a9a561c5bc 100644 --- a/pkg/apis/core/types.go +++ b/pkg/apis/core/types.go @@ -4616,8 +4616,27 @@ type SecurityContext struct { // the no_new_privs flag will be set on the container process. // +optional AllowPrivilegeEscalation *bool + // ProcMount denotes the type of proc mount to use for the containers. + // The default is DefaultProcMount which uses the container runtime defaults for + // readonly paths and masked paths. 
+ // +optional + ProcMount *ProcMountType } +type ProcMountType string + +const ( + // DefaultProcMount uses the container runtime defaults for readonly and masked + // paths for /proc. Most container runtimes mask certain paths in /proc to avoid + // accidental security exposure of special devices or information. + DefaultProcMount ProcMountType = "Default" + + // UnmaskedProcMount bypasses the default masking behavior of the container + // runtime and ensures the newly created /proc for the container stays intact with + // no modifications. + UnmaskedProcMount ProcMountType = "Unmasked" +) + // SELinuxOptions are the labels to be applied to the container. type SELinuxOptions struct { // SELinux user label diff --git a/pkg/apis/core/v1/conversion.go b/pkg/apis/core/v1/conversion.go index 64d458189a..bab07f1950 100644 --- a/pkg/apis/core/v1/conversion.go +++ b/pkg/apis/core/v1/conversion.go @@ -380,6 +380,11 @@ func Convert_core_SecurityContext_To_v1_SecurityContext(in *core.SecurityContext out.RunAsNonRoot = in.RunAsNonRoot out.ReadOnlyRootFilesystem = in.ReadOnlyRootFilesystem out.AllowPrivilegeEscalation = in.AllowPrivilegeEscalation + if in.ProcMount != nil { + pm := string(*in.ProcMount) + pmt := v1.ProcMountType(pm) + out.ProcMount = &pmt + } return nil } diff --git a/pkg/apis/core/v1/defaults.go b/pkg/apis/core/v1/defaults.go index 622e239025..1c7e2b32d6 100644 --- a/pkg/apis/core/v1/defaults.go +++ b/pkg/apis/core/v1/defaults.go @@ -93,6 +93,7 @@ func SetDefaults_Container(obj *v1.Container) { obj.TerminationMessagePolicy = v1.TerminationMessageReadFile } } + func SetDefaults_Service(obj *v1.Service) { if obj.Spec.SessionAffinity == "" { obj.Spec.SessionAffinity = v1.ServiceAffinityNone diff --git a/pkg/apis/core/v1/zz_generated.conversion.go b/pkg/apis/core/v1/zz_generated.conversion.go index c646e34f2b..88b711d0d4 100644 --- a/pkg/apis/core/v1/zz_generated.conversion.go +++ b/pkg/apis/core/v1/zz_generated.conversion.go @@ -6724,6 +6724,7 @@ func 
autoConvert_v1_SecurityContext_To_core_SecurityContext(in *v1.SecurityConte out.RunAsNonRoot = (*bool)(unsafe.Pointer(in.RunAsNonRoot)) out.ReadOnlyRootFilesystem = (*bool)(unsafe.Pointer(in.ReadOnlyRootFilesystem)) out.AllowPrivilegeEscalation = (*bool)(unsafe.Pointer(in.AllowPrivilegeEscalation)) + out.ProcMount = (*core.ProcMountType)(unsafe.Pointer(in.ProcMount)) return nil } @@ -6741,6 +6742,7 @@ func autoConvert_core_SecurityContext_To_v1_SecurityContext(in *core.SecurityCon out.RunAsNonRoot = (*bool)(unsafe.Pointer(in.RunAsNonRoot)) out.ReadOnlyRootFilesystem = (*bool)(unsafe.Pointer(in.ReadOnlyRootFilesystem)) out.AllowPrivilegeEscalation = (*bool)(unsafe.Pointer(in.AllowPrivilegeEscalation)) + out.ProcMount = (*v1.ProcMountType)(unsafe.Pointer(in.ProcMount)) return nil } diff --git a/pkg/apis/core/v1/zz_generated.defaults.go b/pkg/apis/core/v1/zz_generated.defaults.go index 00e0b384aa..0ea5e0fae0 100644 --- a/pkg/apis/core/v1/zz_generated.defaults.go +++ b/pkg/apis/core/v1/zz_generated.defaults.go @@ -263,6 +263,9 @@ func SetObjectDefaults_Pod(in *v1.Pod) { } } } + if a.SecurityContext != nil { + SetDefaults_SecurityContext(a.SecurityContext) + } } for i := range in.Spec.Containers { a := &in.Spec.Containers[i] @@ -305,6 +308,9 @@ func SetObjectDefaults_Pod(in *v1.Pod) { } } } + if a.SecurityContext != nil { + SetDefaults_SecurityContext(a.SecurityContext) + } } } @@ -409,6 +415,9 @@ func SetObjectDefaults_PodTemplate(in *v1.PodTemplate) { } } } + if a.SecurityContext != nil { + SetDefaults_SecurityContext(a.SecurityContext) + } } for i := range in.Template.Spec.Containers { a := &in.Template.Spec.Containers[i] @@ -451,6 +460,9 @@ func SetObjectDefaults_PodTemplate(in *v1.PodTemplate) { } } } + if a.SecurityContext != nil { + SetDefaults_SecurityContext(a.SecurityContext) + } } } @@ -557,6 +569,9 @@ func SetObjectDefaults_ReplicationController(in *v1.ReplicationController) { } } } + if a.SecurityContext != nil { + 
SetDefaults_SecurityContext(a.SecurityContext) + } } for i := range in.Spec.Template.Spec.Containers { a := &in.Spec.Template.Spec.Containers[i] @@ -599,6 +614,9 @@ func SetObjectDefaults_ReplicationController(in *v1.ReplicationController) { } } } + if a.SecurityContext != nil { + SetDefaults_SecurityContext(a.SecurityContext) + } } } } diff --git a/pkg/apis/policy/types.go b/pkg/apis/policy/types.go index 1af388bc0e..7b9628657f 100644 --- a/pkg/apis/policy/types.go +++ b/pkg/apis/policy/types.go @@ -228,6 +228,10 @@ type PodSecurityPolicySpec struct { // e.g. "foo.*" forbids "foo.bar", "foo.baz", etc. // +optional ForbiddenSysctls []string + // AllowedProcMountTypes is a whitelist of allowed ProcMountTypes. + // Empty or nil indicates that only the DefaultProcMount type may be used. + // +optional + AllowedProcMountTypes []api.ProcMountType } // AllowedHostPath defines the host volume conditions that will be enabled by a policy diff --git a/pkg/features/kube_features.go b/pkg/features/kube_features.go index c66e462ef4..7721df6da7 100644 --- a/pkg/features/kube_features.go +++ b/pkg/features/kube_features.go @@ -363,6 +363,12 @@ const ( // // Enable volume snapshot data source support. VolumeSnapshotDataSource utilfeature.Feature = "VolumeSnapshotDataSource" + + // owner: @jessfraz + // alpha: v1.12 + // + // Enables control over ProcMountType for containers. 
+ ProcMountType utilfeature.Feature = "ProcMountType" ) func init() { @@ -424,6 +430,7 @@ var defaultKubernetesFeatureGates = map[utilfeature.Feature]utilfeature.FeatureS NodeLease: {Default: false, PreRelease: utilfeature.Alpha}, SCTPSupport: {Default: false, PreRelease: utilfeature.Alpha}, VolumeSnapshotDataSource: {Default: false, PreRelease: utilfeature.Alpha}, + ProcMountType: {Default: false, PreRelease: utilfeature.Alpha}, // inherited features from generic apiserver, relisted here to get a conflict if it is changed // unintentionally on either side: diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index f41ef7d717..b7827b960d 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -849,6 +849,8 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration, klet.nodeLeaseController = nodelease.NewController(klet.clock, klet.heartbeatClient, string(klet.nodeName), kubeCfg.NodeLeaseDurationSeconds, klet.onRepeatedHeartbeatFailure) } + klet.softAdmitHandlers.AddPodAdmitHandler(lifecycle.NewProcMountAdmitHandler(klet.containerRuntime)) + // Finally, put the most recent version of the config on the Kubelet, so // people can see how it was configured. 
klet.kubeletConfiguration = *kubeCfg diff --git a/pkg/kubelet/kuberuntime/security_context.go b/pkg/kubelet/kuberuntime/security_context.go index 20e11e7fca..7195174fc0 100644 --- a/pkg/kubelet/kuberuntime/security_context.go +++ b/pkg/kubelet/kuberuntime/security_context.go @@ -30,7 +30,10 @@ func (m *kubeGenericRuntimeManager) determineEffectiveSecurityContext(pod *v1.Po effectiveSc := securitycontext.DetermineEffectiveSecurityContext(pod, container) synthesized := convertToRuntimeSecurityContext(effectiveSc) if synthesized == nil { - synthesized = &runtimeapi.LinuxContainerSecurityContext{} + synthesized = &runtimeapi.LinuxContainerSecurityContext{ + MaskedPaths: securitycontext.ConvertToRuntimeMaskedPaths(effectiveSc.ProcMount), + ReadonlyPaths: securitycontext.ConvertToRuntimeReadonlyPaths(effectiveSc.ProcMount), + } } // set SeccompProfilePath. @@ -67,6 +70,9 @@ func (m *kubeGenericRuntimeManager) determineEffectiveSecurityContext(pod *v1.Po synthesized.NoNewPrivs = securitycontext.AddNoNewPrivileges(effectiveSc) + synthesized.MaskedPaths = securitycontext.ConvertToRuntimeMaskedPaths(effectiveSc.ProcMount) + synthesized.ReadonlyPaths = securitycontext.ConvertToRuntimeReadonlyPaths(effectiveSc.ProcMount) + return synthesized } diff --git a/pkg/kubelet/lifecycle/handlers.go b/pkg/kubelet/lifecycle/handlers.go index b941d85537..c0630ebb2a 100644 --- a/pkg/kubelet/lifecycle/handlers.go +++ b/pkg/kubelet/lifecycle/handlers.go @@ -230,3 +230,73 @@ func noNewPrivsRequired(pod *v1.Pod) bool { } return false } + +func NewProcMountAdmitHandler(runtime kubecontainer.Runtime) PodAdmitHandler { + return &procMountAdmitHandler{ + Runtime: runtime, + } +} + +type procMountAdmitHandler struct { + kubecontainer.Runtime +} + +func (a *procMountAdmitHandler) Admit(attrs *PodAdmitAttributes) PodAdmitResult { + // If the pod is already running or terminated, no need to recheck ProcMount. 
+ if attrs.Pod.Status.Phase != v1.PodPending { + return PodAdmitResult{Admit: true} + } + + // If the containers in a pod only need the default ProcMountType, admit it. + if procMountIsDefault(attrs.Pod) { + return PodAdmitResult{Admit: true} + } + + // Always admit runtimes except docker. + if a.Runtime.Type() != kubetypes.DockerContainerRuntime { + return PodAdmitResult{Admit: true} + } + + // Make sure docker api version is valid. + // Merged in https://github.com/moby/moby/pull/36644 + rversion, err := a.Runtime.APIVersion() + if err != nil { + return PodAdmitResult{ + Admit: false, + Reason: "ProcMount", + Message: fmt.Sprintf("Cannot enforce ProcMount: %v", err), + } + } + v, err := rversion.Compare("1.38.0") + if err != nil { + return PodAdmitResult{ + Admit: false, + Reason: "ProcMount", + Message: fmt.Sprintf("Cannot enforce ProcMount: %v", err), + } + } + // If the version is less than 1.38 it will return -1 above. + if v == -1 { + return PodAdmitResult{ + Admit: false, + Reason: "ProcMount", + Message: fmt.Sprintf("Cannot enforce ProcMount: docker runtime API version %q must be greater than or equal to 1.38", rversion.String()), + } + } + + return PodAdmitResult{Admit: true} +} + +func procMountIsDefault(pod *v1.Pod) bool { + // Iterate over pod containers and check if we are using the DefaultProcMountType + // for all containers. 
+ for _, c := range pod.Spec.Containers { + if c.SecurityContext != nil { + if c.SecurityContext.ProcMount != nil && *c.SecurityContext.ProcMount != v1.DefaultProcMount { + return false + } + } + } + + return true +} diff --git a/pkg/security/podsecuritypolicy/provider.go b/pkg/security/podsecuritypolicy/provider.go index da82327f32..90d5f967b2 100644 --- a/pkg/security/podsecuritypolicy/provider.go +++ b/pkg/security/podsecuritypolicy/provider.go @@ -289,6 +289,21 @@ func (s *simpleProvider) ValidateContainer(pod *api.Pod, container *api.Containe allErrs = append(allErrs, field.Invalid(scPath.Child("privileged"), *privileged, "Privileged containers are not allowed")) } + procMount := sc.ProcMount() + allowedProcMounts := s.psp.Spec.AllowedProcMountTypes + if len(allowedProcMounts) == 0 { + allowedProcMounts = []api.ProcMountType{api.DefaultProcMount} + } + foundProcMountType := false + for _, pm := range allowedProcMounts { + if pm == procMount { + foundProcMountType = true + } + } + if !foundProcMountType { + allErrs = append(allErrs, field.Invalid(scPath.Child("procMount"), procMount, "ProcMountType is not allowed")) + } + allErrs = append(allErrs, s.strategies.CapabilitiesStrategy.Validate(scPath.Child("capabilities"), pod, container, sc.Capabilities())...) allErrs = append(allErrs, s.hasInvalidHostPort(container, containerPath)...) 
diff --git a/pkg/security/podsecuritypolicy/provider_test.go b/pkg/security/podsecuritypolicy/provider_test.go index 4ef7129ea6..e774f70f43 100644 --- a/pkg/security/podsecuritypolicy/provider_test.go +++ b/pkg/security/podsecuritypolicy/provider_test.go @@ -485,6 +485,10 @@ func TestValidateContainerFailures(t *testing.T) { var priv bool = true failPrivPod.Spec.Containers[0].SecurityContext.Privileged = &priv + failProcMountPod := defaultPod() + failProcMountPod.Spec.Containers[0].SecurityContext.ProcMount = new(api.ProcMountType) + *failProcMountPod.Spec.Containers[0].SecurityContext.ProcMount = api.UnmaskedProcMount + failCapsPod := defaultPod() failCapsPod.Spec.Containers[0].SecurityContext.Capabilities = &api.Capabilities{ Add: []api.Capability{"foo"}, @@ -540,6 +544,11 @@ func TestValidateContainerFailures(t *testing.T) { psp: defaultPSP(), expectedError: "Privileged containers are not allowed", }, + "failProcMountPSP": { + pod: failProcMountPod, + psp: defaultPSP(), + expectedError: "ProcMountType is not allowed", + }, "failCapsPSP": { pod: failCapsPod, psp: defaultPSP(), diff --git a/pkg/securitycontext/accessors.go b/pkg/securitycontext/accessors.go index 98ac6e0b92..cf372f2866 100644 --- a/pkg/securitycontext/accessors.go +++ b/pkg/securitycontext/accessors.go @@ -188,6 +188,7 @@ func (w *podSecurityContextWrapper) SetFSGroup(v *int64) { type ContainerSecurityContextAccessor interface { Capabilities() *api.Capabilities Privileged() *bool + ProcMount() api.ProcMountType SELinuxOptions() *api.SELinuxOptions RunAsUser() *int64 RunAsNonRoot() *bool @@ -257,6 +258,15 @@ func (w *containerSecurityContextWrapper) SetPrivileged(v *bool) { w.ensureContainerSC() w.containerSC.Privileged = v } +func (w *containerSecurityContextWrapper) ProcMount() api.ProcMountType { + if w.containerSC == nil { + return api.DefaultProcMount + } + if w.containerSC.ProcMount == nil { + return api.DefaultProcMount + } + return *w.containerSC.ProcMount +} func (w 
*containerSecurityContextWrapper) SELinuxOptions() *api.SELinuxOptions { if w.containerSC == nil { return nil @@ -356,6 +366,9 @@ func (w *effectiveContainerSecurityContextWrapper) SetPrivileged(v *bool) { w.containerSC.SetPrivileged(v) } } +func (w *effectiveContainerSecurityContextWrapper) ProcMount() api.ProcMountType { + return w.containerSC.ProcMount() +} func (w *effectiveContainerSecurityContextWrapper) SELinuxOptions() *api.SELinuxOptions { if v := w.containerSC.SELinuxOptions(); v != nil { return v diff --git a/pkg/securitycontext/fake.go b/pkg/securitycontext/fake.go index 975445bab0..3303db2126 100644 --- a/pkg/securitycontext/fake.go +++ b/pkg/securitycontext/fake.go @@ -35,8 +35,10 @@ func ValidSecurityContextWithContainerDefaults() *v1.SecurityContext { // empty container defaults. Used for testing. func ValidInternalSecurityContextWithContainerDefaults() *api.SecurityContext { priv := false + dpm := api.DefaultProcMount return &api.SecurityContext{ Capabilities: &api.Capabilities{}, Privileged: &priv, + ProcMount: &dpm, } } diff --git a/pkg/securitycontext/util.go b/pkg/securitycontext/util.go index 5ade558814..07489baf56 100644 --- a/pkg/securitycontext/util.go +++ b/pkg/securitycontext/util.go @@ -72,7 +72,7 @@ func DetermineEffectiveSecurityContext(pod *v1.Pod, container *v1.Container) *v1 containerSc := container.SecurityContext if effectiveSc == nil && containerSc == nil { - return nil + return &v1.SecurityContext{} } if effectiveSc != nil && containerSc == nil { return effectiveSc @@ -121,6 +121,11 @@ func DetermineEffectiveSecurityContext(pod *v1.Pod, container *v1.Container) *v1 *effectiveSc.AllowPrivilegeEscalation = *containerSc.AllowPrivilegeEscalation } + if containerSc.ProcMount != nil { + effectiveSc.ProcMount = new(v1.ProcMountType) + *effectiveSc.ProcMount = *containerSc.ProcMount + } + return effectiveSc } @@ -167,3 +172,52 @@ func AddNoNewPrivileges(sc *v1.SecurityContext) bool { // handle the case where 
defaultAllowPrivilegeEscalation is false or the user explicitly set allowPrivilegeEscalation to true/false return !*sc.AllowPrivilegeEscalation } + +var ( + // These *must* be kept in sync with moby/moby. + // https://github.com/moby/moby/blob/master/oci/defaults.go#L116-L134 + // @jessfraz will watch changes to those files upstream. + defaultMaskedPaths = []string{ + "/proc/acpi", + "/proc/kcore", + "/proc/keys", + "/proc/latency_stats", + "/proc/timer_list", + "/proc/timer_stats", + "/proc/sched_debug", + "/proc/scsi", + "/sys/firmware", + } + defaultReadonlyPaths = []string{ + "/proc/asound", + "/proc/bus", + "/proc/fs", + "/proc/irq", + "/proc/sys", + "/proc/sysrq-trigger", + } +) + +// ConvertToRuntimeMaskedPaths converts the ProcMountType to the specified or default +// masked paths. +func ConvertToRuntimeMaskedPaths(opt *v1.ProcMountType) []string { + if opt != nil && *opt == v1.UnmaskedProcMount { + // Unmasked proc mount should have no paths set as masked. + return []string{} + } + + // Otherwise, add the default masked paths to the runtime security context. + return defaultMaskedPaths +} + +// ConvertToRuntimeReadonlyPaths converts the ProcMountType to the specified or default +// readonly paths. +func ConvertToRuntimeReadonlyPaths(opt *v1.ProcMountType) []string { + if opt != nil && *opt == v1.UnmaskedProcMount { + // Unmasked proc mount should have no paths set as readonly. + return []string{} + } + + // Otherwise, add the default readonly paths to the runtime security context. + return defaultReadonlyPaths +} diff --git a/pkg/securitycontext/util_test.go b/pkg/securitycontext/util_test.go index 475356c407..21eafb51e4 100644 --- a/pkg/securitycontext/util_test.go +++ b/pkg/securitycontext/util_test.go @@ -17,6 +17,7 @@ limitations under the License. 
package securitycontext import ( + "reflect" "testing" "k8s.io/api/core/v1" @@ -123,3 +124,61 @@ func TestAddNoNewPrivileges(t *testing.T) { } } } + +func TestConvertToRuntimeMaskedPaths(t *testing.T) { + dPM := v1.DefaultProcMount + uPM := v1.UnmaskedProcMount + tests := map[string]struct { + pm *v1.ProcMountType + expect []string + }{ + "procMount nil": { + pm: nil, + expect: defaultMaskedPaths, + }, + "procMount default": { + pm: &dPM, + expect: defaultMaskedPaths, + }, + "procMount unmasked": { + pm: &uPM, + expect: []string{}, + }, + } + + for k, v := range tests { + actual := ConvertToRuntimeMaskedPaths(v.pm) + if !reflect.DeepEqual(actual, v.expect) { + t.Errorf("%s failed, expected %#v but received %#v", k, v.expect, actual) + } + } +} + +func TestConvertToRuntimeReadonlyPaths(t *testing.T) { + dPM := v1.DefaultProcMount + uPM := v1.UnmaskedProcMount + tests := map[string]struct { + pm *v1.ProcMountType + expect []string + }{ + "procMount nil": { + pm: nil, + expect: defaultReadonlyPaths, + }, + "procMount default": { + pm: &dPM, + expect: defaultReadonlyPaths, + }, + "procMount unmasked": { + pm: &uPM, + expect: []string{}, + }, + } + + for k, v := range tests { + actual := ConvertToRuntimeReadonlyPaths(v.pm) + if !reflect.DeepEqual(actual, v.expect) { + t.Errorf("%s failed, expected %#v but received %#v", k, v.expect, actual) + } + } +}