From 2597a1d97ef4d8f54b1ca661453e32794b756909 Mon Sep 17 00:00:00 2001 From: Robert Krawitz Date: Fri, 1 Feb 2019 17:15:08 -0500 Subject: [PATCH] Implement SupportNodePidsLimit, hand-tested --- cmd/kubelet/app/BUILD | 1 + cmd/kubelet/app/options/options.go | 4 +- cmd/kubelet/app/server.go | 21 +++--- pkg/features/kube_features.go | 7 ++ pkg/kubelet/apis/config/types.go | 4 +- pkg/kubelet/cm/BUILD | 1 + pkg/kubelet/cm/cgroup_manager_linux.go | 19 +++--- pkg/kubelet/cm/container_manager_linux.go | 14 ++++ .../cm/node_container_manager_linux.go | 25 +++++-- pkg/kubelet/cm/pod_container_manager_linux.go | 2 +- pkg/kubelet/cm/types.go | 2 +- pkg/kubelet/stats/BUILD | 8 ++- pkg/kubelet/stats/pidlimit/BUILD | 65 +++++++++++++++++++ pkg/kubelet/stats/pidlimit/pidlimit.go | 26 ++++++++ .../pidlimit_linux.go} | 5 +- .../pidlimit_unsupported.go} | 5 +- pkg/kubelet/stats/stats_provider.go | 6 ++ test/e2e_node/BUILD | 1 + test/e2e_node/node_container_manager_test.go | 36 ++++++++-- 19 files changed, 211 insertions(+), 41 deletions(-) create mode 100644 pkg/kubelet/stats/pidlimit/BUILD create mode 100644 pkg/kubelet/stats/pidlimit/pidlimit.go rename pkg/kubelet/stats/{stats_provider_linux.go => pidlimit/pidlimit_linux.go} (89%) rename pkg/kubelet/stats/{stats_provider_unsupported.go => pidlimit/pidlimit_unsupported.go} (83%) diff --git a/cmd/kubelet/app/BUILD b/cmd/kubelet/app/BUILD index 6a9a8b2a4a..a2140643b9 100644 --- a/cmd/kubelet/app/BUILD +++ b/cmd/kubelet/app/BUILD @@ -68,6 +68,7 @@ go_library( "//pkg/kubelet/kubeletconfig/configfiles:go_default_library", "//pkg/kubelet/server:go_default_library", "//pkg/kubelet/server/streaming:go_default_library", + "//pkg/kubelet/stats/pidlimit:go_default_library", "//pkg/kubelet/types:go_default_library", "//pkg/util/configz:go_default_library", "//pkg/util/filesystem:go_default_library", diff --git a/cmd/kubelet/app/options/options.go b/cmd/kubelet/app/options/options.go index f435215bd0..7a5db1769a 100644 --- a/cmd/kubelet/app/options/options.go +++ b/cmd/kubelet/app/options/options.go @@ -588,8 +588,8 @@ func AddKubeletConfigFlags(mainfs *pflag.FlagSet, c *kubeletconfig.KubeletConfig fs.BoolVar(&c.ProtectKernelDefaults, "protect-kernel-defaults", c.ProtectKernelDefaults, "Default kubelet behaviour for kernel tuning. If set, kubelet errors if any of kernel tunables is different than kubelet defaults.") // Node Allocatable Flags - fs.Var(flag.NewMapStringString(&c.SystemReserved), "system-reserved", "A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=500Mi,ephemeral-storage=1Gi) pairs that describe resources reserved for non-kubernetes components. Currently only cpu and memory are supported. See http://kubernetes.io/docs/user-guide/compute-resources for more detail. [default=none]") - fs.Var(flag.NewMapStringString(&c.KubeReserved), "kube-reserved", "A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=500Mi,ephemeral-storage=1Gi) pairs that describe resources reserved for kubernetes system components. Currently cpu, memory and local ephemeral storage for root file system are supported. See http://kubernetes.io/docs/user-guide/compute-resources for more detail. [default=none]") + fs.Var(flag.NewMapStringString(&c.SystemReserved), "system-reserved", "A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=500Mi,ephemeral-storage=1Gi,pid=100) pairs that describe resources reserved for non-kubernetes components. Currently only cpu, memory, and pid (process IDs) are supported. See http://kubernetes.io/docs/user-guide/compute-resources for more detail. [default=none]") + fs.Var(flag.NewMapStringString(&c.KubeReserved), "kube-reserved", "A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=500Mi,ephemeral-storage=1Gi,pid=100) pairs that describe resources reserved for kubernetes system components. Currently cpu, memory, local ephemeral storage for root file system, and pid (process IDs) are supported. See http://kubernetes.io/docs/user-guide/compute-resources for more detail. [default=none]") fs.StringSliceVar(&c.EnforceNodeAllocatable, "enforce-node-allocatable", c.EnforceNodeAllocatable, "A comma separated list of levels of node allocatable enforcement to be enforced by kubelet. Acceptable options are 'none', 'pods', 'system-reserved', and 'kube-reserved'. If the latter two options are specified, '--system-reserved-cgroup' and '--kube-reserved-cgroup' must also be set, respectively. If 'none' is specified, no additional options should be set. See https://kubernetes.io/docs/tasks/administer-cluster/reserve-compute-resources/ for more details.") fs.StringVar(&c.SystemReservedCgroup, "system-reserved-cgroup", c.SystemReservedCgroup, "Absolute name of the top level cgroup that is used to manage non-kubernetes components for which compute resources were reserved via '--system-reserved' flag. Ex. '/system-reserved'. [default='']") fs.StringVar(&c.KubeReservedCgroup, "kube-reserved-cgroup", c.KubeReservedCgroup, "Absolute name of the top level cgroup that is used to manage kubernetes components for which compute resources were reserved via '--kube-reserved' flag. Ex. '/kube-reserved'. [default='']") diff --git a/cmd/kubelet/app/server.go b/cmd/kubelet/app/server.go index dbe21a3b00..414ef2a5f3 100644 --- a/cmd/kubelet/app/server.go +++ b/cmd/kubelet/app/server.go @@ -83,6 +83,7 @@ import ( "k8s.io/kubernetes/pkg/kubelet/kubeletconfig/configfiles" "k8s.io/kubernetes/pkg/kubelet/server" "k8s.io/kubernetes/pkg/kubelet/server/streaming" + "k8s.io/kubernetes/pkg/kubelet/stats/pidlimit" kubetypes "k8s.io/kubernetes/pkg/kubelet/types" "k8s.io/kubernetes/pkg/util/configz" utilfs "k8s.io/kubernetes/pkg/util/filesystem" @@ -1150,16 +1151,18 @@ func parseResourceList(m map[string]string) (v1.ResourceList, error) { rl := make(v1.ResourceList) for k, v := range m { switch v1.ResourceName(k) { - // CPU, memory and local storage resources are supported. - case v1.ResourceCPU, v1.ResourceMemory, v1.ResourceEphemeralStorage: - q, err := resource.ParseQuantity(v) - if err != nil { - return nil, err + // CPU, memory, local storage, and PID resources are supported. + case v1.ResourceCPU, v1.ResourceMemory, v1.ResourceEphemeralStorage, pidlimit.PIDs: + if v1.ResourceName(k) != pidlimit.PIDs || utilfeature.DefaultFeatureGate.Enabled(features.SupportNodePidsLimit) { + q, err := resource.ParseQuantity(v) + if err != nil { + return nil, err + } + if q.Sign() == -1 { + return nil, fmt.Errorf("resource quantity for %q cannot be negative: %v", k, v) + } + rl[v1.ResourceName(k)] = q } - if q.Sign() == -1 { - return nil, fmt.Errorf("resource quantity for %q cannot be negative: %v", k, v) - } - rl[v1.ResourceName(k)] = q default: return nil, fmt.Errorf("cannot reserve %q resource", k) } diff --git a/pkg/features/kube_features.go b/pkg/features/kube_features.go index d4b2ede0c7..b689c5f52e 100644 --- a/pkg/features/kube_features.go +++ b/pkg/features/kube_features.go @@ -405,6 +405,12 @@ const ( // // Enables the AWS EBS in-tree driver to AWS EBS CSI Driver migration feature. CSIMigrationAWS utilfeature.Feature = "CSIMigrationAWS" + + // owner: @RobertKrawitz + // alpha: v1.14 + // + // Implement support for limiting pids in nodes + SupportNodePidsLimit utilfeature.Feature = "SupportNodePidsLimit" ) func init() { @@ -450,6 +456,7 @@ var defaultKubernetesFeatureGates = map[utilfeature.Feature]utilfeature.FeatureS ResourceLimitsPriorityFunction: {Default: false, PreRelease: utilfeature.Alpha}, SupportIPVSProxyMode: {Default: true, PreRelease: utilfeature.GA}, SupportPodPidsLimit: {Default: true, PreRelease: utilfeature.Beta}, + SupportNodePidsLimit: {Default: false, PreRelease: utilfeature.Alpha}, HyperVContainer: {Default: false, PreRelease: utilfeature.Alpha}, ScheduleDaemonSetPods: {Default: true, PreRelease: utilfeature.Beta}, TokenRequest: {Default: true, PreRelease: utilfeature.Beta}, diff --git a/pkg/kubelet/apis/config/types.go b/pkg/kubelet/apis/config/types.go index 7061a62e0c..079cb19253 100644 --- a/pkg/kubelet/apis/config/types.go +++ b/pkg/kubelet/apis/config/types.go @@ -291,12 +291,12 @@ type KubeletConfiguration struct { /* the following fields are meant for Node Allocatable */ - // A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=150G) pairs + // A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=150G,pids=100) pairs // that describe resources reserved for non-kubernetes components. // Currently only cpu and memory are supported. // See http://kubernetes.io/docs/user-guide/compute-resources for more detail. SystemReserved map[string]string - // A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=150G) pairs + // A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=150G,pids=100) pairs // that describe resources reserved for kubernetes system components. // Currently cpu, memory and local ephemeral storage for root file system are supported. // See http://kubernetes.io/docs/user-guide/compute-resources for more detail. diff --git a/pkg/kubelet/cm/BUILD b/pkg/kubelet/cm/BUILD index 353b7c641a..ee61d44d63 100644 --- a/pkg/kubelet/cm/BUILD +++ b/pkg/kubelet/cm/BUILD @@ -73,6 +73,7 @@ go_library( "//pkg/kubelet/events:go_default_library", "//pkg/kubelet/metrics:go_default_library", "//pkg/kubelet/qos:go_default_library", + "//pkg/kubelet/stats/pidlimit:go_default_library", "//pkg/kubelet/types:go_default_library", "//pkg/util/mount:go_default_library", "//pkg/util/oom:go_default_library", diff --git a/pkg/kubelet/cm/cgroup_manager_linux.go b/pkg/kubelet/cm/cgroup_manager_linux.go index 2eb2a167f7..3bb09f0ddd 100644 --- a/pkg/kubelet/cm/cgroup_manager_linux.go +++ b/pkg/kubelet/cm/cgroup_manager_linux.go @@ -257,7 +257,7 @@ func (m *cgroupManagerImpl) Exists(name CgroupName) bool { // in https://github.com/opencontainers/runc/issues/1440 // once resolved, we can remove this code. whitelistControllers := sets.NewString("cpu", "cpuacct", "cpuset", "memory", "systemd") - if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportPodPidsLimit) { + if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportPodPidsLimit) || utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportNodePidsLimit) { whitelistControllers.Insert("pids") } var missingPaths []string @@ -325,10 +325,11 @@ func getSupportedSubsystems() map[subsystem]bool { supportedSubsystems := map[subsystem]bool{ &cgroupfs.MemoryGroup{}: true, &cgroupfs.CpuGroup{}: true, + &cgroupfs.PidsGroup{}: true, } // not all hosts support hugetlb cgroup, and in the absent of hugetlb, we will fail silently by reporting no capacity. supportedSubsystems[&cgroupfs.HugetlbGroup{}] = false - if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportPodPidsLimit) { + if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportPodPidsLimit) || utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportNodePidsLimit) { supportedSubsystems[&cgroupfs.PidsGroup{}] = true } return supportedSubsystems @@ -377,9 +378,9 @@ func (m *cgroupManagerImpl) toResources(resourceConfig *ResourceConfig) *libcont if resourceConfig.CpuPeriod != nil { resources.CpuPeriod = *resourceConfig.CpuPeriod } - if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportPodPidsLimit) { - if resourceConfig.PodPidsLimit != nil { - resources.PidsLimit = *resourceConfig.PodPidsLimit + if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportPodPidsLimit) || utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportNodePidsLimit) { + if resourceConfig.PidsLimit != nil { + resources.PidsLimit = *resourceConfig.PidsLimit } } // if huge pages are enabled, we set them in libcontainer @@ -431,8 +432,8 @@ func (m *cgroupManagerImpl) Update(cgroupConfig *CgroupConfig) error { libcontainerCgroupConfig.Path = cgroupConfig.Name.ToCgroupfs() } - if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportPodPidsLimit) && cgroupConfig.ResourceParameters != nil && cgroupConfig.ResourceParameters.PodPidsLimit != nil { - libcontainerCgroupConfig.PidsLimit = *cgroupConfig.ResourceParameters.PodPidsLimit + if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportPodPidsLimit) && cgroupConfig.ResourceParameters != nil && cgroupConfig.ResourceParameters.PidsLimit != nil { + libcontainerCgroupConfig.PidsLimit = *cgroupConfig.ResourceParameters.PidsLimit } if err := setSupportedSubsystems(libcontainerCgroupConfig); err != nil { @@ -461,8 +462,8 @@ func (m *cgroupManagerImpl) Create(cgroupConfig *CgroupConfig) error { libcontainerCgroupConfig.Path = cgroupConfig.Name.ToCgroupfs() } - if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportPodPidsLimit) && cgroupConfig.ResourceParameters != nil && cgroupConfig.ResourceParameters.PodPidsLimit != nil { - libcontainerCgroupConfig.PidsLimit = *cgroupConfig.ResourceParameters.PodPidsLimit + if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportPodPidsLimit) && cgroupConfig.ResourceParameters != nil && cgroupConfig.ResourceParameters.PidsLimit != nil { + libcontainerCgroupConfig.PidsLimit = *cgroupConfig.ResourceParameters.PidsLimit } // get the manager with the specified cgroup configuration diff --git a/pkg/kubelet/cm/container_manager_linux.go b/pkg/kubelet/cm/container_manager_linux.go index af7a779173..cf84736b8e 100644 --- a/pkg/kubelet/cm/container_manager_linux.go +++ b/pkg/kubelet/cm/container_manager_linux.go @@ -53,6 +53,7 @@ import ( kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" "k8s.io/kubernetes/pkg/kubelet/lifecycle" "k8s.io/kubernetes/pkg/kubelet/qos" + "k8s.io/kubernetes/pkg/kubelet/stats/pidlimit" "k8s.io/kubernetes/pkg/kubelet/status" "k8s.io/kubernetes/pkg/kubelet/util/pluginwatcher" schedulernodeinfo "k8s.io/kubernetes/pkg/scheduler/nodeinfo" @@ -123,6 +124,8 @@ type containerManagerImpl struct { cgroupManager CgroupManager // Capacity of this node. capacity v1.ResourceList + // Capacity of this node, including internal resources. + internalCapacity v1.ResourceList // Absolute cgroupfs path to a cgroup that Kubelet needs to place all pods under. // This path include a top level container for enforcing Node Allocatable. cgroupRoot CgroupName @@ -219,6 +222,7 @@ func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.I } var capacity = v1.ResourceList{} + var internalCapacity = v1.ResourceList{} // It is safe to invoke `MachineInfo` on cAdvisor before logically initializing cAdvisor here because // machine info is computed and cached once as part of cAdvisor object creation. // But `RootFsInfo` and `ImagesFsInfo` are not available at this moment so they will be called later during manager starts @@ -227,6 +231,15 @@ func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.I return nil, err } capacity = cadvisor.CapacityFromMachineInfo(machineInfo) + for k, v := range capacity { + internalCapacity[k] = v + } + pidlimits, err := pidlimit.Stats() + if err == nil && pidlimits != nil && pidlimits.MaxPID != nil { + internalCapacity[pidlimit.PIDs] = *resource.NewQuantity( + int64(*pidlimits.MaxPID), + resource.DecimalSI) + } // Turn CgroupRoot from a string (in cgroupfs path format) to internal CgroupName cgroupRoot := ParseCgroupfsToCgroupName(nodeConfig.CgroupRoot) @@ -264,6 +277,7 @@ func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.I subsystems: subsystems, cgroupManager: cgroupManager, capacity: capacity, + internalCapacity: internalCapacity, cgroupRoot: cgroupRoot, recorder: recorder, qosContainerManager: qosContainerManager, diff --git a/pkg/kubelet/cm/node_container_manager_linux.go b/pkg/kubelet/cm/node_container_manager_linux.go index 0820e78381..821fee6f0f 100644 --- a/pkg/kubelet/cm/node_container_manager_linux.go +++ b/pkg/kubelet/cm/node_container_manager_linux.go @@ -28,6 +28,7 @@ import ( "k8s.io/apimachinery/pkg/types" "k8s.io/klog" "k8s.io/kubernetes/pkg/kubelet/events" + "k8s.io/kubernetes/pkg/kubelet/stats/pidlimit" kubetypes "k8s.io/kubernetes/pkg/kubelet/types" ) @@ -40,7 +41,7 @@ func (cm *containerManagerImpl) createNodeAllocatableCgroups() error { cgroupConfig := &CgroupConfig{ Name: cm.cgroupRoot, // The default limits for cpu shares can be very low which can lead to CPU starvation for pods. - ResourceParameters: getCgroupConfig(cm.capacity), + ResourceParameters: getCgroupConfig(cm.internalCapacity), } if cm.cgroupManager.Exists(cgroupConfig.Name) { return nil @@ -58,10 +59,10 @@ func (cm *containerManagerImpl) enforceNodeAllocatableCgroups() error { // We need to update limits on node allocatable cgroup no matter what because // default cpu shares on cgroups are low and can cause cpu starvation. - nodeAllocatable := cm.capacity + nodeAllocatable := cm.internalCapacity // Use Node Allocatable limits instead of capacity if the user requested enforcing node allocatable. if cm.CgroupsPerQOS && nc.EnforceNodeAllocatable.Has(kubetypes.NodeAllocatableEnforcementKey) { - nodeAllocatable = cm.getNodeAllocatableAbsolute() + nodeAllocatable = cm.getNodeAllocatableInternalAbsolute() } klog.V(4).Infof("Attempting to enforce Node Allocatable with config: %+v", nc) @@ -130,7 +131,7 @@ func enforceExistingCgroup(cgroupManager CgroupManager, cName CgroupName, rl v1. if cgroupConfig.ResourceParameters == nil { return fmt.Errorf("%q cgroup is not config properly", cgroupConfig.Name) } - klog.V(4).Infof("Enforcing limits on cgroup %q with %d cpu shares and %d bytes of memory", cName, cgroupConfig.ResourceParameters.CpuShares, cgroupConfig.ResourceParameters.Memory) + klog.V(4).Infof("Enforcing limits on cgroup %q with %d cpu shares, %d bytes of memory, and %d processes", cName, cgroupConfig.ResourceParameters.CpuShares, cgroupConfig.ResourceParameters.Memory, cgroupConfig.ResourceParameters.PidsLimit) if !cgroupManager.Exists(cgroupConfig.Name) { return fmt.Errorf("%q cgroup does not exist", cgroupConfig.Name) } @@ -157,6 +158,10 @@ func getCgroupConfig(rl v1.ResourceList) *ResourceConfig { val := MilliCPUToShares(q.MilliValue()) rc.CpuShares = &val } + if q, exists := rl[pidlimit.PIDs]; exists { + val := q.Value() + rc.PidsLimit = &val + } rc.HugePageLimit = HugePageLimits(rl) return &rc @@ -166,8 +171,12 @@ func getCgroupConfig(rl v1.ResourceList) *ResourceConfig { // Note that not all resources that are available on the node are included in the returned list of resources. // Returns a ResourceList. func (cm *containerManagerImpl) getNodeAllocatableAbsolute() v1.ResourceList { + return cm.getNodeAllocatableAbsoluteImpl(cm.capacity) +} + +func (cm *containerManagerImpl) getNodeAllocatableAbsoluteImpl(capacity v1.ResourceList) v1.ResourceList { result := make(v1.ResourceList) - for k, v := range cm.capacity { + for k, v := range capacity { value := *(v.Copy()) if cm.NodeConfig.SystemReserved != nil { value.Sub(cm.NodeConfig.SystemReserved[k]) @@ -182,7 +191,13 @@ func (cm *containerManagerImpl) getNodeAllocatableAbsolute() v1.ResourceList { result[k] = value } return result +} +// getNodeAllocatableInternalAbsolute is similar to getNodeAllocatableAbsolute except that +// it also includes internal resources (currently process IDs). It is intended for setting +// up top level cgroups only. +func (cm *containerManagerImpl) getNodeAllocatableInternalAbsolute() v1.ResourceList { + return cm.getNodeAllocatableAbsoluteImpl(cm.internalCapacity) } // GetNodeAllocatableReservation returns amount of compute or storage resource that have to be reserved on this node from scheduling. diff --git a/pkg/kubelet/cm/pod_container_manager_linux.go b/pkg/kubelet/cm/pod_container_manager_linux.go index b91c5babe9..f0bf4df880 100644 --- a/pkg/kubelet/cm/pod_container_manager_linux.go +++ b/pkg/kubelet/cm/pod_container_manager_linux.go @@ -87,7 +87,7 @@ func (m *podContainerManagerImpl) EnsureExists(pod *v1.Pod) error { ResourceParameters: ResourceConfigForPod(pod, m.enforceCPULimits, m.cpuCFSQuotaPeriod), } if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportPodPidsLimit) && m.podPidsLimit > 0 { - containerConfig.ResourceParameters.PodPidsLimit = &m.podPidsLimit + containerConfig.ResourceParameters.PidsLimit = &m.podPidsLimit } if err := m.cgroupManager.Create(containerConfig); err != nil { return fmt.Errorf("failed to create container for %v : %v", podContainerName, err) diff --git a/pkg/kubelet/cm/types.go b/pkg/kubelet/cm/types.go index 2e60d8a8fd..e601179744 100644 --- a/pkg/kubelet/cm/types.go +++ b/pkg/kubelet/cm/types.go @@ -34,7 +34,7 @@ type ResourceConfig struct { // HugePageLimit map from page size (in bytes) to limit (in bytes) HugePageLimit map[int64]int64 // Maximum number of pids - PodPidsLimit *int64 + PidsLimit *int64 } // CgroupName is the abstract name of a cgroup prior to any driver specific conversion. diff --git a/pkg/kubelet/stats/BUILD b/pkg/kubelet/stats/BUILD index ac9d306d2f..dd2c9dd713 100644 --- a/pkg/kubelet/stats/BUILD +++ b/pkg/kubelet/stats/BUILD @@ -10,8 +10,6 @@ go_library( "helper.go", "log_metrics_provider.go", "stats_provider.go", - "stats_provider_linux.go", - "stats_provider_unsupported.go", ], importpath = "k8s.io/kubernetes/pkg/kubelet/stats", visibility = ["//visibility:public"], @@ -26,6 +24,7 @@ go_library( "//pkg/kubelet/leaky:go_default_library", "//pkg/kubelet/pod:go_default_library", "//pkg/kubelet/server/stats:go_default_library", + "//pkg/kubelet/stats/pidlimit:go_default_library", "//pkg/kubelet/status:go_default_library", "//pkg/kubelet/types:go_default_library", "//pkg/volume:go_default_library", @@ -52,7 +51,10 @@ filegroup( filegroup( name = "all-srcs", - srcs = [":package-srcs"], + srcs = [ + ":package-srcs", + "//pkg/kubelet/stats/pidlimit:all-srcs", + ], tags = ["automanaged"], visibility = ["//visibility:public"], ) diff --git a/pkg/kubelet/stats/pidlimit/BUILD b/pkg/kubelet/stats/pidlimit/BUILD new file mode 100644 index 0000000000..d88396b7d6 --- /dev/null +++ b/pkg/kubelet/stats/pidlimit/BUILD @@ -0,0 +1,65 @@ +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "go_default_library", + srcs = [ + "pidlimit.go", + "pidlimit_linux.go", + "pidlimit_unsupported.go", + ], + importpath = "k8s.io/kubernetes/pkg/kubelet/stats/pidlimit", + visibility = ["//visibility:public"], + deps = [ + "//staging/src/k8s.io/api/core/v1:go_default_library", + ] + select({ + "@io_bazel_rules_go//go/platform:android": [ + "//pkg/kubelet/apis/stats/v1alpha1:go_default_library", + ], + "@io_bazel_rules_go//go/platform:darwin": [ + "//pkg/kubelet/apis/stats/v1alpha1:go_default_library", + ], + "@io_bazel_rules_go//go/platform:dragonfly": [ + "//pkg/kubelet/apis/stats/v1alpha1:go_default_library", + ], + "@io_bazel_rules_go//go/platform:freebsd": [ + "//pkg/kubelet/apis/stats/v1alpha1:go_default_library", + ], + "@io_bazel_rules_go//go/platform:linux": [ + "//pkg/kubelet/apis/stats/v1alpha1:go_default_library", + "//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library", + ], + "@io_bazel_rules_go//go/platform:nacl": [ + "//pkg/kubelet/apis/stats/v1alpha1:go_default_library", + ], + "@io_bazel_rules_go//go/platform:netbsd": [ + "//pkg/kubelet/apis/stats/v1alpha1:go_default_library", + ], + "@io_bazel_rules_go//go/platform:openbsd": [ + "//pkg/kubelet/apis/stats/v1alpha1:go_default_library", + ], + "@io_bazel_rules_go//go/platform:plan9": [ + "//pkg/kubelet/apis/stats/v1alpha1:go_default_library", + ], + "@io_bazel_rules_go//go/platform:solaris": [ + "//pkg/kubelet/apis/stats/v1alpha1:go_default_library", + ], + "@io_bazel_rules_go//go/platform:windows": [ + "//pkg/kubelet/apis/stats/v1alpha1:go_default_library", + ], + "//conditions:default": [], + }), +) + +filegroup( + name = "package-srcs", + srcs = glob(["**"]), + tags = ["automanaged"], + visibility = ["//visibility:private"], +) + +filegroup( + name = "all-srcs", + srcs = [":package-srcs"], + tags = ["automanaged"], + visibility = ["//visibility:public"], +) diff --git a/pkg/kubelet/stats/pidlimit/pidlimit.go b/pkg/kubelet/stats/pidlimit/pidlimit.go new file mode 100644 index 0000000000..7356fa9c41 --- /dev/null +++ b/pkg/kubelet/stats/pidlimit/pidlimit.go @@ -0,0 +1,26 @@ +/* +Copyright 2019 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package pidlimit + +import ( + "k8s.io/api/core/v1" +) + +const ( + // PIDs is the (internal) name for this resource + PIDs v1.ResourceName = "pid" +) diff --git a/pkg/kubelet/stats/stats_provider_linux.go b/pkg/kubelet/stats/pidlimit/pidlimit_linux.go similarity index 89% rename from pkg/kubelet/stats/stats_provider_linux.go rename to pkg/kubelet/stats/pidlimit/pidlimit_linux.go index f36890fc26..d537f48f8b 100644 --- a/pkg/kubelet/stats/stats_provider_linux.go +++ b/pkg/kubelet/stats/pidlimit/pidlimit_linux.go @@ -16,7 +16,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package stats +package pidlimit import ( "io/ioutil" @@ -28,7 +28,8 @@ import ( statsapi "k8s.io/kubernetes/pkg/kubelet/apis/stats/v1alpha1" ) -func (p *StatsProvider) RlimitStats() (*statsapi.RlimitStats, error) { +// Stats provides basic information about max and current process count +func Stats() (*statsapi.RlimitStats, error) { rlimit := &statsapi.RlimitStats{} if content, err := ioutil.ReadFile("/proc/sys/kernel/pid_max"); err == nil { diff --git a/pkg/kubelet/stats/stats_provider_unsupported.go b/pkg/kubelet/stats/pidlimit/pidlimit_unsupported.go similarity index 83% rename from pkg/kubelet/stats/stats_provider_unsupported.go rename to pkg/kubelet/stats/pidlimit/pidlimit_unsupported.go index a7b00d9db0..825c785220 100644 --- a/pkg/kubelet/stats/stats_provider_unsupported.go +++ b/pkg/kubelet/stats/pidlimit/pidlimit_unsupported.go @@ -16,12 +16,13 @@ See the License for the specific language governing permissions and limitations under the License. */ -package stats +package pidlimit import ( statsapi "k8s.io/kubernetes/pkg/kubelet/apis/stats/v1alpha1" ) -func (p *StatsProvider) RlimitStats() (*statsapi.RlimitStats, error) { +// Stats provides basic information about max and current process count +func Stats() (*statsapi.RlimitStats, error) { return nil, nil } diff --git a/pkg/kubelet/stats/stats_provider.go b/pkg/kubelet/stats/stats_provider.go index b06c7a8ad8..c0d98f900f 100644 --- a/pkg/kubelet/stats/stats_provider.go +++ b/pkg/kubelet/stats/stats_provider.go @@ -28,6 +28,7 @@ import ( kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" kubepod "k8s.io/kubernetes/pkg/kubelet/pod" "k8s.io/kubernetes/pkg/kubelet/server/stats" + "k8s.io/kubernetes/pkg/kubelet/stats/pidlimit" "k8s.io/kubernetes/pkg/kubelet/status" ) @@ -96,6 +97,11 @@ type rlimitStatsProvider interface { RlimitStats() (*statsapi.RlimitStats, error) } +// RlimitStats returns base information about process count +func (p *StatsProvider) RlimitStats() (*statsapi.RlimitStats, error) { + return pidlimit.Stats() +} + // GetCgroupStats returns the stats of the cgroup with the cgroupName. Note that // this function doesn't generate filesystem stats. func (p *StatsProvider) GetCgroupStats(cgroupName string, updateStats bool) (*statsapi.ContainerStats, *statsapi.NetworkStats, error) { diff --git a/test/e2e_node/BUILD b/test/e2e_node/BUILD index f9177afa7d..e643dc2953 100644 --- a/test/e2e_node/BUILD +++ b/test/e2e_node/BUILD @@ -165,6 +165,7 @@ go_test( ] + select({ "@io_bazel_rules_go//go/platform:linux": [ "//cmd/kubeadm/app/util/system:go_default_library", + "//pkg/kubelet/stats/pidlimit:go_default_library", "//staging/src/k8s.io/apimachinery/pkg/labels:go_default_library", "//staging/src/k8s.io/apimachinery/pkg/runtime:go_default_library", "//staging/src/k8s.io/apimachinery/pkg/util/yaml:go_default_library", diff --git a/test/e2e_node/node_container_manager_test.go b/test/e2e_node/node_container_manager_test.go index 904eec89a0..a25a5bd19f 100644 --- a/test/e2e_node/node_container_manager_test.go +++ b/test/e2e_node/node_container_manager_test.go @@ -29,8 +29,10 @@ import ( "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/kubernetes/pkg/features" kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config" "k8s.io/kubernetes/pkg/kubelet/cm" + "k8s.io/kubernetes/pkg/kubelet/stats/pidlimit" "k8s.io/kubernetes/test/e2e/framework" . "github.com/onsi/ginkgo" @@ -38,14 +40,17 @@ import ( ) func setDesiredConfiguration(initialConfig *kubeletconfig.KubeletConfiguration) { + initialConfig.FeatureGates[string(features.SupportNodePidsLimit)] = true initialConfig.EnforceNodeAllocatable = []string{"pods", kubeReservedCgroup, systemReservedCgroup} initialConfig.SystemReserved = map[string]string{ string(v1.ResourceCPU): "100m", string(v1.ResourceMemory): "100Mi", + string(pidlimit.PIDs): "1000", } initialConfig.KubeReserved = map[string]string{ string(v1.ResourceCPU): "100m", string(v1.ResourceMemory): "100Mi", + string(pidlimit.PIDs): "738", } initialConfig.EvictionHard = map[string]string{"memory.available": "100Mi"} // Necessary for allocatable cgroup creation. @@ -81,8 +86,8 @@ func expectFileValToEqual(filePath string, expectedValue, delta int64) error { return nil } -func getAllocatableLimits(cpu, memory string, capacity v1.ResourceList) (*resource.Quantity, *resource.Quantity) { - var allocatableCPU, allocatableMemory *resource.Quantity +func getAllocatableLimits(cpu, memory, pids string, capacity v1.ResourceList) (*resource.Quantity, *resource.Quantity, *resource.Quantity) { + var allocatableCPU, allocatableMemory, allocatablePIDs *resource.Quantity // Total cpu reservation is 200m. for k, v := range capacity { if k == v1.ResourceCPU { @@ -94,7 +99,13 @@ func getAllocatableLimits(cpu, memory string, capacity v1.ResourceList) (*resour allocatableMemory.Sub(resource.MustParse(memory)) } } - return allocatableCPU, allocatableMemory + // Process IDs are not a node allocatable, so we have to do this ad hoc + pidlimits, err := pidlimit.Stats() + if err == nil && pidlimits != nil && pidlimits.MaxPID != nil { + allocatablePIDs = resource.NewQuantity(int64(*pidlimits.MaxPID), resource.DecimalSI) + allocatablePIDs.Sub(resource.MustParse(pids)) + } + return allocatableCPU, allocatableMemory, allocatablePIDs } const ( @@ -189,7 +200,7 @@ func runTest(f *framework.Framework) error { } node := nodeList.Items[0] capacity := node.Status.Capacity - allocatableCPU, allocatableMemory := getAllocatableLimits("200m", "200Mi", capacity) + allocatableCPU, allocatableMemory, allocatablePIDs := getAllocatableLimits("200m", "200Mi", "1738", capacity) // Total Memory reservation is 200Mi excluding eviction thresholds. // Expect CPU shares on node allocatable cgroup to equal allocatable. if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["cpu"], "kubepods", "cpu.shares"), int64(cm.MilliCPUToShares(allocatableCPU.MilliValue())), 10); err != nil { @@ -199,11 +210,16 @@ func runTest(f *framework.Framework) error { if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["memory"], "kubepods", "memory.limit_in_bytes"), allocatableMemory.Value(), 0); err != nil { return err } + // Expect PID limit on node allocatable cgroup to equal allocatable. + if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["pids"], "kubepods", "pids.max"), allocatablePIDs.Value(), 0); err != nil { + return err + } // Check that Allocatable reported to scheduler includes eviction thresholds. schedulerAllocatable := node.Status.Allocatable // Memory allocatable should take into account eviction thresholds. - allocatableCPU, allocatableMemory = getAllocatableLimits("200m", "300Mi", capacity) + // Process IDs are not a scheduler resource and as such cannot be tested here. + allocatableCPU, allocatableMemory, _ = getAllocatableLimits("200m", "300Mi", "1738", capacity) // Expect allocatable to include all resources in capacity. if len(schedulerAllocatable) != len(capacity) { return fmt.Errorf("Expected all resources in capacity to be found in allocatable") @@ -232,6 +248,11 @@ func runTest(f *framework.Framework) error { if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["memory"], cgroupManager.Name(kubeReservedCgroupName), "memory.limit_in_bytes"), kubeReservedMemory.Value(), 0); err != nil { return err } + // Expect process ID limit kube reserved cgroup to equal configured value `738`. + kubeReservedPIDs := resource.MustParse(currentConfig.KubeReserved[string(pidlimit.PIDs)]) + if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["pids"], cgroupManager.Name(kubeReservedCgroupName), "pids.max"), kubeReservedPIDs.Value(), 0); err != nil { + return err + } systemReservedCgroupName := cm.NewCgroupName(cm.RootCgroupName, systemReservedCgroup) if !cgroupManager.Exists(systemReservedCgroupName) { return fmt.Errorf("Expected system reserved cgroup Does not exist") @@ -246,5 +267,10 @@ func runTest(f *framework.Framework) error { if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["memory"], cgroupManager.Name(systemReservedCgroupName), "memory.limit_in_bytes"), systemReservedMemory.Value(), 0); err != nil { return err } + // Expect process ID limit system reserved cgroup to equal configured value `1000`. + systemReservedPIDs := resource.MustParse(currentConfig.SystemReserved[string(pidlimit.PIDs)]) + if err := expectFileValToEqual(filepath.Join(subsystems.MountPoints["pids"], cgroupManager.Name(systemReservedCgroupName), "pids.max"), systemReservedPIDs.Value(), 0); err != nil { + return err + } return nil }