From 8b440c6424f3cee5a97fd4575b2e09524f142b70 Mon Sep 17 00:00:00 2001
From: David Ashpole
Date: Mon, 14 Jan 2019 09:41:36 -0800
Subject: [PATCH] Fix PidPressure, make it evict by priority, and add fork-bomb node e2e test

---
 pkg/kubelet/eviction/api/types.go        |  1 +
 pkg/kubelet/eviction/eviction_manager.go |  2 +-
 pkg/kubelet/eviction/helpers.go          | 11 +++
 pkg/kubelet/eviction/helpers_test.go     | 12 +--
 test/e2e_node/BUILD                      |  1 +
 test/e2e_node/eviction_test.go           | 93 ++++++++++++++++++++----
 6 files changed, 98 insertions(+), 22 deletions(-)

diff --git a/pkg/kubelet/eviction/api/types.go b/pkg/kubelet/eviction/api/types.go
index d32ba9b30f..1dd2a42a94 100644
--- a/pkg/kubelet/eviction/api/types.go
+++ b/pkg/kubelet/eviction/api/types.go
@@ -63,6 +63,7 @@ var OpForSignal = map[Signal]ThresholdOperator{
 	SignalNodeFsInodesFree:  OpLessThan,
 	SignalImageFsAvailable:  OpLessThan,
 	SignalImageFsInodesFree: OpLessThan,
+	SignalPIDAvailable:      OpLessThan,
 }
 
 // ThresholdValue is a value holder that abstracts literal versus percentage based quantity
diff --git a/pkg/kubelet/eviction/eviction_manager.go b/pkg/kubelet/eviction/eviction_manager.go
index 7e1ede9eeb..801fc8106f 100644
--- a/pkg/kubelet/eviction/eviction_manager.go
+++ b/pkg/kubelet/eviction/eviction_manager.go
@@ -157,7 +157,7 @@ func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAd
 		return lifecycle.PodAdmitResult{
 			Admit:   false,
 			Reason:  Reason,
-			Message: fmt.Sprintf(nodeLowMessageFmt, m.nodeConditions),
+			Message: fmt.Sprintf(nodeConditionMessageFmt, m.nodeConditions),
 		}
diff --git a/pkg/kubelet/eviction/helpers.go b/pkg/kubelet/eviction/helpers.go
index 6b5d7eb7db..e7c2ddcb19 100644
--- a/pkg/kubelet/eviction/helpers.go
+++ b/pkg/kubelet/eviction/helpers.go
@@ -40,6 +40,8 @@ const (
 	Reason = "Evicted"
 	// nodeLowMessageFmt is the message for evictions due to resource pressure.
 	nodeLowMessageFmt = "The node was low on resource: %v. "
+	// nodeConditionMessageFmt is the message for evictions due to resource pressure.
+	nodeConditionMessageFmt = "The node had condition: %v. "
 	// containerMessageFmt provides additional information for containers exceeding requests
 	containerMessageFmt = "Container %s was using %s, which exceeds its request of %s. "
 	// containerEphemeralStorageMessageFmt provides additional information for containers which have exceeded their ES limit
@@ -50,6 +52,8 @@
 	emptyDirMessageFmt = "Usage of EmptyDir volume %q exceeds the limit %q. "
 	// inodes, number. internal to this module, used to account for local disk inode consumption.
 	resourceInodes v1.ResourceName = "inodes"
+	// resourcePids, number. internal to this module, used to account for local pid consumption.
+	resourcePids v1.ResourceName = "pids"
 	// OffendingContainersKey is the key in eviction event annotations for the list of container names which exceeded their requests
 	OffendingContainersKey = "offending_containers"
 	// OffendingContainersUsageKey is the key in eviction event annotations for the list of usage of containers which exceeded their requests
@@ -84,6 +88,7 @@ func init() {
 	signalToResource[evictionapi.SignalImageFsInodesFree] = resourceInodes
 	signalToResource[evictionapi.SignalNodeFsAvailable] = v1.ResourceEphemeralStorage
 	signalToResource[evictionapi.SignalNodeFsInodesFree] = resourceInodes
+	signalToResource[evictionapi.SignalPIDAvailable] = resourcePids
 }
 
 // validSignal returns true if the signal is supported.
@@ -674,6 +679,11 @@ func rankMemoryPressure(pods []*v1.Pod, stats statsFunc) {
 	orderedBy(exceedMemoryRequests(stats), priority, memory(stats)).Sort(pods)
 }
 
+// rankPIDPressure orders the input pods by priority in response to PID pressure.
+func rankPIDPressure(pods []*v1.Pod, stats statsFunc) {
+	orderedBy(priority).Sort(pods)
+}
+
 // rankDiskPressureFunc returns a rankFunc that measures the specified fs stats.
 func rankDiskPressureFunc(fsStatsToMeasure []fsStatsType, diskResource v1.ResourceName) rankFunc {
 	return func(pods []*v1.Pod, stats statsFunc) {
@@ -987,6 +997,7 @@ func buildSignalToRankFunc(withImageFs bool) map[evictionapi.Signal]rankFunc {
 	signalToRankFunc := map[evictionapi.Signal]rankFunc{
 		evictionapi.SignalMemoryAvailable:            rankMemoryPressure,
 		evictionapi.SignalAllocatableMemoryAvailable: rankMemoryPressure,
+		evictionapi.SignalPIDAvailable:               rankPIDPressure,
 	}
 	// usage of an imagefs is optional
 	if withImageFs {
diff --git a/pkg/kubelet/eviction/helpers_test.go b/pkg/kubelet/eviction/helpers_test.go
index b627f7f40d..fa01f68ea2 100644
--- a/pkg/kubelet/eviction/helpers_test.go
+++ b/pkg/kubelet/eviction/helpers_test.go
@@ -943,13 +943,13 @@ func TestSortByEvictionPriority(t *testing.T) {
 			expected:   []evictionapi.Threshold{},
 		},
 		{
-			name: "memory first, PID last",
+			name: "memory first",
 			thresholds: []evictionapi.Threshold{
 				{
-					Signal: evictionapi.SignalPIDAvailable,
+					Signal: evictionapi.SignalNodeFsAvailable,
 				},
 				{
-					Signal: evictionapi.SignalNodeFsAvailable,
+					Signal: evictionapi.SignalPIDAvailable,
 				},
 				{
 					Signal: evictionapi.SignalMemoryAvailable,
@@ -968,13 +968,13 @@
 			},
 		},
 		{
-			name: "allocatable memory first, PID last",
+			name: "allocatable memory first",
 			thresholds: []evictionapi.Threshold{
 				{
-					Signal: evictionapi.SignalPIDAvailable,
+					Signal: evictionapi.SignalNodeFsAvailable,
 				},
 				{
-					Signal: evictionapi.SignalNodeFsAvailable,
+					Signal: evictionapi.SignalPIDAvailable,
 				},
 				{
 					Signal: evictionapi.SignalAllocatableMemoryAvailable,
diff --git a/test/e2e_node/BUILD b/test/e2e_node/BUILD
index 98cb572379..f8533a620e 100644
--- a/test/e2e_node/BUILD
+++ b/test/e2e_node/BUILD
@@ -122,6 +122,7 @@ go_test(
        "//pkg/kubelet/cm/cpuset:go_default_library",
        "//pkg/kubelet/container:go_default_library",
        "//pkg/kubelet/eviction:go_default_library",
+       "//pkg/kubelet/eviction/api:go_default_library",
        "//pkg/kubelet/images:go_default_library",
        "//pkg/kubelet/kubeletconfig:go_default_library",
        "//pkg/kubelet/kubeletconfig/status:go_default_library",
diff --git a/test/e2e_node/eviction_test.go b/test/e2e_node/eviction_test.go
index 6a4ae16de4..a698285b3d 100644
--- a/test/e2e_node/eviction_test.go
+++ b/test/e2e_node/eviction_test.go
@@ -33,6 +33,7 @@ import (
 	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
 	stats "k8s.io/kubernetes/pkg/kubelet/apis/stats/v1alpha1"
 	"k8s.io/kubernetes/pkg/kubelet/eviction"
+	evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
 	kubeletmetrics "k8s.io/kubernetes/pkg/kubelet/metrics"
 	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
 	"k8s.io/kubernetes/test/e2e/framework"
@@ -78,7 +79,7 @@ var _ = framework.KubeDescribe("InodeEviction [Slow] [Serial] [Disruptive][NodeF
 			if inodesFree <= inodesConsumed {
 				framework.Skipf("Too few inodes free on the host for the InodeEviction test to run")
 			}
-			initialConfig.EvictionHard = map[string]string{"nodefs.inodesFree": fmt.Sprintf("%d", inodesFree-inodesConsumed)}
+			initialConfig.EvictionHard = map[string]string{string(evictionapi.SignalNodeFsInodesFree): fmt.Sprintf("%d", inodesFree-inodesConsumed)}
 			initialConfig.EvictionMinimumReclaim = map[string]string{}
 		})
 		runEvictionTest(f, pressureTimeout, expectedNodeCondition, expectedStarvedResource, logInodeMetrics, []podEvictSpec{
@@ -114,7 +115,7 @@ var _ = framework.KubeDescribe("ImageGCNoEviction [Slow] [Serial] [Disruptive][N
 			if inodesFree <= inodesConsumed {
 				framework.Skipf("Too few inodes free on the host for the InodeEviction test to run")
 			}
-			initialConfig.EvictionHard = map[string]string{"nodefs.inodesFree": fmt.Sprintf("%d", inodesFree-inodesConsumed)}
+			initialConfig.EvictionHard = map[string]string{string(evictionapi.SignalNodeFsInodesFree): fmt.Sprintf("%d", inodesFree-inodesConsumed)}
 			initialConfig.EvictionMinimumReclaim = map[string]string{}
 		})
 		// Consume enough inodes to induce disk pressure,
@@ -173,7 +174,7 @@ var _ = framework.KubeDescribe("LocalStorageEviction [Slow] [Serial] [Disruptive
 			diskConsumed := resource.MustParse("100Mi")
 			summary := eventuallyGetSummary()
 			availableBytes := *(summary.Node.Fs.AvailableBytes)
-			initialConfig.EvictionHard = map[string]string{"nodefs.available": fmt.Sprintf("%d", availableBytes-uint64(diskConsumed.Value()))}
+			initialConfig.EvictionHard = map[string]string{string(evictionapi.SignalNodeFsAvailable): fmt.Sprintf("%d", availableBytes-uint64(diskConsumed.Value()))}
 			initialConfig.EvictionMinimumReclaim = map[string]string{}
 		})
 		runEvictionTest(f, pressureTimeout, expectedNodeCondition, expectedStarvedResource, logDiskMetrics, []podEvictSpec{
@@ -205,14 +206,14 @@ var _ = framework.KubeDescribe("LocalStorageSoftEviction [Slow] [Serial] [Disrup
 			if availableBytes <= uint64(diskConsumed.Value()) {
 				framework.Skipf("Too little disk free on the host for the LocalStorageSoftEviction test to run")
 			}
-			initialConfig.EvictionSoft = map[string]string{"nodefs.available": fmt.Sprintf("%d", availableBytes-uint64(diskConsumed.Value()))}
-			initialConfig.EvictionSoftGracePeriod = map[string]string{"nodefs.available": "1m"}
+			initialConfig.EvictionSoft = map[string]string{string(evictionapi.SignalNodeFsAvailable): fmt.Sprintf("%d", availableBytes-uint64(diskConsumed.Value()))}
+			initialConfig.EvictionSoftGracePeriod = map[string]string{string(evictionapi.SignalNodeFsAvailable): "1m"}
 			// Defer to the pod default grace period
 			initialConfig.EvictionMaxPodGracePeriod = 30
 			initialConfig.EvictionMinimumReclaim = map[string]string{}
 			// Ensure that pods are not evicted because of the eviction-hard threshold
 			// setting a threshold to 0% disables; non-empty map overrides default value (necessary due to omitempty)
-			initialConfig.EvictionHard = map[string]string{"memory.available": "0%"}
+			initialConfig.EvictionHard = map[string]string{string(evictionapi.SignalMemoryAvailable): "0%"}
 		})
 		runEvictionTest(f, pressureTimeout, expectedNodeCondition, expectedStarvedResource, logDiskMetrics, []podEvictSpec{
 			{
@@ -234,7 +235,7 @@ var _ = framework.KubeDescribe("LocalStorageCapacityIsolationEviction [Slow] [Se
 	Context(fmt.Sprintf(testContextFmt, "evictions due to pod local storage violations"), func() {
 		tempSetCurrentKubeletConfig(f, func(initialConfig *kubeletconfig.KubeletConfiguration) {
 			// setting a threshold to 0% disables; non-empty map overrides default value (necessary due to omitempty)
-			initialConfig.EvictionHard = map[string]string{"memory.available": "0%"}
+			initialConfig.EvictionHard = map[string]string{string(evictionapi.SignalMemoryAvailable): "0%"}
 		})
 		sizeLimit := resource.MustParse("100Mi")
 		useOverLimit := 101 /* Mb */
@@ -297,7 +298,7 @@ var _ = framework.KubeDescribe("PriorityMemoryEvictionOrdering [Slow] [Serial] [
 			if availableBytes <= uint64(memoryConsumed.Value()) {
 				framework.Skipf("Too little memory free on the host for the PriorityMemoryEvictionOrdering test to run")
 			}
-			initialConfig.EvictionHard = map[string]string{"memory.available": fmt.Sprintf("%d", availableBytes-uint64(memoryConsumed.Value()))}
+			initialConfig.EvictionHard = map[string]string{string(evictionapi.SignalMemoryAvailable): fmt.Sprintf("%d", availableBytes-uint64(memoryConsumed.Value()))}
 			initialConfig.EvictionMinimumReclaim = map[string]string{}
 		})
 		BeforeEach(func() {
@@ -354,7 +355,7 @@ var _ = framework.KubeDescribe("PriorityLocalStorageEvictionOrdering [Slow] [Ser
 			if availableBytes <= uint64(diskConsumed.Value()) {
 				framework.Skipf("Too little disk free on the host for the PriorityLocalStorageEvictionOrdering test to run")
 			}
-			initialConfig.EvictionHard = map[string]string{"nodefs.available": fmt.Sprintf("%d", availableBytes-uint64(diskConsumed.Value()))}
+			initialConfig.EvictionHard = map[string]string{string(evictionapi.SignalNodeFsAvailable): fmt.Sprintf("%d", availableBytes-uint64(diskConsumed.Value()))}
 			initialConfig.EvictionMinimumReclaim = map[string]string{}
 		})
 		BeforeEach(func() {
@@ -392,6 +393,47 @@ var _ = framework.KubeDescribe("PriorityLocalStorageEvictionOrdering [Slow] [Ser
 	})
 })
 
+// PriorityPidEvictionOrdering tests that the node emits pid pressure in response to a fork bomb, and evicts pods by priority
+var _ = framework.KubeDescribe("PriorityPidEvictionOrdering [Slow] [Serial] [Disruptive][NodeFeature:Eviction]", func() {
+	f := framework.NewDefaultFramework("pidpressure-eviction-test")
+	pressureTimeout := 2 * time.Minute
+	expectedNodeCondition := v1.NodePIDPressure
+	expectedStarvedResource := noStarvedResource
+
+	highPriorityClassName := f.BaseName + "-high-priority"
+	highPriority := int32(999999999)
+
+	Context(fmt.Sprintf(testContextFmt, expectedNodeCondition), func() {
+		tempSetCurrentKubeletConfig(f, func(initialConfig *kubeletconfig.KubeletConfiguration) {
+			pidsConsumed := int64(10000)
+			summary := eventuallyGetSummary()
+			availablePids := *(summary.Node.Rlimit.MaxPID) - *(summary.Node.Rlimit.NumOfRunningProcesses)
+			initialConfig.EvictionHard = map[string]string{string(evictionapi.SignalPIDAvailable): fmt.Sprintf("%d", availablePids-pidsConsumed)}
+			initialConfig.EvictionMinimumReclaim = map[string]string{}
+		})
+		BeforeEach(func() {
+			_, err := f.ClientSet.SchedulingV1beta1().PriorityClasses().Create(&schedulerapi.PriorityClass{ObjectMeta: metav1.ObjectMeta{Name: highPriorityClassName}, Value: highPriority})
+			Expect(err == nil || errors.IsAlreadyExists(err)).To(BeTrue())
+		})
+		AfterEach(func() {
+			err := f.ClientSet.SchedulingV1beta1().PriorityClasses().Delete(highPriorityClassName, &metav1.DeleteOptions{})
+			Expect(err).NotTo(HaveOccurred())
+		})
+		specs := []podEvictSpec{
+			{
+				evictionPriority: 1,
+				pod:              pidConsumingPod("fork-bomb-container", 12000),
+			},
+			{
+				evictionPriority: 0,
+				pod:              innocentPod(),
+			},
+		}
+		specs[1].pod.Spec.PriorityClassName = highPriorityClassName
+		runEvictionTest(f, pressureTimeout, expectedNodeCondition, expectedStarvedResource, logPidMetrics, specs)
+	})
+})
+
 // Struct used by runEvictionTest that specifies the pod, and when that pod should be evicted, relative to other pods
 type podEvictSpec struct {
 	// P0 should never be evicted, P1 shouldn't evict before P2, etc.
@@ -722,6 +764,17 @@ func logMemoryMetrics() {
 	}
 }
 
+func logPidMetrics() {
+	summary, err := getNodeSummary()
+	if err != nil {
+		framework.Logf("Error getting summary: %v", err)
+		return
+	}
+	if summary.Node.Rlimit != nil && summary.Node.Rlimit.MaxPID != nil && summary.Node.Rlimit.NumOfRunningProcesses != nil {
+		framework.Logf("Node.Rlimit.MaxPID: %d, Node.Rlimit.RunningProcesses: %d", *summary.Node.Rlimit.MaxPID, *summary.Node.Rlimit.NumOfRunningProcesses)
+	}
+}
+
 func eventuallyGetSummary() (s *stats.Summary) {
 	Eventually(func() error {
 		summary, err := getNodeSummary()
@@ -764,23 +817,33 @@ const (
 )
 
 func inodeConsumingPod(name string, numFiles int, volumeSource *v1.VolumeSource) *v1.Pod {
+	path := ""
+	if volumeSource != nil {
+		path = volumeMountPath
+	}
 	// Each iteration creates an empty file
-	return podWithCommand(volumeSource, v1.ResourceRequirements{}, numFiles, name, "touch %s${i}.txt; sleep 0.001")
+	return podWithCommand(volumeSource, v1.ResourceRequirements{}, numFiles, name, fmt.Sprintf("touch %s${i}.txt; sleep 0.001;", filepath.Join(path, "file")))
 }
 
 func diskConsumingPod(name string, diskConsumedMB int, volumeSource *v1.VolumeSource, resources v1.ResourceRequirements) *v1.Pod {
+	path := ""
+	if volumeSource != nil {
+		path = volumeMountPath
+	}
 	// Each iteration writes 1 Mb, so do diskConsumedMB iterations.
-	return podWithCommand(volumeSource, resources, diskConsumedMB, name, "dd if=/dev/urandom of=%s${i} bs=1048576 count=1 2>/dev/null")
+	return podWithCommand(volumeSource, resources, diskConsumedMB, name, fmt.Sprintf("dd if=/dev/urandom of=%s${i} bs=1048576 count=1 2>/dev/null;", filepath.Join(path, "file")))
+}
+
+func pidConsumingPod(name string, numProcesses int) *v1.Pod {
+	// Each iteration forks once, but creates two processes
+	return podWithCommand(nil, v1.ResourceRequirements{}, numProcesses/2, name, "(while true; do sleep 5; done)&")
 }
 
 // podWithCommand returns a pod with the provided volumeSource and resourceRequirements.
-// If a volumeSource is provided, then the volumeMountPath to the volume is inserted into the provided command.
 func podWithCommand(volumeSource *v1.VolumeSource, resources v1.ResourceRequirements, iterations int, name, command string) *v1.Pod {
-	path := ""
 	volumeMounts := []v1.VolumeMount{}
 	volumes := []v1.Volume{}
 	if volumeSource != nil {
-		path = volumeMountPath
 		volumeMounts = []v1.VolumeMount{{MountPath: volumeMountPath, Name: volumeName}}
 		volumes = []v1.Volume{{Name: volumeName, VolumeSource: *volumeSource}}
 	}
@@ -795,7 +858,7 @@ func podWithCommand(volumeSource *v1.VolumeSource, resources v1.ResourceRequirem
 				Command: []string{
 					"sh",
 					"-c",
-					fmt.Sprintf("i=0; while [ $i -lt %d ]; do %s; i=$(($i+1)); done; while true; do sleep 5; done", iterations, fmt.Sprintf(command, filepath.Join(path, "file"))),
+					fmt.Sprintf("i=0; while [ $i -lt %d ]; do %s i=$(($i+1)); done; while true; do sleep 5; done", iterations, command),
 				},
 				Resources:    resources,
 				VolumeMounts: volumeMounts,
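
Reviewer note (not part of the patch): after this change podWithCommand no longer interpolates a
file path into the per-iteration command, so pidConsumingPod can pass a literal fork-bomb snippet.
The Go sketch below only reproduces that string composition so the generated container command can
be previewed; previewCommand and the standalone main package are invented for illustration.

package main

import "fmt"

// previewCommand mirrors the shell loop podWithCommand builds after this patch:
// the per-iteration command arrives fully formatted and is spliced in verbatim.
func previewCommand(iterations int, command string) string {
	return fmt.Sprintf("i=0; while [ $i -lt %d ]; do %s i=$(($i+1)); done; while true; do sleep 5; done", iterations, command)
}

func main() {
	// pidConsumingPod("fork-bomb-container", 12000) runs numProcesses/2 = 6000 iterations;
	// each iteration backgrounds a subshell, i.e. roughly two processes per iteration.
	fmt.Println(previewCommand(12000/2, "(while true; do sleep 5; done)&"))
}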
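
A second reviewer-side sketch, with made-up numbers, of the threshold arithmetic the new
PriorityPidEvictionOrdering test relies on: the eviction-hard pid.available threshold is set 10000
PIDs below the node's current availability (MaxPID minus running processes), so the ~12000
processes created by the fork-bomb pod should push the node past it and surface NodePIDPressure.

package main

import "fmt"

func main() {
	// Hypothetical Summary API values; the real test reads summary.Node.Rlimit.
	maxPID := int64(32768)         // summary.Node.Rlimit.MaxPID
	runningProcesses := int64(600) // summary.Node.Rlimit.NumOfRunningProcesses
	pidsConsumed := int64(10000)   // headroom the test leaves before the threshold trips

	availablePids := maxPID - runningProcesses
	threshold := availablePids - pidsConsumed
	fmt.Printf("pid.available eviction-hard threshold: %d\n", threshold)

	// The fork-bomb pod targets ~12000 new processes, more than the 10000-PID headroom,
	// so available PIDs drop below the threshold and the node reports PID pressure.
	fmt.Printf("available PIDs after fork bomb: %d (threshold %d)\n", availablePids-12000, threshold)
}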