mirror of https://github.com/k3s-io/k3s
remove flaky label from eviction tests
parent c24faeddcc
commit a436a3fe26
@@ -98,7 +98,6 @@ go_test(
         "kubelet_test.go",
         "lifecycle_hook_test.go",
         "log_path_test.go",
-        "memory_eviction_test.go",
         "mirror_pod_test.go",
         "pods_container_manager_test.go",
         "runtime_conformance_test.go",
@@ -19,6 +19,7 @@ package e2e_node
 import (
 	"fmt"
 	"path/filepath"
+	"strconv"
 	"time"
 
 	"k8s.io/api/core/v1"
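The new "strconv" import supports the getMemhogPod helper added at the bottom of this file, which converts 80% of a pod's memory limit to a string argument for the stress container. A minimal standalone sketch of that conversion, assuming a 100Mi limit (the figure the guaranteed test pod uses elsewhere); resource.Quantity.Value() yields int64, hence the int64 -> float64 -> int -> string chain:

package main

import (
	"fmt"
	"strconv"
)

func main() {
	// 80% of a 100Mi memory limit, in bytes, formatted as a string.
	limitBytes := int64(100 * 1024 * 1024) // example: a 100Mi limit
	memTotal := strconv.Itoa(int(float64(limitBytes) * 0.8))
	fmt.Println(memTotal) // 83886080
}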
@@ -55,7 +56,7 @@ const (
 
 // InodeEviction tests that the node responds to node disk pressure by evicting only responsible pods.
 // Node disk pressure is induced by consuming all inodes on the node.
-var _ = framework.KubeDescribe("InodeEviction [Slow] [Serial] [Disruptive] [Flaky]", func() {
+var _ = framework.KubeDescribe("InodeEviction [Slow] [Serial] [Disruptive]", func() {
 	f := framework.NewDefaultFramework("inode-eviction-test")
 	expectedNodeCondition := v1.NodeDiskPressure
 	pressureTimeout := 15 * time.Minute
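The only change in this hunk and the ones below is dropping the [Flaky] tag from each spec description. That matters because Ginkgo selects and skips specs by matching regular expressions against the full description text (the -ginkgo.focus and -ginkgo.skip flags), and Kubernetes-style CI lanes conventionally skip anything matching \[Flaky\]. A small illustration of that matching, with a hypothetical skip pattern:

package main

import (
	"fmt"
	"regexp"
)

func main() {
	// Hypothetical skip pattern of the kind CI lanes typically pass to -ginkgo.skip.
	skip := regexp.MustCompile(`\[Flaky\]`)
	before := "InodeEviction [Slow] [Serial] [Disruptive] [Flaky]"
	after := "InodeEviction [Slow] [Serial] [Disruptive]"
	fmt.Println(skip.MatchString(before)) // true: the spec was skipped
	fmt.Println(skip.MatchString(after))  // false: the spec runs again
}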
@@ -90,7 +91,7 @@ var _ = framework.KubeDescribe("InodeEviction [Slow] [Serial] [Disruptive] [Flak
 
 // MemoryAllocatableEviction tests that the node responds to node memory pressure by evicting only responsible pods.
 // Node memory pressure is only encountered because we reserve the majority of the node's capacity via kube-reserved.
-var _ = framework.KubeDescribe("MemoryAllocatableEviction [Slow] [Serial] [Disruptive] [Flaky]", func() {
+var _ = framework.KubeDescribe("MemoryAllocatableEviction [Slow] [Serial] [Disruptive]", func() {
 	f := framework.NewDefaultFramework("memory-allocatable-eviction-test")
 	expectedNodeCondition := v1.NodeMemoryPressure
 	pressureTimeout := 10 * time.Minute
@@ -122,7 +123,7 @@ var _ = framework.KubeDescribe("MemoryAllocatableEviction [Slow] [Serial] [Disru
 
 // LocalStorageEviction tests that the node responds to node disk pressure by evicting only responsible pods
 // Disk pressure is induced by running pods which consume disk space.
-var _ = framework.KubeDescribe("LocalStorageEviction [Slow] [Serial] [Disruptive] [Flaky]", func() {
+var _ = framework.KubeDescribe("LocalStorageEviction [Slow] [Serial] [Disruptive]", func() {
 	f := framework.NewDefaultFramework("localstorage-eviction-test")
 	pressureTimeout := 10 * time.Minute
 	expectedNodeCondition := v1.NodeDiskPressure
@@ -150,7 +151,7 @@ var _ = framework.KubeDescribe("LocalStorageEviction [Slow] [Serial] [Disruptive
 // LocalStorageEviction tests that the node responds to node disk pressure by evicting only responsible pods
 // Disk pressure is induced by running pods which consume disk space, which exceed the soft eviction threshold.
 // Note: This test's purpose is to test Soft Evictions. Local storage was chosen since it is the least costly to run.
-var _ = framework.KubeDescribe("LocalStorageSoftEviction [Slow] [Serial] [Disruptive] [Flaky]", func() {
+var _ = framework.KubeDescribe("LocalStorageSoftEviction [Slow] [Serial] [Disruptive]", func() {
 	f := framework.NewDefaultFramework("localstorage-eviction-test")
 	pressureTimeout := 10 * time.Minute
 	expectedNodeCondition := v1.NodeDiskPressure
@@ -184,7 +185,7 @@ var _ = framework.KubeDescribe("LocalStorageSoftEviction [Slow] [Serial] [Disrup
 })
 
 // LocalStorageCapacityIsolationEviction tests that container and volume local storage limits are enforced through evictions
-var _ = framework.KubeDescribe("LocalStorageCapacityIsolationEviction [Slow] [Serial] [Disruptive] [Flaky] [Feature:LocalStorageCapacityIsolation]", func() {
+var _ = framework.KubeDescribe("LocalStorageCapacityIsolationEviction [Slow] [Serial] [Disruptive] [Feature:LocalStorageCapacityIsolation]", func() {
 	f := framework.NewDefaultFramework("localstorage-eviction-test")
 	evictionTestTimeout := 10 * time.Minute
 	Context(fmt.Sprintf(testContextFmt, "evictions due to pod local storage violations"), func() {
@@ -236,7 +237,7 @@ var _ = framework.KubeDescribe("LocalStorageCapacityIsolationEviction [Slow] [Se
 // PriorityMemoryEvictionOrdering tests that the node responds to node memory pressure by evicting pods.
 // This test tests that the guaranteed pod is never evicted, and that the lower-priority pod is evicted before
 // the higher priority pod.
-var _ = framework.KubeDescribe("PriorityMemoryEvictionOrdering [Slow] [Serial] [Disruptive] [Flaky]", func() {
+var _ = framework.KubeDescribe("PriorityMemoryEvictionOrdering [Slow] [Serial] [Disruptive]", func() {
 	f := framework.NewDefaultFramework("priority-memory-eviction-ordering-test")
 	expectedNodeCondition := v1.NodeMemoryPressure
 	pressureTimeout := 10 * time.Minute
|
@ -282,7 +283,7 @@ var _ = framework.KubeDescribe("PriorityMemoryEvictionOrdering [Slow] [Serial] [
|
||||||
// PriorityLocalStorageEvictionOrdering tests that the node responds to node disk pressure by evicting pods.
|
// PriorityLocalStorageEvictionOrdering tests that the node responds to node disk pressure by evicting pods.
|
||||||
// This test tests that the guaranteed pod is never evicted, and that the lower-priority pod is evicted before
|
// This test tests that the guaranteed pod is never evicted, and that the lower-priority pod is evicted before
|
||||||
// the higher priority pod.
|
// the higher priority pod.
|
||||||
var _ = framework.KubeDescribe("PriorityLocalStorageEvictionOrdering [Slow] [Serial] [Disruptive] [Flaky]", func() {
|
var _ = framework.KubeDescribe("PriorityLocalStorageEvictionOrdering [Slow] [Serial] [Disruptive]", func() {
|
||||||
f := framework.NewDefaultFramework("priority-disk-eviction-ordering-test")
|
f := framework.NewDefaultFramework("priority-disk-eviction-ordering-test")
|
||||||
expectedNodeCondition := v1.NodeDiskPressure
|
expectedNodeCondition := v1.NodeDiskPressure
|
||||||
pressureTimeout := 10 * time.Minute
|
pressureTimeout := 10 * time.Minute
|
||||||
|
@@ -668,3 +669,50 @@ func podWithCommand(volumeSource *v1.VolumeSource, resources v1.ResourceRequirem
 		},
 	}
 }
+
+func getMemhogPod(podName string, ctnName string, res v1.ResourceRequirements) *v1.Pod {
+	env := []v1.EnvVar{
+		{
+			Name: "MEMORY_LIMIT",
+			ValueFrom: &v1.EnvVarSource{
+				ResourceFieldRef: &v1.ResourceFieldSelector{
+					Resource: "limits.memory",
+				},
+			},
+		},
+	}
+
+	// If there is a limit specified, pass 80% of it for -mem-total, otherwise use the downward API
+	// to pass limits.memory, which will be the total memory available.
+	// This helps prevent a guaranteed pod from triggering an OOM kill due to it's low memory limit,
+	// which will cause the test to fail inappropriately.
+	var memLimit string
+	if limit, ok := res.Limits[v1.ResourceMemory]; ok {
+		memLimit = strconv.Itoa(int(
+			float64(limit.Value()) * 0.8))
+	} else {
+		memLimit = "$(MEMORY_LIMIT)"
+	}
+
+	return &v1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: podName,
+		},
+		Spec: v1.PodSpec{
+			RestartPolicy: v1.RestartPolicyNever,
+			Containers: []v1.Container{
+				{
+					Name:            ctnName,
+					Image:           "k8s.gcr.io/stress:v1",
+					ImagePullPolicy: "Always",
+					Env:             env,
+					// 60 min timeout * 60s / tick per 10s = 360 ticks before timeout => ~11.11Mi/tick
+					// to fill ~4Gi of memory, so initial ballpark 12Mi/tick.
+					// We might see flakes due to timeout if the total memory on the nodes increases.
+					Args:      []string{"-mem-alloc-size", "12Mi", "-mem-alloc-sleep", "10s", "-mem-total", memLimit},
+					Resources: res,
+				},
+			},
+		},
+	}
+}
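The allocation rate in the Args comment above can be checked directly: a 60 minute poll timeout with one allocation every 10 seconds gives 360 ticks, and filling roughly 4Gi in that window needs about 11.4Mi per tick (the comment's ~11.11Mi figure corresponds to reading 4Gi as 4000Mi); either way the test rounds up to 12Mi per tick for headroom. A worked version of that arithmetic:

package main

import "fmt"

func main() {
	// One allocation "tick" every 10 seconds within the 60 minute Eventually timeout.
	const timeoutSec, tickSec = 60 * 60, 10
	ticks := timeoutSec / tickSec // 360 ticks before timeout
	const targetMi = 4 * 1024.0   // ~4Gi of memory to fill, in Mi
	perTick := targetMi / float64(ticks)
	fmt.Printf("%d ticks, %.2f Mi/tick\n", ticks, perTick) // 360 ticks, 11.38 Mi/tick
}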
@@ -1,287 +0,0 @@
-/*
-Copyright 2016 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package e2e_node
-
-import (
-	"fmt"
-	"strconv"
-	"time"
-
-	"github.com/golang/glog"
-	"k8s.io/api/core/v1"
-	"k8s.io/apimachinery/pkg/api/resource"
-	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-	nodeutil "k8s.io/kubernetes/pkg/api/v1/node"
-	"k8s.io/kubernetes/pkg/kubelet/apis/kubeletconfig"
-	"k8s.io/kubernetes/test/e2e/framework"
-
-	. "github.com/onsi/ginkgo"
-	. "github.com/onsi/gomega"
-)
-
-// Eviction Policy is described here:
-// https://github.com/kubernetes/kubernetes/blob/master/docs/proposals/kubelet-eviction.md
-
-var _ = framework.KubeDescribe("MemoryEviction [Slow] [Serial] [Disruptive]", func() {
-	var (
-		evictionHard = map[string]string{"memory.available": "40%"}
-	)
-
-	f := framework.NewDefaultFramework("eviction-test")
-
-	// This is a dummy context to wrap the outer AfterEach, which will run after the inner AfterEach.
-	// We want to list all of the node and pod events, including any that occur while waiting for
-	// memory pressure reduction, even if we time out while waiting.
-	Context("", func() {
-
-		AfterEach(func() {
-			// Print events
-			logNodeEvents(f)
-			logPodEvents(f)
-		})
-		Context("", func() {
-			tempSetCurrentKubeletConfig(f, func(c *kubeletconfig.KubeletConfiguration) {
-				c.EvictionHard = evictionHard
-			})
-
-			Context("when there is memory pressure", func() {
-				AfterEach(func() {
-					// Wait for the memory pressure condition to disappear from the node status before continuing.
-					By("waiting for the memory pressure condition on the node to disappear before ending the test.")
-					Eventually(func() error {
-						nodeList, err := f.ClientSet.CoreV1().Nodes().List(metav1.ListOptions{})
-						if err != nil {
-							return fmt.Errorf("tried to get node list but got error: %v", err)
-						}
-						// Assuming that there is only one node, because this is a node e2e test.
-						if len(nodeList.Items) != 1 {
-							return fmt.Errorf("expected 1 node, but see %d. List: %v", len(nodeList.Items), nodeList.Items)
-						}
-						node := nodeList.Items[0]
-						_, pressure := nodeutil.GetNodeCondition(&node.Status, v1.NodeMemoryPressure)
-						if pressure != nil && pressure.Status == v1.ConditionTrue {
-							return fmt.Errorf("node is still reporting memory pressure condition: %s", pressure)
-						}
-						return nil
-					}, 5*time.Minute, 15*time.Second).Should(BeNil())
-
-					// Check available memory after condition disappears, just in case:
-					// Wait for available memory to decrease to a reasonable level before ending the test.
-					// This helps prevent interference with tests that start immediately after this one.
-					By("waiting for available memory to decrease to a reasonable level before ending the test.")
-					Eventually(func() error {
-						summary, err := getNodeSummary()
-						if err != nil {
-							return err
-						}
-						if summary.Node.Memory.AvailableBytes == nil {
-							return fmt.Errorf("summary.Node.Memory.AvailableBytes was nil, cannot get memory stats.")
-						}
-						if summary.Node.Memory.WorkingSetBytes == nil {
-							return fmt.Errorf("summary.Node.Memory.WorkingSetBytes was nil, cannot get memory stats.")
-						}
-						avail := *summary.Node.Memory.AvailableBytes
-						wset := *summary.Node.Memory.WorkingSetBytes
-
-						// memory limit = avail + wset
-						limit := avail + wset
-						halflimit := limit / 2
-
-						// Wait for at least half of memory limit to be available
-						if avail >= halflimit {
-							return nil
-						}
-						return fmt.Errorf("current available memory is: %d bytes. Expected at least %d bytes available.", avail, halflimit)
-					}, 5*time.Minute, 15*time.Second).Should(BeNil())
-
-					// TODO(mtaufen): 5 minute wait to stop flaky test bleeding while we figure out what is actually going on.
-					// If related to pressure transition period in eviction manager, probably only need to wait
-					// just over 30s becasue that is the transition period set for node e2e tests. But since we
-					// know 5 min works and we don't know if transition period is the problem, wait 5 min for now.
-					time.Sleep(5 * time.Minute)
-
-					// Finally, try starting a new pod and wait for it to be scheduled and running.
-					// This is the final check to try to prevent interference with subsequent tests.
-					podName := "admit-best-effort-pod"
-					f.PodClient().CreateSync(&v1.Pod{
-						ObjectMeta: metav1.ObjectMeta{
-							Name: podName,
-						},
-						Spec: v1.PodSpec{
-							RestartPolicy: v1.RestartPolicyNever,
-							Containers: []v1.Container{
-								{
-									Image: framework.GetPauseImageNameForHostArch(),
-									Name:  podName,
-								},
-							},
-						},
-					})
-				})
-
-				It("should evict pods in the correct order (besteffort first, then burstable, then guaranteed)", func() {
-					By("creating a guaranteed pod, a burstable pod, and a besteffort pod.")
-
-					// A pod is guaranteed only when requests and limits are specified for all the containers and they are equal.
-					guaranteed := getMemhogPod("guaranteed-pod", "guaranteed", v1.ResourceRequirements{
-						Requests: v1.ResourceList{
-							v1.ResourceCPU:    resource.MustParse("100m"),
-							v1.ResourceMemory: resource.MustParse("100Mi"),
-						},
-						Limits: v1.ResourceList{
-							v1.ResourceCPU:    resource.MustParse("100m"),
-							v1.ResourceMemory: resource.MustParse("100Mi"),
-						}})
-					guaranteed = f.PodClient().CreateSync(guaranteed)
-					glog.Infof("pod created with name: %s", guaranteed.Name)
-
-					// A pod is burstable if limits and requests do not match across all containers.
-					burstable := getMemhogPod("burstable-pod", "burstable", v1.ResourceRequirements{
-						Requests: v1.ResourceList{
-							v1.ResourceCPU:    resource.MustParse("100m"),
-							v1.ResourceMemory: resource.MustParse("100Mi"),
-						}})
-					burstable = f.PodClient().CreateSync(burstable)
-					glog.Infof("pod created with name: %s", burstable.Name)
-
-					// A pod is besteffort if none of its containers have specified any requests or limits .
-					besteffort := getMemhogPod("besteffort-pod", "besteffort", v1.ResourceRequirements{})
-					besteffort = f.PodClient().CreateSync(besteffort)
-					glog.Infof("pod created with name: %s", besteffort.Name)
-
-					// We poll until timeout or all pods are killed.
-					// Inside the func, we check that all pods are in a valid phase with
-					// respect to the eviction order of best effort, then burstable, then guaranteed.
-					By("polling the Status.Phase of each pod and checking for violations of the eviction order.")
-					Eventually(func() error {
-
-						gteed, gtErr := f.ClientSet.CoreV1().Pods(f.Namespace.Name).Get(guaranteed.Name, metav1.GetOptions{})
-						framework.ExpectNoError(gtErr, fmt.Sprintf("getting pod %s", guaranteed.Name))
-						gteedPh := gteed.Status.Phase
-
-						burst, buErr := f.ClientSet.CoreV1().Pods(f.Namespace.Name).Get(burstable.Name, metav1.GetOptions{})
-						framework.ExpectNoError(buErr, fmt.Sprintf("getting pod %s", burstable.Name))
-						burstPh := burst.Status.Phase
-
-						best, beErr := f.ClientSet.CoreV1().Pods(f.Namespace.Name).Get(besteffort.Name, metav1.GetOptions{})
-						framework.ExpectNoError(beErr, fmt.Sprintf("getting pod %s", besteffort.Name))
-						bestPh := best.Status.Phase
-
-						glog.Infof("pod phase: guaranteed: %v, burstable: %v, besteffort: %v", gteedPh, burstPh, bestPh)
-
-						// NOTE/TODO(mtaufen): This should help us debug why burstable appears to fail before besteffort in some
-						// scenarios. We have seen some evidence that the eviction manager has in fact done the
-						// right thing and evicted the besteffort first, and attempted to change the besteffort phase
-						// to "Failed" when it evicts it, but that for some reason the test isn't seeing the updated
-						// phase. I'm trying to confirm or deny this.
-						// The eviction manager starts trying to evict things when the node comes under memory
-						// pressure, and the eviction manager reports this information in the pressure condition. If we
-						// see the eviction manager reporting a pressure condition for a while without the besteffort failing,
-						// and we see that the manager did in fact evict the besteffort (this should be in the Kubelet log), we
-						// will have more reason to believe the phase is out of date.
-						nodeList, err := f.ClientSet.CoreV1().Nodes().List(metav1.ListOptions{})
-						if err != nil {
-							glog.Errorf("tried to get node list but got error: %v", err)
-						}
-						if len(nodeList.Items) != 1 {
-							glog.Errorf("expected 1 node, but see %d. List: %v", len(nodeList.Items), nodeList.Items)
-						}
-						node := nodeList.Items[0]
-						_, pressure := nodeutil.GetNodeCondition(&node.Status, v1.NodeMemoryPressure)
-						glog.Infof("node pressure condition: %s", pressure)
-
-						// NOTE/TODO(mtaufen): Also log (at least temporarily) the actual memory consumption on the node.
-						// I used this to plot memory usage from a successful test run and it looks the
-						// way I would expect. I want to see what the plot from a flake looks like.
-						summary, err := getNodeSummary()
-						if err != nil {
-							return err
-						}
-						if summary.Node.Memory.WorkingSetBytes != nil {
-							wset := *summary.Node.Memory.WorkingSetBytes
-							glog.Infof("Node's working set is (bytes): %v", wset)
-
-						}
-
-						if bestPh == v1.PodRunning {
-							Expect(burstPh).NotTo(Equal(v1.PodFailed), "burstable pod failed before best effort pod")
-							Expect(gteedPh).NotTo(Equal(v1.PodFailed), "guaranteed pod failed before best effort pod")
-						} else if burstPh == v1.PodRunning {
-							Expect(gteedPh).NotTo(Equal(v1.PodFailed), "guaranteed pod failed before burstable pod")
-						}
-
-						// When both besteffort and burstable have been evicted, the test has completed.
-						if bestPh == v1.PodFailed && burstPh == v1.PodFailed {
-							return nil
-						}
-						return fmt.Errorf("besteffort and burstable have not yet both been evicted.")
-
-					}, 60*time.Minute, 5*time.Second).Should(BeNil())
-
-				})
-			})
-		})
-	})
-
-})
-
-func getMemhogPod(podName string, ctnName string, res v1.ResourceRequirements) *v1.Pod {
-	env := []v1.EnvVar{
-		{
-			Name: "MEMORY_LIMIT",
-			ValueFrom: &v1.EnvVarSource{
-				ResourceFieldRef: &v1.ResourceFieldSelector{
-					Resource: "limits.memory",
-				},
-			},
-		},
-	}
-
-	// If there is a limit specified, pass 80% of it for -mem-total, otherwise use the downward API
-	// to pass limits.memory, which will be the total memory available.
-	// This helps prevent a guaranteed pod from triggering an OOM kill due to it's low memory limit,
-	// which will cause the test to fail inappropriately.
-	var memLimit string
-	if limit, ok := res.Limits[v1.ResourceMemory]; ok {
-		memLimit = strconv.Itoa(int(
-			float64(limit.Value()) * 0.8))
-	} else {
-		memLimit = "$(MEMORY_LIMIT)"
-	}
-
-	return &v1.Pod{
-		ObjectMeta: metav1.ObjectMeta{
-			Name: podName,
-		},
-		Spec: v1.PodSpec{
-			RestartPolicy: v1.RestartPolicyNever,
-			Containers: []v1.Container{
-				{
-					Name:            ctnName,
-					Image:           "gcr.io/google-containers/stress:v1",
-					ImagePullPolicy: "Always",
-					Env:             env,
-					// 60 min timeout * 60s / tick per 10s = 360 ticks before timeout => ~11.11Mi/tick
-					// to fill ~4Gi of memory, so initial ballpark 12Mi/tick.
-					// We might see flakes due to timeout if the total memory on the nodes increases.
-					Args:      []string{"-mem-alloc-size", "12Mi", "-mem-alloc-sleep", "10s", "-mem-total", memLimit},
-					Resources: res,
-				},
-			},
-		},
-	}
-}
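The deleted MemoryEviction spec above boiled down to one invariant, polled until both lower-QoS pods were gone: a besteffort pod must fail before a burstable one, and a burstable pod before a guaranteed one. A minimal standalone sketch of that check, using local stand-ins for the v1.PodPhase values rather than the real API types:

package main

import "fmt"

// PodPhase mirrors the two v1.PodPhase values the deleted test compared against.
type PodPhase string

const (
	PodRunning PodPhase = "Running"
	PodFailed  PodPhase = "Failed"
)

// orderViolated reports whether the observed phases break the expected
// eviction order: besteffort first, then burstable, then guaranteed.
func orderViolated(besteffort, burstable, guaranteed PodPhase) bool {
	if besteffort == PodRunning && (burstable == PodFailed || guaranteed == PodFailed) {
		return true // something was evicted before the besteffort pod
	}
	if burstable == PodRunning && guaranteed == PodFailed {
		return true // the guaranteed pod was evicted before the burstable pod
	}
	return false
}

func main() {
	fmt.Println(orderViolated(PodFailed, PodRunning, PodRunning)) // false: expected order
	fmt.Println(orderViolated(PodRunning, PodFailed, PodRunning)) // true: burstable failed first
}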