mirror of https://github.com/k3s-io/k3s
Merge pull request #41803 from wojtek-t/allowed_not_running_pods

Automatic merge from submit-queue (batch tested with PRs 41844, 41803, 39116, 41129, 41240)

Allow for not-ready pods in large clusters. This works around issues with non-starting pods in large clusters, which affect roughly one third of runs.

commit af4513cd3f
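The change threads a new allowedNotReadyPods budget through framework.WaitForPodsRunningReady: when the poll times out, the timeout is reported as a failure only if the number of not-ready pods exceeds that budget. The following is a minimal, self-contained sketch of that gating logic, not the framework code itself; the podStatus type and the waitOutcome helper are hypothetical stand-ins for the real pod list and poll.

package main

import (
	"errors"
	"fmt"
)

// podStatus is a hypothetical stand-in for the pod state the real poll inspects.
type podStatus struct {
	name  string
	ready bool
}

// waitOutcome mimics the decision WaitForPodsRunningReady makes once its poll
// has timed out: the timeout only becomes an error when the number of
// not-ready pods exceeds the allowed budget.
func waitOutcome(pods []podStatus, allowedNotReadyPods int32) error {
	notReady := int32(0)
	for _, p := range pods {
		if !p.ready {
			notReady++
		}
	}
	ignoreNotReady := notReady <= allowedNotReadyPods
	if !ignoreNotReady {
		return errors.New("pods are not RUNNING and READY within the timeout")
	}
	fmt.Printf("Number of not-ready pods (%d) is allowed.\n", notReady)
	return nil
}

func main() {
	pods := []podStatus{{"kube-dns", true}, {"fluentd", false}}
	fmt.Println(waitOutcome(pods, 1)) // budget of 1: the run proceeds, prints <nil>
	fmt.Println(waitOutcome(pods, 0)) // budget of 0: strict pre-change behaviour, prints an error
}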
@@ -137,7 +137,11 @@ var _ = ginkgo.SynchronizedBeforeSuite(func() []byte {
 	// test pods from running, and tests that ensure all pods are running and
 	// ready will fail).
 	podStartupTimeout := framework.TestContext.SystemPodsStartupTimeout
-	if err := framework.WaitForPodsRunningReady(c, metav1.NamespaceSystem, int32(framework.TestContext.MinStartupPods), podStartupTimeout, framework.ImagePullerLabels, true); err != nil {
+	// TODO: In large clusters, we often observe a non-starting pods due to
+	// #41007. To avoid those pods preventing the whole test runs (and just
+	// wasting the whole run), we allow for some not-ready pods (with the
+	// number equal to the number of allowed not-ready nodes).
+	if err := framework.WaitForPodsRunningReady(c, metav1.NamespaceSystem, int32(framework.TestContext.MinStartupPods), int32(framework.TestContext.AllowedNotReadyNodes), podStartupTimeout, framework.ImagePullerLabels, true); err != nil {
 		framework.DumpAllNamespaceInfo(c, metav1.NamespaceSystem)
 		framework.LogFailedContainers(c, metav1.NamespaceSystem, framework.Logf)
 		runKubernetesServiceTestContainer(c, metav1.NamespaceDefault)
@@ -495,8 +495,7 @@ func WaitForPodsSuccess(c clientset.Interface, ns string, successPodLabels map[s
 // and some in Success. This is to allow the client to decide if "Success"
 // means "Ready" or not.
 // If skipSucceeded is true, any pods that are Succeeded are not counted.
-func WaitForPodsRunningReady(c clientset.Interface, ns string, minPods int32, timeout time.Duration, ignoreLabels map[string]string, skipSucceeded bool) error {
-
+func WaitForPodsRunningReady(c clientset.Interface, ns string, minPods, allowedNotReadyPods int32, timeout time.Duration, ignoreLabels map[string]string, skipSucceeded bool) error {
 	ignoreSelector := labels.SelectorFromSet(ignoreLabels)
 	start := time.Now()
 	Logf("Waiting up to %v for all pods (need at least %d) in namespace '%s' to be running and ready",
@@ -504,6 +503,7 @@ func WaitForPodsRunningReady(c clientset.Interface, ns string, minPods int32, ti
 	wg := sync.WaitGroup{}
 	wg.Add(1)
 	var waitForSuccessError error
+	var ignoreNotReady bool
 	badPods := []v1.Pod{}
 	desiredPods := 0
 	go func() {
@@ -544,6 +544,7 @@ func WaitForPodsRunningReady(c clientset.Interface, ns string, minPods int32, ti
 			return false, nil
 		}
 		nOk := int32(0)
+		notReady := int32(0)
 		badPods = []v1.Pod{}
 		desiredPods = len(podList.Items)
 		for _, pod := range podList.Items {
@@ -564,6 +565,7 @@ func WaitForPodsRunningReady(c clientset.Interface, ns string, minPods int32, ti
 				return false, errors.New("unexpected Succeeded pod state")
 			case pod.Status.Phase != v1.PodFailed:
 				Logf("The status of Pod %s is %s (Ready = false), waiting for it to be either Running (with Ready = true) or Failed", pod.ObjectMeta.Name, pod.Status.Phase)
+				notReady++
 				badPods = append(badPods, pod)
 			default:
 				if _, ok := pod.Annotations[v1.CreatedByAnnotation]; !ok {
@@ -581,10 +583,14 @@ func WaitForPodsRunningReady(c clientset.Interface, ns string, minPods int32, ti
 		if replicaOk == replicas && nOk >= minPods && len(badPods) == 0 {
 			return true, nil
 		}
+		ignoreNotReady = (notReady <= allowedNotReadyPods)
 		logPodStates(badPods)
 		return false, nil
 	}) != nil {
-		return errors.New(errorBadPodsStates(badPods, desiredPods, ns, "RUNNING and READY", timeout))
+		if !ignoreNotReady {
+			return errors.New(errorBadPodsStates(badPods, desiredPods, ns, "RUNNING and READY", timeout))
+		}
+		Logf("Number of not-ready pods is allowed.")
 	}
 	wg.Wait()
 	if waitForSuccessError != nil {
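The remaining hunks update existing call sites for the widened signature: the suite-level setup above passes framework.TestContext.AllowedNotReadyNodes as the budget, while the Mesos, Nodes [Disruptive], and SchedulerPredicates tests below pass 0 and so keep the old strict behaviour. Below is a caller-side sketch of that choice; it uses a hypothetical local stub that mirrors only the new parameter order (minPods, then allowedNotReadyPods), not the real framework API, and the argument values are purely illustrative.

package main

import (
	"fmt"
	"time"
)

// waitForPodsRunningReady is a hypothetical stand-in that mirrors the updated
// parameter order of the framework helper: the new allowedNotReadyPods budget
// sits directly after minPods.
func waitForPodsRunningReady(ns string, minPods, allowedNotReadyPods int32, timeout time.Duration) error {
	fmt.Printf("ns=%s minPods=%d allowedNotReadyPods=%d timeout=%v\n",
		ns, minPods, allowedNotReadyPods, timeout)
	return nil
}

func main() {
	// Suite-level setup: tolerate as many not-ready pods as there are
	// allowed not-ready nodes (illustrative values).
	_ = waitForPodsRunningReady("kube-system", 8, 3, 10*time.Minute)

	// Strict call sites: a budget of 0 reproduces the pre-change behaviour,
	// failing on any not-ready pod.
	_ = waitForPodsRunningReady("kube-system", 8, 0, 10*time.Minute)
}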
@@ -68,7 +68,7 @@ var _ = framework.KubeDescribe("Mesos", func() {
 		nodelist := framework.GetReadySchedulableNodesOrDie(client)
 		const ns = "static-pods"
 		numpods := int32(len(nodelist.Items))
-		framework.ExpectNoError(framework.WaitForPodsRunningReady(client, ns, numpods, wait.ForeverTestTimeout, map[string]string{}, false),
+		framework.ExpectNoError(framework.WaitForPodsRunningReady(client, ns, numpods, 0, wait.ForeverTestTimeout, map[string]string{}, false),
 			fmt.Sprintf("number of static pods in namespace %s is %d", ns, numpods))
 	})
 
@@ -237,7 +237,7 @@ var _ = framework.KubeDescribe("Nodes [Disruptive]", func() {
 		// Many e2e tests assume that the cluster is fully healthy before they start. Wait until
 		// the cluster is restored to health.
 		By("waiting for system pods to successfully restart")
-		err := framework.WaitForPodsRunningReady(c, metav1.NamespaceSystem, systemPodsNo, framework.PodReadyBeforeTimeout, ignoreLabels, true)
+		err := framework.WaitForPodsRunningReady(c, metav1.NamespaceSystem, systemPodsNo, 0, framework.PodReadyBeforeTimeout, ignoreLabels, true)
 		Expect(err).NotTo(HaveOccurred())
 		By("waiting for image prepulling pods to complete")
 		framework.WaitForPodsSuccess(c, metav1.NamespaceSystem, framework.ImagePullerLabels, imagePrePullingTimeout)
@@ -90,7 +90,7 @@ var _ = framework.KubeDescribe("SchedulerPredicates [Serial]", func() {
 			}
 		}
 
-		err = framework.WaitForPodsRunningReady(cs, metav1.NamespaceSystem, int32(systemPodsNo), framework.PodReadyBeforeTimeout, ignoreLabels, true)
+		err = framework.WaitForPodsRunningReady(cs, metav1.NamespaceSystem, int32(systemPodsNo), 0, framework.PodReadyBeforeTimeout, ignoreLabels, true)
 		Expect(err).NotTo(HaveOccurred())
 
 		for _, node := range nodeList.Items {