From a284927713ca71d694890f1d6c17684c0a5eadf4 Mon Sep 17 00:00:00 2001 From: Isaac Hollander McCreery Date: Mon, 11 Jan 2016 15:30:23 -0800 Subject: [PATCH] Add 5-minute sleep in GKE for dead tunnels to be removed --- test/e2e/reboot.go | 11 +++++++++++ test/e2e/resize_nodes.go | 12 ++++++++++++ 2 files changed, 23 insertions(+) diff --git a/test/e2e/reboot.go b/test/e2e/reboot.go index b63fe39fb4..8fcd1d51ec 100644 --- a/test/e2e/reboot.go +++ b/test/e2e/reboot.go @@ -68,6 +68,17 @@ var _ = Describe("Reboot [Disruptive]", func() { Logf("event for %v: %v %v: %v", e.InvolvedObject.Name, e.Source, e.Reason, e.Message) } } + // In GKE, our current tunneling setup has the potential to hold on to a broken tunnel (from a + // rebooted/deleted node) for up to 5 minutes before all tunnels are dropped and recreated. Most tests + // make use of some proxy feature to verify functionality. So, if a reboot test runs right before a test + // that tries to get logs, for example, we may get unlucky and try to use a closed tunnel to a node that + // was recently rebooted. There's no good way to poll for proxies being closed, so we sleep. + // + // TODO(cjcullen) reduce this sleep (#19314) + if providerIs("gke") { + By("waiting 5 minutes for all dead tunnels to be dropped") + time.Sleep(5 * time.Minute) + } }) f = NewFramework("reboot") diff --git a/test/e2e/resize_nodes.go b/test/e2e/resize_nodes.go index d40d789d83..eb51b19a98 100644 --- a/test/e2e/resize_nodes.go +++ b/test/e2e/resize_nodes.go @@ -422,6 +422,18 @@ var _ = Describe("Nodes [Disruptive]", func() { if err := resizeGroup(testContext.CloudConfig.NumNodes); err != nil { Failf("Couldn't restore the original node instance group size: %v", err) } + // In GKE, our current tunneling setup has the potential to hold on to a broken tunnel (from a + // rebooted/deleted node) for up to 5 minutes before all tunnels are dropped and recreated. + // Most tests make use of some proxy feature to verify functionality. 
So, if a reboot or resize test runs + // right before a test that tries to get logs, for example, we may get unlucky and try to use a + // closed tunnel to a node that was recently rebooted or deleted. There's no good way to poll for proxies + // being closed, so we sleep. + // + // TODO(cjcullen) reduce this sleep (#19314) + if providerIs("gke") { + By("waiting 5 minutes for all dead tunnels to be dropped") + time.Sleep(5 * time.Minute) + } if err := waitForGroupSize(testContext.CloudConfig.NumNodes); err != nil { Failf("Couldn't restore the original node instance group size: %v", err) }