Create chaosmonkey package, use ServiceTestJig for upgrade tests

2016-04-06 15:12:31 -07:00 · 2016-04-06 15:12:31 -07:00 · 4e5e1b8ac5
parent f29d597d02
commit 4e5e1b8ac5
4 changed files with 362 additions and 274 deletions
--- a/test/e2e/chaosmonkey/chaosmonkey.go
+++ b/test/e2e/chaosmonkey/chaosmonkey.go
@ -0,0 +1,149 @@
+/*
+Copyright 2016 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package chaosmonkey
+
+import . "github.com/onsi/ginkgo"
+
+// Disruption is the type to construct a chaosmonkey with; see Do for more information.
+type Disruption func()
+
+// Test is the type to register with a chaosmonkey.  A test will run asynchronously across the
+// chaosmonkey's Disruption.  A Test takes a Semaphore as an argument.  It should call sem.Ready()
+// once it's ready for the disruption to start and should then wait until sem.StopCh (which is a
+// <-chan struct{}) is closed, which signals that the disruption is over.  It should then clean up
+// and return.  See Do and Semaphore for more information.
+type Test func(sem *Semaphore)
+
+// Interface can be implemented if you prefer to define tests without dealing with a Semaphore.  You
+// may define a struct that implements Interface's three methods (Setup, Test, and Teardown) and
+// RegisterInterface.  See RegisterInterface for more information.
+type Interface interface {
+	Setup()
+	Test(stopCh <-chan struct{})
+	Teardown()
+}
+
+type chaosmonkey struct {
+	disruption Disruption
+	tests      []Test
+}
+
+// New creates and returns a chaosmonkey, with which the caller should register Tests and call Do.
+// See Do for more information.
+func New(disruption Disruption) *chaosmonkey {
+	return &chaosmonkey{
+		disruption,
+		[]Test{},
+	}
+}
+
+// Register registers the given Test with the chaosmonkey, so that the test will run over the
+// Disruption.
+func (cm *chaosmonkey) Register(test Test) {
+	cm.tests = append(cm.tests, test)
+}
+
+// RegisterInterface registers the given Interface with the chaosmonkey, so the chaosmonkey will
+// call Setup, Test, and Teardown properly.  Test can tell that the Disruption is finished when
+// stopCh is closed.
+func (cm *chaosmonkey) RegisterInterface(in Interface) {
+	cm.Register(func(sem *Semaphore) {
+		in.Setup()
+		sem.Ready()
+		in.Test(sem.StopCh)
+		in.Teardown()
+	})
+}
+
+// Do performs the Disruption while testing the registered Tests.  Once the caller has registered
+// all Tests with the chaosmonkey, they call Do.  Do starts each registered test asynchronously and
+// waits for each test to signal that it is ready by calling sem.Ready().  Do will then do the
+// Disruption, and when it's complete, close sem.StopCh to signal to the registered Tests that the
+// Disruption is over, and wait for all Tests to return.
+func (cm *chaosmonkey) Do() {
+	sems := []*Semaphore{}
+	// All semaphores have the same StopCh.
+	stopCh := make(chan struct{})
+
+	for _, test := range cm.tests {
+		sem := newSemaphore(stopCh)
+		sems = append(sems, sem)
+		go func() {
+			defer GinkgoRecover()
+			defer sem.done()
+			test(sem)
+		}()
+	}
+
+	By("Waiting for all async tests to be ready")
+	for _, sem := range sems {
+		sem.waitForReadyOrDone()
+	}
+
+	By("Starting disruption")
+	cm.disruption()
+	By("Disruption complete; stopping async validations")
+	close(stopCh)
+	By("Waiting for async validations to complete")
+	for _, sem := range sems {
+		sem.waitForDone()
+	}
+}
+
+// Semaphore is taken by a Test and provides: Ready(), for the Test to call when it's ready for the
+// disruption to start; and StopCh, the closure of which signals to the Test that the disruption is
+// finished.
+type Semaphore struct {
+	readyCh chan struct{}
+	StopCh  <-chan struct{}
+	doneCh  chan struct{}
+}
+
+func newSemaphore(stopCh <-chan struct{}) *Semaphore {
+	// We don't want to block on Ready() or done()
+	return &Semaphore {
+		make(chan struct{}, 1),
+		stopCh,
+		make(chan struct{}, 1),
+	}
+}
+
+// Ready is called by the Test to signal that the Test is ready for the disruption to start.
+func (sem *Semaphore) Ready() {
+	close(sem.readyCh)
+}
+
+// done is an internal method for Go to defer, both to wait for all tests to return, but also to
+// sense if a test panicked before calling Ready.  See waitForReadyOrDone.
+func (sem *Semaphore) done() {
+	close(sem.doneCh)
+}
+
+// We would like to just check if all tests are ready, but if they fail (which Ginkgo implements as
+// a panic), they may not have called Ready().  We check done as well to see if the function has
+// already returned; if it has, we don't care if it's ready, and just continue.
+func (sem *Semaphore) waitForReadyOrDone() {
+	select {
+	case <-sem.readyCh:
+	case <-sem.doneCh:
+	}
+}
+
+// waitForDone is an internal method for Go to wait on all Tests returning.
+func (sem *Semaphore) waitForDone() {
+	<-sem.doneCh
+}
--- a/test/e2e/chaosmonkey/chaosmonkey_test.go
+++ b/test/e2e/chaosmonkey/chaosmonkey_test.go
@ -0,0 +1,53 @@
+/*
+Copyright 2016 The Kubernetes Authors All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package chaosmonkey
+
+import (
+	"sync/atomic"
+	"testing"
+)
+
+func TestDoWithPanic(t *testing.T) {
+	var counter int64 = 0
+	cm := New(func() {})
+	tests := []Test{
+		// No panic
+		func(sem *Semaphore) {
+			defer atomic.AddInt64(&counter, 1)
+			sem.Ready()
+		},
+		// Panic after sem.Ready()
+		func(sem *Semaphore) {
+			defer atomic.AddInt64(&counter, 1)
+			sem.Ready()
+			panic("Panic after calling sem.Ready()")
+		},
+		// Panic before sem.Ready()
+		func(sem *Semaphore) {
+			defer atomic.AddInt64(&counter, 1)
+			panic("Panic before calling sem.Ready()")
+		},
+	}
+	for _, test := range tests {
+		cm.Register(test)
+	}
+	cm.Do()
+	// Check that all funcs in tests were called.
+	if int(counter) != len(tests) {
+		t.Errorf("Expected counter to be %v, but it was %v", len(tests), counter)
+	}
+}
--- a/test/e2e/cluster_upgrade.go
+++ b/test/e2e/cluster_upgrade.go
@ -1,5 +1,5 @@
 /*
-Copyright 2015 The Kubernetes Authors All rights reserved.
+Copyright 2016 The Kubernetes Authors All rights reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@ -20,30 +20,118 @@ import (
 	"bytes"
 	"fmt"
 	"io"
-	"net/http"
 	"os"
 	"os/exec"
 	"path"
 	"strings"
-	"sync"
 	"time"

 	"k8s.io/kubernetes/pkg/api"
 	client "k8s.io/kubernetes/pkg/client/unversioned"
 	"k8s.io/kubernetes/pkg/util/wait"
+	"k8s.io/kubernetes/test/e2e/chaosmonkey"
 	"k8s.io/kubernetes/test/e2e/framework"

 	. "github.com/onsi/ginkgo"
-	. "github.com/onsi/gomega"
 )

+// TODO(mikedanese): Add setup, validate, and teardown for:
+//  - secrets
+//  - volumes
+//  - persistent volumes
+var _ = framework.KubeDescribe("Upgrade [Feature:Upgrade]", func() {
+	f := framework.NewDefaultFramework("cluster-upgrade")
+
+	framework.KubeDescribe("master upgrade", func() {
+		It("should maintain responsive services [Feature:MasterUpgrade]", func() {
+			cm := chaosmonkey.New(func() {
+				v, err := realVersion(framework.TestContext.UpgradeTarget)
+				framework.ExpectNoError(err)
+				framework.ExpectNoError(masterUpgrade(v))
+				framework.ExpectNoError(checkMasterVersion(f.Client, v))
+			})
+			cm.Register(func(sem *chaosmonkey.Semaphore) {
+				// Close over f.
+				testServiceRemainsUp(f, sem)
+			})
+			cm.Do()
+		})
+	})
+
+	framework.KubeDescribe("node upgrade", func() {
+		It("should maintain a functioning cluster [Feature:NodeUpgrade]", func() {
+			cm := chaosmonkey.New(func() {
+				v, err := realVersion(framework.TestContext.UpgradeTarget)
+				framework.ExpectNoError(err)
+				framework.ExpectNoError(nodeUpgrade(f, v))
+				framework.ExpectNoError(checkNodesVersions(f.Client, v))
+			})
+			cm.Register(func(sem *chaosmonkey.Semaphore) {
+				// Close over f.
+				testServiceUpBeforeAndAfter(f, sem)
+			})
+			cm.Do()
+		})
+
+		It("should maintain responsive services [Feature:ExperimentalNodeUpgrade]", func() {
+			cm := chaosmonkey.New(func() {
+				v, err := realVersion(framework.TestContext.UpgradeTarget)
+				framework.ExpectNoError(err)
+				framework.ExpectNoError(nodeUpgrade(f, v))
+				framework.ExpectNoError(checkNodesVersions(f.Client, v))
+			})
+			cm.Register(func(sem *chaosmonkey.Semaphore) {
+				// Close over f.
+				testServiceRemainsUp(f, sem)
+			})
+			cm.Do()
+		})
+	})
+
+	framework.KubeDescribe("cluster upgrade", func() {
+		It("should maintain a functioning cluster [Feature:ClusterUpgrade]", func() {
+			cm := chaosmonkey.New(func() {
+				v, err := realVersion(framework.TestContext.UpgradeTarget)
+				framework.ExpectNoError(err)
+				framework.ExpectNoError(masterUpgrade(v))
+				framework.ExpectNoError(checkMasterVersion(f.Client, v))
+				framework.ExpectNoError(nodeUpgrade(f, v))
+				framework.ExpectNoError(checkNodesVersions(f.Client, v))
+			})
+			cm.Register(func(sem *chaosmonkey.Semaphore) {
+				// Close over f.
+				testServiceUpBeforeAndAfter(f, sem)
+			})
+			cm.Do()
+		})
+
+		It("should maintain responsive services [Feature:ExperimentalClusterUpgrade]", func() {
+			cm := chaosmonkey.New(func() {
+				v, err := realVersion(framework.TestContext.UpgradeTarget)
+				framework.ExpectNoError(err)
+				framework.ExpectNoError(masterUpgrade(v))
+				framework.ExpectNoError(checkMasterVersion(f.Client, v))
+				framework.ExpectNoError(nodeUpgrade(f, v))
+				framework.ExpectNoError(checkNodesVersions(f.Client, v))
+			})
+			cm.Register(func(sem *chaosmonkey.Semaphore) {
+				// Close over f.
+				testServiceRemainsUp(f, sem)
+			})
+			cm.Do()
+		})
+	})
+})
+
 // realVersion turns a version constant s into a version string deployable on
 // GKE.  See hack/get-build.sh for more information.
 func realVersion(s string) (string, error) {
+	framework.Logf(fmt.Sprintf("Getting real version for %q", s))
 	v, _, err := runCmd(path.Join(framework.TestContext.RepoRoot, "hack/get-build.sh"), "-v", s)
 	if err != nil {
 		return v, err
 	}
+	framework.Logf("Version for %q is %q", s, v)
 	return strings.TrimPrefix(strings.TrimSpace(v), "v"), nil
 }

@ -80,7 +168,7 @@ func masterUpgradeGKE(v string) error {
 	return err
 }

-var nodeUpgrade = func(f *framework.Framework, replicas int32, v string) error {
+var nodeUpgrade = func(f *framework.Framework, v string) error {
 	// Perform the upgrade.
 	var err error
 	switch framework.TestContext.Provider {
@ -95,7 +183,7 @@ var nodeUpgrade = func(f *framework.Framework, replicas int32, v string) error {
 		return err
 	}

-	// Wait for it to complete and validate nodes and pods are healthy.
+	// Wait for it to complete and validate nodes are healthy.
 	//
 	// TODO(ihmccreery) We shouldn't have to wait for nodes to be ready in
 	// GKE; the operation shouldn't return until they all are.
@ -103,8 +191,7 @@ var nodeUpgrade = func(f *framework.Framework, replicas int32, v string) error {
 	if _, err := checkNodesReady(f.Client, restartNodeReadyAgainTimeout, framework.TestContext.CloudConfig.NumNodes); err != nil {
 		return err
 	}
-	framework.Logf("Waiting up to %v for all pods to be running and ready after the upgrade", restartPodReadyAgainTimeout)
-	return framework.WaitForPodsRunningReady(f.Namespace.Name, replicas, restartPodReadyAgainTimeout)
+	return nil
 }

 func nodeUpgradeGCE(rawV string) error {
@ -170,191 +257,77 @@ func nodeUpgradeGKE(v string) error {
 	return err
 }

-var _ = framework.KubeDescribe("Upgrade [Feature:Upgrade]", func() {
+func testServiceUpBeforeAndAfter(f *framework.Framework, sem *chaosmonkey.Semaphore) {
+	testService(f, sem, false)
+}

-	svcName, replicas := "baz", int32(2)
-	var rcName, ip, v string
-	var ingress api.LoadBalancerIngress
+func testServiceRemainsUp(f *framework.Framework, sem *chaosmonkey.Semaphore) {
+	testService(f, sem, true)
+}

-	BeforeEach(func() {
-		// The version is determined once at the beginning of the test so that
-		// the master and nodes won't be skewed if the value changes during the
-		// test.
-		By(fmt.Sprintf("Getting real version for %q", framework.TestContext.UpgradeTarget))
-		var err error
-		v, err = realVersion(framework.TestContext.UpgradeTarget)
-		framework.ExpectNoError(err)
-		framework.Logf("Version for %q is %q", framework.TestContext.UpgradeTarget, v)
+// testService is a helper for testServiceUpBeforeAndAfter and testServiceRemainsUp with a flag for testDuringUpgrade
+//
+// TODO(ihmccreery) remove this abstraction once testServiceUpBeforeAndAfter is no longer needed, because node upgrades
+// maintain a responsive service.
+func testService(f *framework.Framework, sem *chaosmonkey.Semaphore, testDuringUpgrade bool) {
+	// Setup
+	serviceName := "service-test"
+
+	jig := NewServiceTestJig(f.Client, serviceName)
+	// nodeIP := pickNodeIP(jig.Client) // for later
+
+	By("creating a TCP service " + serviceName + " with type=LoadBalancer in namespace " + f.Namespace.Name)
+	// TODO it's weird that we have to do this and then wait WaitForLoadBalancer which changes
+	// tcpService.
+	tcpService := jig.CreateTCPServiceOrFail(f.Namespace.Name, func(s *api.Service) {
+		s.Spec.Type = api.ServiceTypeLoadBalancer
 	})
+	tcpService = jig.WaitForLoadBalancerOrFail(f.Namespace.Name, tcpService.Name)
+	jig.SanityCheckService(tcpService, api.ServiceTypeLoadBalancer)

-	f := framework.NewDefaultFramework("cluster-upgrade")
-	var w *ServiceTestFixture
-	BeforeEach(func() {
-		By("Setting up the service, RC, and pods")
-		w = NewServerTest(f.Client, f.Namespace.Name, svcName)
-		rc := w.CreateWebserverRC(replicas)
-		rcName = rc.ObjectMeta.Name
-		svc := w.BuildServiceSpec()
-		svc.Spec.Type = api.ServiceTypeLoadBalancer
-		w.CreateService(svc)
+	// Get info to hit it with
+	tcpIngressIP := getIngressPoint(&tcpService.Status.LoadBalancer.Ingress[0])
+	svcPort := tcpService.Spec.Ports[0].Port

-		By("Waiting for the service to become reachable")
-		result, err := waitForLoadBalancerIngress(f.Client, svcName, f.Namespace.Name)
-		Expect(err).NotTo(HaveOccurred())
-		ingresses := result.Status.LoadBalancer.Ingress
-		if len(ingresses) != 1 {
-			framework.Failf("Was expecting only 1 ingress IP but got %d (%v): %v", len(ingresses), ingresses, result)
-		}
-		ingress = ingresses[0]
-		framework.Logf("Got load balancer ingress point %v", ingress)
-		ip = ingress.IP
-		if ip == "" {
-			ip = ingress.Hostname
-		}
-		testLoadBalancerReachable(ingress, 80)
+	By("creating pod to be part of service " + serviceName)
+	// TODO newRCTemplate only allows for the creation of one replica... that probably won't
+	// work so well.
+	jig.RunOrFail(f.Namespace.Name, nil)

-		// TODO(mikedanese): Add setup, validate, and teardown for:
-		//  - secrets
-		//  - volumes
-		//  - persistent volumes
-	})
+	// Hit it once before considering ourselves ready
+	By("hitting the pod through the service's LoadBalancer")
+	jig.TestReachableHTTP(tcpIngressIP, svcPort, loadBalancerLagTimeoutDefault)

-	AfterEach(func() {
-		w.Cleanup()
-	})
+	sem.Ready()

-	framework.KubeDescribe("master upgrade", func() {
-		It("should maintain responsive services [Feature:MasterUpgrade]", func() {
-			By("Validating cluster before master upgrade")
-			framework.ExpectNoError(validate(f, svcName, rcName, ingress, replicas))
-			By("Performing a master upgrade")
-			testUpgrade(ip, v, masterUpgrade)
-			By("Checking master version")
-			framework.ExpectNoError(checkMasterVersion(f.Client, v))
-			By("Validating cluster after master upgrade")
-			framework.ExpectNoError(validate(f, svcName, rcName, ingress, replicas))
-		})
-	})
+	if testDuringUpgrade {
+		// Continuous validation
+		wait.Until(func() {
+			By("hitting the pod through the service's LoadBalancer")
+			// TODO this is way too long of a timeout; make it shorter since we've already
+			// validated it's working.
+			jig.TestReachableHTTP(tcpIngressIP, svcPort, loadBalancerLagTimeoutDefault)
+		}, 3*time.Second, sem.StopCh)
+	} else {
+		// Block until chaosmonkey is done
+		By("waiting for upgrade to finish without checking if service remains up")
+		<-sem.StopCh
+	}

-	framework.KubeDescribe("node upgrade", func() {
-		It("should maintain a functioning cluster [Feature:NodeUpgrade]", func() {
-			By("Validating cluster before node upgrade")
-			framework.ExpectNoError(validate(f, svcName, rcName, ingress, replicas))
-			By("Performing a node upgrade")
-			// Circumnavigate testUpgrade, since services don't necessarily stay up.
-			framework.Logf("Starting upgrade")
-			framework.ExpectNoError(nodeUpgrade(f, replicas, v))
-			framework.Logf("Upgrade complete")
-			By("Checking node versions")
-			framework.ExpectNoError(checkNodesVersions(f.Client, v))
-			By("Validating cluster after node upgrade")
-			framework.ExpectNoError(validate(f, svcName, rcName, ingress, replicas))
-		})
+	// TODO(ihmccreery) We maybe shouldn't have to wait for the pods to be running again.  I
+	// pulled this over from the NodeUpgrade test, but I'm not sure what the need for it there
+	// was.
+	//
+	// Also 1 is a magic number from newRCTemplate.
+	framework.Logf("Waiting up to %v for all pods to be running and ready after the upgrade", restartPodReadyAgainTimeout)
+	framework.ExpectNoError(framework.WaitForPodsRunningReady(f.Namespace.Name, 1, restartPodReadyAgainTimeout))

-		It("should maintain responsive services [Feature:ExperimentalNodeUpgrade]", func() {
-			By("Validating cluster before node upgrade")
-			framework.ExpectNoError(validate(f, svcName, rcName, ingress, replicas))
-			By("Performing a node upgrade")
-			testUpgrade(ip, v, func(v string) error {
-				return nodeUpgrade(f, replicas, v)
-			})
-			By("Checking node versions")
-			framework.ExpectNoError(checkNodesVersions(f.Client, v))
-			By("Validating cluster after node upgrade")
-			framework.ExpectNoError(validate(f, svcName, rcName, ingress, replicas))
-		})
-	})
-
-	framework.KubeDescribe("cluster upgrade", func() {
-		It("should maintain responsive services [Feature:ClusterUpgrade]", func() {
-			By("Validating cluster before master upgrade")
-			framework.ExpectNoError(validate(f, svcName, rcName, ingress, replicas))
-			By("Performing a master upgrade")
-			testUpgrade(ip, v, masterUpgrade)
-			By("Checking master version")
-			framework.ExpectNoError(checkMasterVersion(f.Client, v))
-			By("Validating cluster after master upgrade")
-			framework.ExpectNoError(validate(f, svcName, rcName, ingress, replicas))
-
-			By("Validating cluster before node upgrade")
-			framework.ExpectNoError(validate(f, svcName, rcName, ingress, replicas))
-			By("Performing a node upgrade")
-			// Circumnavigate testUpgrade, since services don't necessarily stay up.
-			framework.Logf("Starting upgrade")
-			framework.ExpectNoError(nodeUpgrade(f, replicas, v))
-			framework.Logf("Upgrade complete")
-			By("Checking node versions")
-			framework.ExpectNoError(checkNodesVersions(f.Client, v))
-			By("Validating cluster after node upgrade")
-			framework.ExpectNoError(validate(f, svcName, rcName, ingress, replicas))
-		})
-
-		It("should maintain responsive services [Feature:ExperimentalClusterUpgrade]", func() {
-			By("Validating cluster before master upgrade")
-			framework.ExpectNoError(validate(f, svcName, rcName, ingress, replicas))
-			By("Performing a master upgrade")
-			testUpgrade(ip, v, masterUpgrade)
-			By("Checking master version")
-			framework.ExpectNoError(checkMasterVersion(f.Client, v))
-			By("Validating cluster after master upgrade")
-			framework.ExpectNoError(validate(f, svcName, rcName, ingress, replicas))
-
-			By("Validating cluster before node upgrade")
-			framework.ExpectNoError(validate(f, svcName, rcName, ingress, replicas))
-			By("Performing a node upgrade")
-			testUpgrade(ip, v, func(v string) error {
-				return nodeUpgrade(f, replicas, v)
-			})
-			By("Checking node versions")
-			framework.ExpectNoError(checkNodesVersions(f.Client, v))
-			By("Validating cluster after node upgrade")
-			framework.ExpectNoError(validate(f, svcName, rcName, ingress, replicas))
-		})
-	})
-})
-
-func testUpgrade(ip, v string, upF func(v string) error) {
-	framework.Logf("Starting async validation")
-	httpClient := http.Client{Timeout: 2 * time.Second}
-	done := make(chan struct{}, 1)
-	// Let's make sure we've finished the heartbeat before shutting things down.
-	var wg sync.WaitGroup
-	go wait.Until(func() {
-		defer GinkgoRecover()
-		wg.Add(1)
-		defer wg.Done()
-
-		if err := wait.Poll(framework.Poll, framework.SingleCallTimeout, func() (bool, error) {
-			r, err := httpClient.Get("http://" + ip)
-			if err != nil {
-				framework.Logf("Error reaching %s: %v", ip, err)
-				return false, nil
-			}
-			if r.StatusCode < http.StatusOK || r.StatusCode >= http.StatusNotFound {
-				framework.Logf("Bad response; status: %d, response: %v", r.StatusCode, r)
-				return false, nil
-			}
-			return true, nil
-		}); err != nil {
-			// We log the error here because the test will fail at the very end
-			// because this validation runs in another goroutine. Without this,
-			// a failure is very confusing to track down because from the logs
-			// everything looks fine.
-			msg := fmt.Sprintf("Failed to contact service during upgrade: %v", err)
-			framework.Logf(msg)
-			framework.Failf(msg)
-		}
-	}, 200*time.Millisecond, done)
-
-	framework.Logf("Starting upgrade")
-	framework.ExpectNoError(upF(v))
-	done <- struct{}{}
-	framework.Logf("Stopping async validation")
-	wg.Wait()
-	framework.Logf("Upgrade complete")
+	// Validation
+	jig.SanityCheckService(tcpService, api.ServiceTypeLoadBalancer)
 }

 func checkMasterVersion(c *client.Client, want string) error {
+	framework.Logf("Checking master version")
 	v, err := c.Discovery().ServerVersion()
 	if err != nil {
 		return fmt.Errorf("checkMasterVersion() couldn't get the master version: %v", err)
@ -418,6 +391,9 @@ func runCmd(command string, args ...string) (string, string, error) {
 	cmd := exec.Command(command, args...)
 	// We also output to the OS stdout/stderr to aid in debugging in case cmd
 	// hangs and never returns before the test gets killed.
+	//
+	// This creates some ugly output because gcloud doesn't always provide
+	// newlines.
 	cmd.Stdout = io.MultiWriter(os.Stdout, &bout)
 	cmd.Stderr = io.MultiWriter(os.Stderr, &berr)
 	err := cmd.Run()
@ -429,51 +405,17 @@ func runCmd(command string, args ...string) (string, string, error) {
 	return stdout, stderr, nil
 }

-func validate(f *framework.Framework, svcNameWant, rcNameWant string, ingress api.LoadBalancerIngress, podsWant int32) error {
-	framework.Logf("Beginning cluster validation")
-	// Verify RC.
-	rcs, err := f.Client.ReplicationControllers(f.Namespace.Name).List(api.ListOptions{})
-	if err != nil {
-		return fmt.Errorf("error listing RCs: %v", err)
-	}
-	if len(rcs.Items) != 1 {
-		return fmt.Errorf("wanted 1 RC with name %s, got %d", rcNameWant, len(rcs.Items))
-	}
-	if got := rcs.Items[0].Name; got != rcNameWant {
-		return fmt.Errorf("wanted RC name %q, got %q", rcNameWant, got)
-	}
-
-	// Verify pods.
-	if err := framework.VerifyPods(f.Client, f.Namespace.Name, rcNameWant, false, podsWant); err != nil {
-		return fmt.Errorf("failed to find %d %q pods: %v", podsWant, rcNameWant, err)
-	}
-
-	// Verify service.
-	svc, err := f.Client.Services(f.Namespace.Name).Get(svcNameWant)
-	if err != nil {
-		return fmt.Errorf("error getting service %s: %v", svcNameWant, err)
-	}
-	if svcNameWant != svc.Name {
-		return fmt.Errorf("wanted service name %q, got %q", svcNameWant, svc.Name)
-	}
-	// TODO(mikedanese): Make testLoadBalancerReachable return an error.
-	testLoadBalancerReachable(ingress, 80)
-
-	framework.Logf("Cluster validation succeeded")
-	return nil
-}
-
 // migRollingUpdate starts a MIG rolling update, upgrading the nodes to a new
 // instance template named tmpl, and waits up to nt times the number of nodes
 // for it to complete.
 func migRollingUpdate(tmpl string, nt time.Duration) error {
-	By(fmt.Sprintf("starting the MIG rolling update to %s", tmpl))
+	framework.Logf(fmt.Sprintf("starting the MIG rolling update to %s", tmpl))
 	id, err := migRollingUpdateStart(tmpl, nt)
 	if err != nil {
 		return fmt.Errorf("couldn't start the MIG rolling update: %v", err)
 	}

-	By(fmt.Sprintf("polling the MIG rolling update (%s) until it completes", id))
+	framework.Logf(fmt.Sprintf("polling the MIG rolling update (%s) until it completes", id))
 	if err := migRollingUpdatePoll(id, nt); err != nil {
 		return fmt.Errorf("err waiting until update completed: %v", err)
 	}
@ -611,61 +553,3 @@ func migRollingUpdatePoll(id string, nt time.Duration) error {
 	framework.Logf("MIG rolling update complete after %v", time.Since(start))
 	return nil
 }
-
-func testLoadBalancerReachable(ingress api.LoadBalancerIngress, port int) bool {
-	loadBalancerLagTimeout := loadBalancerLagTimeoutDefault
-	if framework.ProviderIs("aws") {
-		loadBalancerLagTimeout = loadBalancerLagTimeoutAWS
-	}
-	return testLoadBalancerReachableInTime(ingress, port, loadBalancerLagTimeout)
-}
-
-func testLoadBalancerReachableInTime(ingress api.LoadBalancerIngress, port int, timeout time.Duration) bool {
-	ip := ingress.IP
-	if ip == "" {
-		ip = ingress.Hostname
-	}
-
-	return testReachableInTime(conditionFuncDecorator(ip, port, testReachableHTTP, "/", "test-webserver"), timeout)
-
-}
-
-func conditionFuncDecorator(ip string, port int, fn func(string, int, string, string) (bool, error), request string, expect string) wait.ConditionFunc {
-	return func() (bool, error) {
-		return fn(ip, port, request, expect)
-	}
-}
-
-func testReachableInTime(testFunc wait.ConditionFunc, timeout time.Duration) bool {
-	By(fmt.Sprintf("Waiting up to %v", timeout))
-	err := wait.PollImmediate(framework.Poll, timeout, testFunc)
-	if err != nil {
-		Expect(err).NotTo(HaveOccurred(), "Error waiting")
-		return false
-	}
-	return true
-}
-
-func waitForLoadBalancerIngress(c *client.Client, serviceName, namespace string) (*api.Service, error) {
-	// TODO: once support ticket 21807001 is resolved, reduce this timeout
-	// back to something reasonable
-	const timeout = 20 * time.Minute
-	var service *api.Service
-	By(fmt.Sprintf("waiting up to %v for service %s in namespace %s to have a LoadBalancer ingress point", timeout, serviceName, namespace))
-	i := 1
-	for start := time.Now(); time.Since(start) < timeout; time.Sleep(3 * time.Second) {
-		service, err := c.Services(namespace).Get(serviceName)
-		if err != nil {
-			framework.Logf("Get service failed, ignoring for 5s: %v", err)
-			continue
-		}
-		if len(service.Status.LoadBalancer.Ingress) > 0 {
-			return service, nil
-		}
-		if i%5 == 0 {
-			framework.Logf("Waiting for service %s in namespace %s to have a LoadBalancer ingress point (%v)", serviceName, namespace, time.Since(start))
-		}
-		i++
-	}
-	return service, fmt.Errorf("service %s in namespace %s doesn't have a LoadBalancer ingress point after %.2f seconds", serviceName, namespace, timeout.Seconds())
-}
--- a/test/e2e/service.go
+++ b/test/e2e/service.go
@ -481,6 +481,8 @@ var _ = framework.KubeDescribe("Services", func() {

 		// Change the services to LoadBalancer.

+		// Here we test that LoadBalancers can receive static IP addresses.  This isn't
+		// necessary, but is an additional feature this monolithic test checks.
 		requestedIP := ""
 		staticIPName := ""
 		if framework.ProviderIs("gce", "gke") {