/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package e2e

import (
	"bytes"
	"fmt"
	"io"
	"net/http"
	"os"
	"os/exec"
	"path"
	"strings"
	"sync"
	"time"

	"k8s.io/kubernetes/pkg/api"
	client "k8s.io/kubernetes/pkg/client/unversioned"
	"k8s.io/kubernetes/pkg/util/wait"

	. "github.com/onsi/ginkgo"
	. "github.com/onsi/gomega"
)

const (
	// version applies to upgrades; kube-push always pushes local binaries.
	versionURLFmt = "https://storage.googleapis.com/kubernetes-release/%s/%s.txt"
)

// realVersion turns a version constant s into a version string deployable on
// GKE. See hack/get-build.sh for more information.
func realVersion(s string) (string, error) {
	v, _, err := runCmd(path.Join(testContext.RepoRoot, "hack/get-build.sh"), "-v", s)
	if err != nil {
		return v, err
	}
	return strings.TrimPrefix(strings.TrimSpace(v), "v"), nil
}
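
// exampleRealVersionUsage is a hedged usage sketch (not part of the test
// flow): it shows how a version constant might be resolved before an upgrade
// run. The "release/stable" constant is illustrative, not necessarily one the
// suite passes in practice.
func exampleRealVersionUsage() {
	v, err := realVersion("release/stable") // e.g. "1.1.3", "v" prefix stripped
	if err != nil {
		Logf("could not resolve upgrade target: %v", err)
		return
	}
	Logf("resolved upgrade target to %q", v)
}
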
// The following upgrade functions are passed into the framework below and used
// to do the actual upgrades.
var masterUpgrade = func(v string) error {
	switch testContext.Provider {
	case "gce":
		return masterUpgradeGCE(v)
	case "gke":
		return masterUpgradeGKE(v)
	default:
		return fmt.Errorf("masterUpgrade() is not implemented for provider %s", testContext.Provider)
	}
}

func masterUpgradeGCE(rawV string) error {
	v := "v" + rawV
	_, _, err := runCmd(path.Join(testContext.RepoRoot, "hack/e2e-internal/e2e-upgrade.sh"), "-M", v)
	return err
}

func masterUpgradeGKE(v string) error {
	Logf("Upgrading master to %q", v)
	_, _, err := runCmd("gcloud", "container",
		fmt.Sprintf("--project=%s", testContext.CloudConfig.ProjectID),
		fmt.Sprintf("--zone=%s", testContext.CloudConfig.Zone),
		"clusters",
		"upgrade",
		testContext.CloudConfig.Cluster,
		"--master",
		fmt.Sprintf("--cluster-version=%s", v),
		"--quiet")
	return err
}

var masterPush = func(_ string) error {
	// TODO(mikedanese): Make master push use the provided version.
	_, _, err := runCmd(path.Join(testContext.RepoRoot, "hack/e2e-internal/e2e-push.sh"), "-m")
	return err
}

var nodeUpgrade = func(f *Framework, replicas int, v string) error {
	// Perform the upgrade.
	var err error
	switch testContext.Provider {
	case "gce":
		err = nodeUpgradeGCE(v)
	case "gke":
		err = nodeUpgradeGKE(v)
	default:
		err = fmt.Errorf("nodeUpgrade() is not implemented for provider %s", testContext.Provider)
	}
	if err != nil {
		return err
	}

	// Wait for it to complete and validate nodes and pods are healthy.
	Logf("Waiting up to %v for all nodes to be ready after the upgrade", restartNodeReadyAgainTimeout)
	if _, err := checkNodesReady(f.Client, restartNodeReadyAgainTimeout, testContext.CloudConfig.NumNodes); err != nil {
		return err
	}
	Logf("Waiting up to %v for all pods to be running and ready after the upgrade", restartPodReadyAgainTimeout)
	return waitForPodsRunningReady(f.Namespace.Name, replicas, restartPodReadyAgainTimeout)
}

func nodeUpgradeGCE(rawV string) error {
	v := "v" + rawV
	Logf("Preparing node upgrade by creating new instance template for %q", v)
	stdout, _, err := runCmd(path.Join(testContext.RepoRoot, "hack/e2e-internal/e2e-upgrade.sh"), "-P", v)
	if err != nil {
		return err
	}
	tmpl := strings.TrimSpace(stdout)

	Logf("Performing a node upgrade to %q; waiting at most %v per node", tmpl, restartPerNodeTimeout)
	if err := migRollingUpdate(tmpl, restartPerNodeTimeout); err != nil {
		return fmt.Errorf("error doing node upgrade via a migRollingUpdate to %s: %v", tmpl, err)
	}
	return nil
}

func nodeUpgradeGKE(v string) error {
	Logf("Upgrading nodes to %q", v)
	_, _, err := runCmd("gcloud", "container",
		fmt.Sprintf("--project=%s", testContext.CloudConfig.ProjectID),
		fmt.Sprintf("--zone=%s", testContext.CloudConfig.Zone),
		"clusters",
		"upgrade",
		testContext.CloudConfig.Cluster,
		fmt.Sprintf("--cluster-version=%s", v),
		"--quiet")
	return err
}

var _ = Describe("Upgrade [Feature:Upgrade]", func() {
	svcName, replicas := "baz", 2
	var rcName, ip, v string
	var ingress api.LoadBalancerIngress

	BeforeEach(func() {
		// The version is determined once at the beginning of the test so that
		// the master and nodes won't be skewed if the value changes during the
		// test.
		By(fmt.Sprintf("Getting real version for %q", testContext.UpgradeTarget))
		var err error
		v, err = realVersion(testContext.UpgradeTarget)
		expectNoError(err)
		Logf("Version for %q is %q", testContext.UpgradeTarget, v)
	})

	f := NewFramework("cluster-upgrade")
	var w *ServiceTestFixture
	BeforeEach(func() {
		By("Setting up the service, RC, and pods")
		w = NewServerTest(f.Client, f.Namespace.Name, svcName)
		rc := w.CreateWebserverRC(replicas)
		rcName = rc.ObjectMeta.Name
		svc := w.BuildServiceSpec()
		svc.Spec.Type = api.ServiceTypeLoadBalancer
		w.CreateService(svc)

		By("Waiting for the service to become reachable")
		result, err := waitForLoadBalancerIngress(f.Client, svcName, f.Namespace.Name)
		Expect(err).NotTo(HaveOccurred())
		ingresses := result.Status.LoadBalancer.Ingress
		if len(ingresses) != 1 {
			Failf("Was expecting only 1 ingress IP but got %d (%v): %v", len(ingresses), ingresses, result)
		}
		ingress = ingresses[0]
		Logf("Got load balancer ingress point %v", ingress)
		ip = ingress.IP
		if ip == "" {
			ip = ingress.Hostname
		}
		testLoadBalancerReachable(ingress, 80)

		// TODO(mikedanese): Add setup, validate, and teardown for:
		//  - secrets
		//  - volumes
		//  - persistent volumes
	})

	AfterEach(func() {
		w.Cleanup()
	})

	Describe("kube-push", func() {
		It("of master should maintain responsive services", func() {
			By("Validating cluster before master upgrade")
			expectNoError(validate(f, svcName, rcName, ingress, replicas))
			By("Performing a master upgrade")
			testMasterUpgrade(ip, v, masterPush)
			By("Validating cluster after master upgrade")
			expectNoError(validate(f, svcName, rcName, ingress, replicas))
		})
	})

	Describe("upgrade-master", func() {
		It("should maintain responsive services", func() {
			By("Validating cluster before master upgrade")
			expectNoError(validate(f, svcName, rcName, ingress, replicas))
			By("Performing a master upgrade")
			testMasterUpgrade(ip, v, masterUpgrade)
			By("Checking master version")
			expectNoError(checkMasterVersion(f.Client, v))
			By("Validating cluster after master upgrade")
			expectNoError(validate(f, svcName, rcName, ingress, replicas))
		})
	})

	Describe("upgrade-cluster", func() {
		var tmplBefore, tmplAfter string
		BeforeEach(func() {
			if providerIs("gce") {
				By("Getting the node template before the upgrade")
				var err error
				tmplBefore, err = migTemplate()
				expectNoError(err)
			}
		})

		AfterEach(func() {
			if providerIs("gce") {
				By("Cleaning up any unused node templates")
				var err error
				tmplAfter, err = migTemplate()
				if err != nil {
					Logf("Could not get node template post-upgrade; may have leaked template %s", tmplBefore)
					return
				}
				if tmplBefore == tmplAfter {
					// The node upgrade failed so there's no need to delete
					// anything.
					Logf("Node template %s is still in use; not cleaning up", tmplBefore)
					return
				}
				Logf("Deleting node template %s", tmplBefore)
				if _, _, err := retryCmd("gcloud", "compute", "instance-templates",
					fmt.Sprintf("--project=%s", testContext.CloudConfig.ProjectID),
					"delete",
					tmplBefore); err != nil {
					Logf("gcloud compute instance-templates delete %s call failed with err: %v", tmplBefore, err)
					Logf("May have leaked instance template %q", tmplBefore)
				}
			}
		})

		It("should maintain a functioning cluster", func() {
			By("Validating cluster before node upgrade")
			expectNoError(validate(f, svcName, rcName, ingress, replicas))
			By("Performing a node upgrade")
			testNodeUpgrade(f, nodeUpgrade, replicas, v)
			By("Validating cluster after node upgrade")
			expectNoError(validate(f, svcName, rcName, ingress, replicas))
		})
	})
})

func testMasterUpgrade(ip, v string, mUp func(v string) error) {
	Logf("Starting async validation")
	httpClient := http.Client{Timeout: 2 * time.Second}
	done := make(chan struct{}, 1)
	// Let's make sure we've finished the heartbeat before shutting things down.
	var wg sync.WaitGroup
	go wait.Until(func() {
		defer GinkgoRecover()
		wg.Add(1)
		defer wg.Done()
		if err := wait.Poll(poll, singleCallTimeout, func() (bool, error) {
			r, err := httpClient.Get("http://" + ip)
			if err != nil {
				Logf("Error reaching %s: %v", ip, err)
				return false, nil
			}
			if r.StatusCode < http.StatusOK || r.StatusCode >= http.StatusNotFound {
				Logf("Bad response; status: %d, response: %v", r.StatusCode, r)
				return false, nil
			}
			return true, nil
		}); err != nil {
			// We log the error here because the test will fail at the very end
			// because this validation runs in another goroutine. Without this,
			// a failure is very confusing to track down because from the logs
			// everything looks fine.
			msg := fmt.Sprintf("Failed to contact service during master upgrade: %v", err)
			Logf(msg)
			Failf(msg)
		}
	}, 200*time.Millisecond, done)

	Logf("Starting master upgrade")
	expectNoError(mUp(v))
	done <- struct{}{}
	Logf("Stopping async validation")
	wg.Wait()
	Logf("Master upgrade complete")
}
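
// exampleUntilStop is a hedged sketch of the wait.Until contract that
// testMasterUpgrade relies on: the given func runs once per period until the
// stop channel yields a value (or is closed), which is why a single send on
// done halts the async validation loop above.
func exampleUntilStop() {
	stop := make(chan struct{}, 1)
	go wait.Until(func() { Logf("heartbeat") }, 200*time.Millisecond, stop)
	time.Sleep(time.Second) // let a few iterations run
	stop <- struct{}{}      // buffered send unblocks Until's stop select
}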

func checkMasterVersion(c *client.Client, want string) error {
	v, err := c.Discovery().ServerVersion()
	if err != nil {
		return fmt.Errorf("checkMasterVersion() couldn't get the master version: %v", err)
	}
	// We do prefix trimming and then matching because:
	// want looks like:  0.19.3-815-g50e67d4
	// got looks like:  v0.19.3-815-g50e67d4034e858-dirty
	got := strings.TrimPrefix(v.GitVersion, "v")
	if !strings.HasPrefix(got, want) {
		return fmt.Errorf("master had kube-apiserver version %s which does not start with %s",
			got, want)
	}
	Logf("Master is at version %s", want)
	return nil
}

func testNodeUpgrade(f *Framework, nUp func(f *Framework, n int, v string) error, replicas int, v string) {
	Logf("Starting node upgrade")
	expectNoError(nUp(f, replicas, v))
	Logf("Node upgrade complete")
	By("Checking node versions")
	expectNoError(checkNodesVersions(f.Client, v))
	Logf("All nodes are at version %s", v)
}

func checkNodesVersions(c *client.Client, want string) error {
	l := ListSchedulableNodesOrDie(c)
	for _, n := range l.Items {
		// We do prefix trimming and then matching because:
		// want looks like:    0.19.3-815-g50e67d4
		// kv/kpv look like:  v0.19.3-815-g50e67d4034e858-dirty
		kv, kpv := strings.TrimPrefix(n.Status.NodeInfo.KubeletVersion, "v"),
			strings.TrimPrefix(n.Status.NodeInfo.KubeProxyVersion, "v")
		if !strings.HasPrefix(kv, want) {
			return fmt.Errorf("node %s had kubelet version %s which does not start with %s",
				n.ObjectMeta.Name, kv, want)
		}
		if !strings.HasPrefix(kpv, want) {
			return fmt.Errorf("node %s had kube-proxy version %s which does not start with %s",
				n.ObjectMeta.Name, kpv, want)
		}
	}
	return nil
}

// retryCmd runs cmd using args and retries it for up to singleCallTimeout if
// it returns an error. It returns stdout and stderr.
func retryCmd(command string, args ...string) (string, string, error) {
	var err error
	stdout, stderr := "", ""
	wait.Poll(poll, singleCallTimeout, func() (bool, error) {
		stdout, stderr, err = runCmd(command, args...)
		if err != nil {
			Logf("Got %v", err)
			return false, nil
		}
		return true, nil
	})
	return stdout, stderr, err
}
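
// exampleRetryCmdUsage is a hedged sketch of how retryCmd wraps a flaky CLI
// call: failures are logged and retried until success or singleCallTimeout
// elapses. The "gcloud version" invocation is illustrative only.
func exampleRetryCmdUsage() {
	stdout, stderr, err := retryCmd("gcloud", "version")
	if err != nil {
		Logf("gcloud never succeeded within the timeout: %v (stderr: %q)", err, stderr)
		return
	}
	Logf("gcloud version reported: %q", stdout)
}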

// runCmd runs cmd using args and returns its stdout and stderr. It also outputs
// cmd's stdout and stderr to their respective OS streams.
//
// TODO(ihmccreery) This function should either be moved into util.go or
// removed; other e2e's use bare exec.Command.
func runCmd(command string, args ...string) (string, string, error) {
	Logf("Running %s %v", command, args)
	var bout, berr bytes.Buffer
	cmd := exec.Command(command, args...)
	// We also output to the OS stdout/stderr to aid in debugging in case cmd
	// hangs and never returns before the test gets killed.
	cmd.Stdout = io.MultiWriter(os.Stdout, &bout)
	cmd.Stderr = io.MultiWriter(os.Stderr, &berr)
	err := cmd.Run()
	stdout, stderr := bout.String(), berr.String()
	if err != nil {
		return "", "", fmt.Errorf("error running %s %v; got error %v, stdout %q, stderr %q",
			command, args, err, stdout, stderr)
	}
	return stdout, stderr, nil
}

func validate(f *Framework, svcNameWant, rcNameWant string, ingress api.LoadBalancerIngress, podsWant int) error {
	Logf("Beginning cluster validation")
	// Verify RC.
	rcs, err := f.Client.ReplicationControllers(f.Namespace.Name).List(api.ListOptions{})
	if err != nil {
		return fmt.Errorf("error listing RCs: %v", err)
	}
	if len(rcs.Items) != 1 {
		return fmt.Errorf("wanted 1 RC with name %s, got %d", rcNameWant, len(rcs.Items))
	}
	if got := rcs.Items[0].Name; got != rcNameWant {
		return fmt.Errorf("wanted RC name %q, got %q", rcNameWant, got)
	}

	// Verify pods.
	if err := verifyPods(f.Client, f.Namespace.Name, rcNameWant, false, podsWant); err != nil {
		return fmt.Errorf("failed to find %d %q pods: %v", podsWant, rcNameWant, err)
	}

	// Verify service.
	svc, err := f.Client.Services(f.Namespace.Name).Get(svcNameWant)
	if err != nil {
		return fmt.Errorf("error getting service %s: %v", svcNameWant, err)
	}
	if svcNameWant != svc.Name {
		return fmt.Errorf("wanted service name %q, got %q", svcNameWant, svc.Name)
	}
	// TODO(mikedanese): Make testLoadBalancerReachable return an error.
	testLoadBalancerReachable(ingress, 80)

	Logf("Cluster validation succeeded")
	return nil
}

// migRollingUpdate starts a MIG rolling update, upgrading the nodes to a new
// instance template named tmpl, and waits up to nt times the number of nodes
// for it to complete.
func migRollingUpdate(tmpl string, nt time.Duration) error {
	By(fmt.Sprintf("starting the MIG rolling update to %s", tmpl))
	id, err := migRollingUpdateStart(tmpl, nt)
	if err != nil {
		return fmt.Errorf("couldn't start the MIG rolling update: %v", err)
	}

	By(fmt.Sprintf("polling the MIG rolling update (%s) until it completes", id))
	if err := migRollingUpdatePoll(id, nt); err != nil {
		return fmt.Errorf("err waiting until update completed: %v", err)
	}

	return nil
}

// migTemplate (GCE/GKE-only) returns the name of the MIG template that the
// nodes of the cluster use.
func migTemplate() (string, error) {
	var errLast error
	var templ string
	key := "instanceTemplate"
	if wait.Poll(poll, singleCallTimeout, func() (bool, error) {
		// TODO(mikedanese): make this hit the compute API directly instead of
		// shelling out to gcloud.
		// An `instance-groups managed describe` call outputs what we want to stdout.
		output, _, err := retryCmd("gcloud", "compute", "instance-groups", "managed",
			fmt.Sprintf("--project=%s", testContext.CloudConfig.ProjectID),
			"describe",
			fmt.Sprintf("--zone=%s", testContext.CloudConfig.Zone),
			testContext.CloudConfig.NodeInstanceGroup)
		if err != nil {
			errLast = fmt.Errorf("gcloud compute instance-groups managed describe call failed with err: %v", err)
			return false, nil
		}

		// The 'describe' call probably succeeded; parse the output and try to
		// find the line that looks like "instanceTemplate: url/to/<templ>" and
		// return <templ>.
		if val := parseKVLines(output, key); len(val) > 0 {
			url := strings.Split(val, "/")
			templ = url[len(url)-1]
			Logf("MIG group %s using template: %s", testContext.CloudConfig.NodeInstanceGroup, templ)
			return true, nil
		}
		errLast = fmt.Errorf("couldn't find %s in output to get MIG template. Output: %s", key, output)
		return false, nil
	}) != nil {
		return "", fmt.Errorf("migTemplate() failed with last error: %v", errLast)
	}
	return templ, nil
}
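
// exampleMigTemplateParse is a hedged illustration of the parsing step above.
// It assumes parseKVLines (a shared e2e helper) returns the value following
// the "instanceTemplate:" key in the describe output; the template name is
// the final path component of that URL.
func exampleMigTemplateParse(describeOutput string) string {
	val := parseKVLines(describeOutput, "instanceTemplate")
	if val == "" {
		return ""
	}
	url := strings.Split(val, "/")
	return url[len(url)-1] // ".../instanceTemplates/my-templ" -> "my-templ"
}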

// migRollingUpdateStart (GCE/GKE-only) starts a MIG rolling update using templ
// as the new template, waiting up to nt per node, and returns the ID of that
// update.
func migRollingUpdateStart(templ string, nt time.Duration) (string, error) {
	var errLast error
	var id string
	prefix, suffix := "Started [", "]."
	if err := wait.Poll(poll, singleCallTimeout, func() (bool, error) {
		// TODO(mikedanese): make this hit the compute API directly instead of
		// shelling out to gcloud.
		// NOTE(mikedanese): If you are changing this gcloud command, update
		// cluster/gce/upgrade.sh to match this EXACTLY.
		// A `rolling-updates start` call outputs what we want to stderr.
		_, output, err := retryCmd("gcloud", append(migUpdateCmdBase(),
			"rolling-updates",
			fmt.Sprintf("--project=%s", testContext.CloudConfig.ProjectID),
			fmt.Sprintf("--zone=%s", testContext.CloudConfig.Zone),
			"start",
			// Required args.
			fmt.Sprintf("--group=%s", testContext.CloudConfig.NodeInstanceGroup),
			fmt.Sprintf("--template=%s", templ),
			// Optional args to fine-tune behavior.
			fmt.Sprintf("--instance-startup-timeout=%ds", int(nt.Seconds())),
			// NOTE: We can speed up this process by increasing
			//       --max-num-concurrent-instances.
			fmt.Sprintf("--max-num-concurrent-instances=%d", 1),
			fmt.Sprintf("--max-num-failed-instances=%d", 0),
			fmt.Sprintf("--min-instance-update-time=%ds", 0))...)
		if err != nil {
			errLast = fmt.Errorf("rolling-updates call failed with err: %v", err)
			return false, nil
		}

		// The 'start' call probably succeeded; parse the output and try to find
		// the line that looks like "Started [url/to/<id>]." and return <id>.
		for _, line := range strings.Split(output, "\n") {
			// As a sanity check, ensure the line starts with prefix and ends
			// with suffix.
			if strings.Index(line, prefix) != 0 || strings.Index(line, suffix) != len(line)-len(suffix) {
				continue
			}
			url := strings.Split(strings.TrimSuffix(strings.TrimPrefix(line, prefix), suffix), "/")
			id = url[len(url)-1]
			Logf("Started MIG rolling update; ID: %s", id)
			return true, nil
		}
		errLast = fmt.Errorf("couldn't find line like '%s ... %s' in output to MIG rolling-update start. Output: %s",
			prefix, suffix, output)
		return false, nil
	}); err != nil {
		return "", fmt.Errorf("migRollingUpdateStart() failed with last error: %v", errLast)
	}
	return id, nil
}
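
// exampleParseRollingUpdateID is a hedged sketch of the line parsing performed
// in migRollingUpdateStart: it extracts <id> from a line of the exact form
// "Started [url/to/<id>].", returning "" when the line doesn't match.
func exampleParseRollingUpdateID(line string) string {
	prefix, suffix := "Started [", "]."
	if !strings.HasPrefix(line, prefix) || !strings.HasSuffix(line, suffix) {
		return ""
	}
	url := strings.Split(strings.TrimSuffix(strings.TrimPrefix(line, prefix), suffix), "/")
	return url[len(url)-1]
}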

// migUpdateCmdBase gets the base of the MIG rolling update command--i.e., all
// pieces of the gcloud command that come after "gcloud" but before
// "rolling-updates". Examples of returned values are:
//
//   {"preview"}
//
//   {"alpha", "compute"}
//
// TODO(mikedanese): Remove this hack on July 29, 2015 when the migration to
// `gcloud alpha compute rolling-updates` is complete.
func migUpdateCmdBase() []string {
	b := []string{"preview"}
	a := []string{"rolling-updates", "-h"}
	if err := exec.Command("gcloud", append(b, a...)...).Run(); err != nil {
		b = []string{"alpha", "compute"}
	}
	return b
}
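
// exampleMigUpdateArgv is a hedged sketch of how migUpdateCmdBase composes
// with the rest of an invocation: depending on the installed gcloud, the argv
// after "gcloud" is either {"preview", "rolling-updates", ...} or
// {"alpha", "compute", "rolling-updates", ...}. The trailing args are
// illustrative.
func exampleMigUpdateArgv(id string) []string {
	return append(migUpdateCmdBase(),
		"rolling-updates",
		fmt.Sprintf("--project=%s", testContext.CloudConfig.ProjectID),
		fmt.Sprintf("--zone=%s", testContext.CloudConfig.Zone),
		"describe",
		id)
}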

// migRollingUpdatePoll (GCE/GKE-only) polls the progress of the MIG rolling
// update with ID id until it is complete. It returns an error if this takes
// longer than nt times the number of nodes.
func migRollingUpdatePoll(id string, nt time.Duration) error {
	// Two keys and a val.
	status, progress, done := "status", "statusMessage", "ROLLED_OUT"
	start, timeout := time.Now(), nt*time.Duration(testContext.CloudConfig.NumNodes)
	var errLast error
	Logf("Waiting up to %v for MIG rolling update to complete.", timeout)
	if wait.Poll(restartPoll, timeout, func() (bool, error) {
		// A `rolling-updates describe` call outputs what we want to stdout.
		output, _, err := retryCmd("gcloud", append(migUpdateCmdBase(),
			"rolling-updates",
			fmt.Sprintf("--project=%s", testContext.CloudConfig.ProjectID),
			fmt.Sprintf("--zone=%s", testContext.CloudConfig.Zone),
			"describe",
			id)...)
		if err != nil {
			errLast = fmt.Errorf("Error calling rolling-updates describe %s: %v", id, err)
			Logf("%v", errLast)
			return false, nil
		}

		// The 'describe' call probably succeeded; parse the output and try to
		// find the line that looks like "status: <status>" and see whether it's
		// done.
		Logf("Waiting for MIG rolling update: %s (%v elapsed)",
			parseKVLines(output, progress), time.Since(start))
		if st := parseKVLines(output, status); st == done {
			return true, nil
		}
		return false, nil
	}) != nil {
		return fmt.Errorf("timeout waiting %v for MIG rolling update to complete. Last error: %v", timeout, errLast)
	}
	Logf("MIG rolling update complete after %v", time.Since(start))
	return nil
}

func testLoadBalancerReachable(ingress api.LoadBalancerIngress, port int) bool {
	loadBalancerLagTimeout := loadBalancerLagTimeoutDefault
	if providerIs("aws") {
		loadBalancerLagTimeout = loadBalancerLagTimeoutAWS
	}
	return testLoadBalancerReachableInTime(ingress, port, loadBalancerLagTimeout)
}

func testLoadBalancerReachableInTime(ingress api.LoadBalancerIngress, port int, timeout time.Duration) bool {
	ip := ingress.IP
	if ip == "" {
		ip = ingress.Hostname
	}
	return testReachableInTime(conditionFuncDecorator(ip, port, testReachableHTTP, "/", "test-webserver"), timeout)
}

func conditionFuncDecorator(ip string, port int, fn func(string, int, string, string) (bool, error), request string, expect string) wait.ConditionFunc {
	return func() (bool, error) {
		return fn(ip, port, request, expect)
	}
}
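
// exampleReachabilityPoll is a hedged sketch showing how conditionFuncDecorator
// adapts the four-argument HTTP checker into the zero-argument
// wait.ConditionFunc that the generic poller below expects; the 2-minute
// timeout is illustrative.
func exampleReachabilityPoll(ip string) bool {
	fn := conditionFuncDecorator(ip, 80, testReachableHTTP, "/", "test-webserver")
	return testReachableInTime(fn, 2*time.Minute)
}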

func testReachableInTime(testFunc wait.ConditionFunc, timeout time.Duration) bool {
	By(fmt.Sprintf("Waiting up to %v", timeout))
	err := wait.PollImmediate(poll, timeout, testFunc)
	if err != nil {
		Expect(err).NotTo(HaveOccurred(), "Error waiting")
		return false
	}
	return true
}

func waitForLoadBalancerIngress(c *client.Client, serviceName, namespace string) (*api.Service, error) {
	// TODO: once support ticket 21807001 is resolved, reduce this timeout
	// back to something reasonable.
	const timeout = 20 * time.Minute
	var service *api.Service
	By(fmt.Sprintf("waiting up to %v for service %s in namespace %s to have a LoadBalancer ingress point", timeout, serviceName, namespace))
	i := 1
	for start := time.Now(); time.Since(start) < timeout; time.Sleep(3 * time.Second) {
		var err error
		// Assign to the outer service variable so the most recent object is
		// returned even on timeout.
		service, err = c.Services(namespace).Get(serviceName)
		if err != nil {
			Logf("Get service failed, retrying in 3s: %v", err)
			continue
		}
		if len(service.Status.LoadBalancer.Ingress) > 0 {
			return service, nil
		}
		if i%5 == 0 {
			Logf("Waiting for service %s in namespace %s to have a LoadBalancer ingress point (%v)", serviceName, namespace, time.Since(start))
		}
		i++
	}
	return service, fmt.Errorf("service %s in namespace %s doesn't have a LoadBalancer ingress point after %.2f seconds", serviceName, namespace, timeout.Seconds())
}