mirror of https://github.com/k3s-io/k3s
E2E test node upgrade (to same version)
parent
a8a3e9d0c7
commit
a6c47a07de
|
@ -57,7 +57,7 @@ type WaitFunc func() <-chan struct{}
|
|||
// placed on the channel and once more when the channel is closed. If c
|
||||
// returns an error the loop ends and that error is returned, and if c returns
|
||||
// true the loop ends and nil is returned. ErrWaitTimeout will be returned if
|
||||
// the channel is closed without c every returning true.
|
||||
// the channel is closed without c ever returning true.
|
||||
func WaitFor(wait WaitFunc, c ConditionFunc) error {
|
||||
w := wait()
|
||||
for {
|
||||
|
|
|
@ -30,28 +30,17 @@ import (
|
|||
)
|
||||
|
||||
const (
|
||||
// How long to pause between polling node or pod status.
|
||||
poll = 5 * time.Second
|
||||
|
||||
// How long nodes have to be "ready" before the reboot. They should already
|
||||
// be "ready" before the test starts, so this is small.
|
||||
nodeReadyInitialTimeout = 20 * time.Second
|
||||
|
||||
// How long pods have to be "ready" before the reboot. They should already
|
||||
// be "ready" before the test starts, so this is small.
|
||||
podReadyBeforeTimeout = 20 * time.Second
|
||||
|
||||
// How long a node is allowed to go from "Ready" to "NotReady" after a
|
||||
// reboot is issued before the test is considered failed.
|
||||
rebootNotReadyTimeout = 2 * time.Minute
|
||||
rebootNodeNotReadyTimeout = 2 * time.Minute
|
||||
|
||||
// How long a node is allowed to go from "NotReady" to "Ready" after a
|
||||
// reboot is issued and it is found to be "NotReady" before the test is
|
||||
// considered failed.
|
||||
rebootReadyAgainTimeout = 5 * time.Minute
|
||||
rebootNodeReadyAgainTimeout = 5 * time.Minute
|
||||
|
||||
// How long pods have to be "ready" after the reboot.
|
||||
podReadyAgainTimeout = 5 * time.Minute
|
||||
rebootPodReadyAgainTimeout = 5 * time.Minute
|
||||
)
|
||||
|
||||
var _ = Describe("Reboot", func() {
|
||||
|
@ -105,7 +94,7 @@ func testReboot(c *client.Client, rebootCmd string) {
|
|||
}
|
||||
|
||||
// Get all nodes, and kick off the test on each.
|
||||
nodelist, err := c.Nodes().List(labels.Everything(), fields.Everything())
|
||||
nodelist, err := listNodes(c, labels.Everything(), fields.Everything())
|
||||
if err != nil {
|
||||
Failf("Error getting nodes: %v", err)
|
||||
}
|
||||
|
@ -159,6 +148,10 @@ func issueSSHCommand(node *api.Node, provider, cmd string) error {
|
|||
// It returns true through result only if all of the steps pass; at the first
|
||||
// failed step, it will return false through result and not run the rest.
|
||||
func rebootNode(c *client.Client, provider, name, rebootCmd string, result chan bool) {
|
||||
// Setup
|
||||
ps := newPodStore(c, api.NamespaceDefault, labels.Everything(), fields.OneTermEqualSelector(client.PodHost, name))
|
||||
defer ps.Stop()
|
||||
|
||||
// Get the node initially.
|
||||
Logf("Getting %s", name)
|
||||
node, err := c.Nodes().Get(name)
|
||||
|
@ -175,22 +168,16 @@ func rebootNode(c *client.Client, provider, name, rebootCmd string, result chan
|
|||
}
|
||||
|
||||
// Get all the pods on the node.
|
||||
podList, err := c.Pods(api.NamespaceDefault).List(
|
||||
labels.Everything(), fields.OneTermEqualSelector(client.PodHost, name))
|
||||
if err != nil {
|
||||
Logf("Error getting pods for node %s: %v", name, err)
|
||||
result <- false
|
||||
return
|
||||
}
|
||||
podNames := make([]string, len(podList.Items))
|
||||
for i, p := range podList.Items {
|
||||
pods := ps.List()
|
||||
podNames := make([]string, len(pods))
|
||||
for i, p := range pods {
|
||||
podNames[i] = p.ObjectMeta.Name
|
||||
}
|
||||
Logf("Node %s has %d pods: %v", name, len(podNames), podNames)
|
||||
|
||||
// For each pod, we do a sanity check to ensure it's running / healthy
|
||||
// now, as that's what we'll be checking later.
|
||||
if !checkPodsRunning(c, podNames, podReadyBeforeTimeout) {
|
||||
if !checkPodsRunningReady(c, podNames, podReadyBeforeTimeout) {
|
||||
result <- false
|
||||
return
|
||||
}
|
||||
|
@ -202,20 +189,20 @@ func rebootNode(c *client.Client, provider, name, rebootCmd string, result chan
|
|||
}
|
||||
|
||||
// Wait for some kind of "not ready" status.
|
||||
if !waitForNodeToBeNotReady(c, name, rebootNotReadyTimeout) {
|
||||
if !waitForNodeToBeNotReady(c, name, rebootNodeNotReadyTimeout) {
|
||||
result <- false
|
||||
return
|
||||
}
|
||||
|
||||
// Wait for some kind of "ready" status.
|
||||
if !waitForNodeToBeReady(c, name, rebootReadyAgainTimeout) {
|
||||
if !waitForNodeToBeReady(c, name, rebootNodeReadyAgainTimeout) {
|
||||
result <- false
|
||||
return
|
||||
}
|
||||
|
||||
// Ensure all of the pods that we found on this node before the reboot are
|
||||
// running / healthy.
|
||||
if !checkPodsRunning(c, podNames, podReadyAgainTimeout) {
|
||||
if !checkPodsRunningReady(c, podNames, rebootPodReadyAgainTimeout) {
|
||||
result <- false
|
||||
return
|
||||
}
|
||||
|
@ -223,72 +210,3 @@ func rebootNode(c *client.Client, provider, name, rebootCmd string, result chan
|
|||
Logf("Reboot successful on node %s", name)
|
||||
result <- true
|
||||
}
|
||||
|
||||
// checkPodsRunning returns whether all pods whose names are listed in podNames
|
||||
// are running.
|
||||
func checkPodsRunning(c *client.Client, podNames []string, timeout time.Duration) bool {
|
||||
desc := "running and ready"
|
||||
Logf("Waiting up to %v for the following pods to be %s: %s", timeout, desc, podNames)
|
||||
result := make(chan bool, len(podNames))
|
||||
for ix := range podNames {
|
||||
// Launch off pod readiness checkers.
|
||||
go func(name string) {
|
||||
err := waitForPodCondition(c, api.NamespaceDefault, name, desc,
|
||||
poll, timeout, podRunningReady)
|
||||
result <- err == nil
|
||||
}(podNames[ix])
|
||||
}
|
||||
// Wait for them all to finish.
|
||||
success := true
|
||||
// TODO(mbforbes): Change to `for range` syntax and remove logging once we
|
||||
// support only Go >= 1.4.
|
||||
for _, podName := range podNames {
|
||||
if !<-result {
|
||||
Logf("Pod %s failed to be %s.", podName, desc)
|
||||
success = false
|
||||
}
|
||||
}
|
||||
Logf("Wanted all pods to be %s. Result: %t. Pods: %v", desc, success, podNames)
|
||||
return success
|
||||
}
|
||||
|
||||
// waitForNodeToBeReady returns whether node name is ready within timeout.
|
||||
func waitForNodeToBeReady(c *client.Client, name string, timeout time.Duration) bool {
|
||||
return waitForNodeToBe(c, name, true, timeout)
|
||||
}
|
||||
|
||||
// waitForNodeToBeNotReady returns whether node name is not ready (i.e. the
|
||||
// readiness condition is anything but ready, e.g false or unknown) within
|
||||
// timeout.
|
||||
func waitForNodeToBeNotReady(c *client.Client, name string, timeout time.Duration) bool {
|
||||
return waitForNodeToBe(c, name, false, timeout)
|
||||
}
|
||||
|
||||
// waitForNodeToBe returns whether node name's readiness state matches wantReady
|
||||
// within timeout. If wantReady is true, it will ensure the node is ready; if
|
||||
// it's false, it ensures the node is in any state other than ready (e.g. not
|
||||
// ready or unknown).
|
||||
func waitForNodeToBe(c *client.Client, name string, wantReady bool, timeout time.Duration) bool {
|
||||
Logf("Waiting up to %v for node %s readiness to be %t", timeout, name, wantReady)
|
||||
for start := time.Now(); time.Since(start) < timeout; time.Sleep(poll) {
|
||||
node, err := c.Nodes().Get(name)
|
||||
if err != nil {
|
||||
Logf("Couldn't get node %s", name)
|
||||
continue
|
||||
}
|
||||
|
||||
// Check the node readiness condition (logging all).
|
||||
for i, cond := range node.Status.Conditions {
|
||||
Logf("Node %s condition %d/%d: type: %v, status: %v",
|
||||
name, i+1, len(node.Status.Conditions), cond.Type, cond.Status)
|
||||
// Ensure that the condition type is readiness and the status
|
||||
// matches as desired.
|
||||
if cond.Type == api.NodeReady && (cond.Status == api.ConditionTrue) == wantReady {
|
||||
Logf("Successfully found node %s readiness to be %t", name, wantReady)
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
Logf("Node %s didn't reach desired readiness (%t) within %v", name, wantReady, timeout)
|
||||
return false
|
||||
}
|
||||
|
|
|
@ -0,0 +1,383 @@
|
|||
/*
|
||||
Copyright 2015 The Kubernetes Authors All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package e2e
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
|
||||
"github.com/GoogleCloudPlatform/kubernetes/pkg/client"
|
||||
"github.com/GoogleCloudPlatform/kubernetes/pkg/fields"
|
||||
"github.com/GoogleCloudPlatform/kubernetes/pkg/labels"
|
||||
"github.com/GoogleCloudPlatform/kubernetes/pkg/util/wait"
|
||||
|
||||
. "github.com/onsi/ginkgo"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
const (
|
||||
// How long each node is given during a process that restarts all nodes
|
||||
// before the test is considered failed. (Note that the total time to
|
||||
// restart all nodes will be this number times the nubmer of nodes.)
|
||||
restartPerNodeTimeout = 5 * time.Minute
|
||||
|
||||
// How often to poll the statues of a restart.
|
||||
restartPoll = 20 * time.Second
|
||||
|
||||
// How long a node is allowed to become "Ready" after it is restarted before
|
||||
// the test is considered failed.
|
||||
restartNodeReadyAgainTimeout = 5 * time.Minute
|
||||
|
||||
// How long a pod is allowed to become "running" and "ready" after a node
|
||||
// restart before test is considered failed.
|
||||
restartPodReadyAgainTimeout = 5 * time.Minute
|
||||
)
|
||||
|
||||
var _ = Describe("Restart", func() {
|
||||
var c *client.Client
|
||||
var ps *podStore
|
||||
|
||||
BeforeEach(func() {
|
||||
var err error
|
||||
c, err = loadClient()
|
||||
Expect(err).NotTo(HaveOccurred())
|
||||
ps = newPodStore(c, api.NamespaceDefault, labels.Everything(), fields.Everything())
|
||||
})
|
||||
|
||||
AfterEach(func() {
|
||||
ps.Stop()
|
||||
})
|
||||
|
||||
It("should restart all nodes and ensure all nodes and pods recover", func() {
|
||||
// This test requires the ability to restart all nodes, so the provider
|
||||
// check must be identical to that call.
|
||||
provider := testContext.Provider
|
||||
nn := testContext.CloudConfig.NumNodes
|
||||
if !providerIs("gce") {
|
||||
By(fmt.Sprintf("Skipping reboot test, which is not implemented for %s", provider))
|
||||
return
|
||||
}
|
||||
|
||||
By("ensuring all nodes are ready")
|
||||
nodeNamesBefore, err := checkNodesReady(c, nodeReadyInitialTimeout, nn)
|
||||
Expect(err).NotTo(HaveOccurred())
|
||||
Logf("Got the following nodes before restart: %v", nodeNamesBefore)
|
||||
|
||||
By("ensuring all pods are running and ready")
|
||||
pods := ps.List()
|
||||
podNamesBefore := make([]string, len(pods))
|
||||
for i, p := range pods {
|
||||
podNamesBefore[i] = p.ObjectMeta.Name
|
||||
}
|
||||
if !checkPodsRunningReady(c, podNamesBefore, podReadyBeforeTimeout) {
|
||||
Failf("At least one pod wasn't running and ready at test start.")
|
||||
}
|
||||
|
||||
By("restarting all of the nodes")
|
||||
err = restartNodes(provider, restartPerNodeTimeout)
|
||||
Expect(err).NotTo(HaveOccurred())
|
||||
|
||||
By("ensuring all nodes are ready after the restart")
|
||||
nodeNamesAfter, err := checkNodesReady(c, restartNodeReadyAgainTimeout, nn)
|
||||
Expect(err).NotTo(HaveOccurred())
|
||||
Logf("Got the following nodes after restart: %v", nodeNamesAfter)
|
||||
|
||||
// Make sure that we have the same number of nodes. We're not checking
|
||||
// that the names match because that's implementation specific.
|
||||
By("ensuring the same number of nodes exist after the restart")
|
||||
if len(nodeNamesBefore) != len(nodeNamesAfter) {
|
||||
Failf("Had %d nodes before nodes were restarted, but now only have %d",
|
||||
len(nodeNamesBefore), len(nodeNamesAfter))
|
||||
}
|
||||
|
||||
// Make sure that we have the same number of pods. We're not checking
|
||||
// that the names match because they are recreated with different names
|
||||
// across node restarts.
|
||||
By("ensuring the same number of pods are running and ready after restart")
|
||||
podCheckStart := time.Now()
|
||||
podNamesAfter, err := waitForNPods(ps, len(podNamesBefore), restartPodReadyAgainTimeout)
|
||||
Expect(err).NotTo(HaveOccurred())
|
||||
remaining := restartPodReadyAgainTimeout - time.Since(podCheckStart)
|
||||
if !checkPodsRunningReady(c, podNamesAfter, remaining) {
|
||||
Failf("At least one pod wasn't running and ready after the restart.")
|
||||
}
|
||||
})
|
||||
})
|
||||
|
||||
// waitForNPods tries to list pods using c until it finds expect of them,
|
||||
// returning their names if it can do so before timeout.
|
||||
func waitForNPods(ps *podStore, expect int, timeout time.Duration) ([]string, error) {
|
||||
// Loop until we find expect pods or timeout is passed.
|
||||
var pods []*api.Pod
|
||||
var errLast error
|
||||
found := wait.Poll(poll, timeout, func() (bool, error) {
|
||||
pods = ps.List()
|
||||
if len(pods) != expect {
|
||||
errLast = fmt.Errorf("expected to find %d pods but found only %d", expect, len(pods))
|
||||
Logf("Error getting pods: %v", errLast)
|
||||
return false, nil
|
||||
}
|
||||
return true, nil
|
||||
}) == nil
|
||||
// Extract the names of all found pods.
|
||||
podNames := make([]string, len(pods))
|
||||
for i, p := range pods {
|
||||
podNames[i] = p.ObjectMeta.Name
|
||||
}
|
||||
if !found {
|
||||
return podNames, fmt.Errorf("couldn't find %d pods within %v; last error: %v",
|
||||
expect, timeout, errLast)
|
||||
}
|
||||
return podNames, nil
|
||||
}
|
||||
|
||||
// checkNodesReady waits up to nt for expect nodes accessed by c to be ready,
|
||||
// returning an error if this doesn't happen in time. It returns the names of
|
||||
// nodes it finds.
|
||||
func checkNodesReady(c *client.Client, nt time.Duration, expect int) ([]string, error) {
|
||||
// First, keep getting all of the nodes until we get the number we expect.
|
||||
var nodeList *api.NodeList
|
||||
var errLast error
|
||||
start := time.Now()
|
||||
found := wait.Poll(poll, nt, func() (bool, error) {
|
||||
// Even though listNodes(...) has its own retries, a rolling-update
|
||||
// (GCE/GKE implementation of restart) can complete before the apiserver
|
||||
// knows about all of the nodes. Thus, we retry the list nodes call
|
||||
// until we get the expected number of nodes.
|
||||
nodeList, errLast = listNodes(c, labels.Everything(), fields.Everything())
|
||||
if errLast != nil {
|
||||
return false, nil
|
||||
}
|
||||
if len(nodeList.Items) != expect {
|
||||
errLast = fmt.Errorf("expected to find %d nodes but found only %d", expect, len(nodeList.Items))
|
||||
Logf("%v", errLast)
|
||||
return false, nil
|
||||
}
|
||||
return true, nil
|
||||
}) == nil
|
||||
nodeNames := make([]string, len(nodeList.Items))
|
||||
for i, n := range nodeList.Items {
|
||||
nodeNames[i] = n.ObjectMeta.Name
|
||||
}
|
||||
if !found {
|
||||
return nodeNames, fmt.Errorf("couldn't find %d nodes within %v; last error: %v",
|
||||
expect, nt, errLast)
|
||||
}
|
||||
|
||||
// Next, ensure in parallel that all the nodes are ready. We subtract the
|
||||
// time we spent waiting above.
|
||||
timeout := nt - time.Since(start)
|
||||
result := make(chan bool, len(nodeList.Items))
|
||||
for _, n := range nodeNames {
|
||||
n := n
|
||||
go func() { result <- waitForNodeToBeReady(c, n, timeout) }()
|
||||
}
|
||||
failed := false
|
||||
// TODO(mbforbes): Change to `for range` syntax once we support only Go
|
||||
// >= 1.4.
|
||||
for i := range nodeList.Items {
|
||||
_ = i
|
||||
if !<-result {
|
||||
failed = true
|
||||
}
|
||||
}
|
||||
if failed {
|
||||
return nodeNames, fmt.Errorf("at least one node failed to be ready")
|
||||
}
|
||||
return nodeNames, nil
|
||||
}
|
||||
|
||||
// restartNodes uses provider to do a restart of all nodes in the cluster,
|
||||
// allowing up to nt per node.
|
||||
func restartNodes(provider string, nt time.Duration) error {
|
||||
switch provider {
|
||||
case "gce":
|
||||
return migRollingUpdate(nt)
|
||||
default:
|
||||
return fmt.Errorf("restartNodes(...) not implemented for %s", provider)
|
||||
}
|
||||
}
|
||||
|
||||
// migRollingUpdate starts a MIG rolling update and waits up to nt times the
|
||||
// nubmer of nodes for it to complete.
|
||||
func migRollingUpdate(nt time.Duration) error {
|
||||
By("getting the name of the template for the managed instance group")
|
||||
templ, err := migTemplate()
|
||||
if err != nil {
|
||||
return fmt.Errorf("couldn't get MIG template name: %v", err)
|
||||
}
|
||||
|
||||
By("starting the managed instance group rolling update")
|
||||
id, err := migRollingUpdateStart(templ, nt)
|
||||
if err != nil {
|
||||
return fmt.Errorf("couldn't start the MIG rolling update: %v", err)
|
||||
}
|
||||
|
||||
By("polling the managed instance group rolling update until it completes")
|
||||
if err := migRollingUpdatePoll(id, nt); err != nil {
|
||||
return fmt.Errorf("err waiting until update completed: %v", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// migTemlate (GCE/GKE-only) returns the name of the MIG template that the
|
||||
// nodes of the cluster use.
|
||||
func migTemplate() (string, error) {
|
||||
var errLast error
|
||||
var templ string
|
||||
key := "instanceTemplate"
|
||||
if wait.Poll(poll, singleCallTimeout, func() (bool, error) {
|
||||
// TODO(mbforbes): make this hit the compute API directly instead of
|
||||
// shelling out to gcloud.
|
||||
o, err := exec.Command("gcloud", "preview", "managed-instance-groups",
|
||||
fmt.Sprintf("--project=%s", testContext.CloudConfig.ProjectID),
|
||||
fmt.Sprintf("--zone=%s", testContext.CloudConfig.Zone),
|
||||
"describe",
|
||||
testContext.CloudConfig.NodeInstanceGroup).CombinedOutput()
|
||||
if err != nil {
|
||||
errLast = fmt.Errorf("gcloud preview managed-instance-groups describe call failed with err: %v", err)
|
||||
return false, nil
|
||||
}
|
||||
output := string(o)
|
||||
|
||||
// The 'describe' call probably succeeded; parse the output and try to
|
||||
// find the line that looks like "instanceTemplate: url/to/<templ>" and
|
||||
// return <templ>.
|
||||
if val := parseKVLines(output, key); len(val) > 0 {
|
||||
url := strings.Split(val, "/")
|
||||
templ = url[len(url)-1]
|
||||
Logf("MIG group %s using template: %s", testContext.CloudConfig.NodeInstanceGroup, templ)
|
||||
return true, nil
|
||||
}
|
||||
errLast = fmt.Errorf("couldn't find %s in output to get MIG template. Output: %s", key, output)
|
||||
return false, nil
|
||||
}) != nil {
|
||||
return "", fmt.Errorf("migTemplate() failed with last error: %v", errLast)
|
||||
}
|
||||
return templ, nil
|
||||
}
|
||||
|
||||
// migRollingUpdateStart (GCE/GKE-only) starts a MIG rolling update using templ
|
||||
// as the new template, waiting up to nt per node, and returns the ID of that
|
||||
// update.
|
||||
func migRollingUpdateStart(templ string, nt time.Duration) (string, error) {
|
||||
var errLast error
|
||||
var id string
|
||||
prefix, suffix := "Started [", "]."
|
||||
if err := wait.Poll(poll, singleCallTimeout, func() (bool, error) {
|
||||
// TODO(mbforbes): make this hit the compute API directly instead of
|
||||
// shelling out to gcloud.
|
||||
o, err := exec.Command("gcloud", "preview", "rolling-updates",
|
||||
fmt.Sprintf("--project=%s", testContext.CloudConfig.ProjectID),
|
||||
fmt.Sprintf("--zone=%s", testContext.CloudConfig.Zone),
|
||||
"start",
|
||||
// Required args.
|
||||
fmt.Sprintf("--group=%s", testContext.CloudConfig.NodeInstanceGroup),
|
||||
fmt.Sprintf("--template=%s", templ),
|
||||
// Optional args to fine-tune behavior.
|
||||
fmt.Sprintf("--instance-startup-timeout=%ds", int(nt.Seconds())),
|
||||
// NOTE: We can speed up this process by increasing
|
||||
// --max-num-concurrent-instances.
|
||||
fmt.Sprintf("--max-num-concurrent-instances=%d", 1),
|
||||
fmt.Sprintf("--max-num-failed-instances=%d", 0),
|
||||
fmt.Sprintf("--min-instance-update-time=%ds", 0)).CombinedOutput()
|
||||
if err != nil {
|
||||
errLast = fmt.Errorf("gcloud preview rolling-updates call failed with err: %v", err)
|
||||
return false, nil
|
||||
}
|
||||
output := string(o)
|
||||
|
||||
// The 'start' call probably succeeded; parse the output and try to find
|
||||
// the line that looks like "Started [url/to/<id>]." and return <id>.
|
||||
for _, line := range strings.Split(output, "\n") {
|
||||
// As a sanity check, ensure the line starts with prefix and ends
|
||||
// with suffix.
|
||||
if strings.Index(line, prefix) != 0 || strings.Index(line, suffix) != len(line)-len(suffix) {
|
||||
continue
|
||||
}
|
||||
url := strings.Split(strings.TrimSuffix(strings.TrimPrefix(line, prefix), suffix), "/")
|
||||
id = url[len(url)-1]
|
||||
Logf("Started MIG rolling update; ID: %s", id)
|
||||
return true, nil
|
||||
}
|
||||
errLast = fmt.Errorf("couldn't find line like '%s ... %s' in output to MIG rolling-update start. Output: %s",
|
||||
prefix, suffix, output)
|
||||
return false, nil
|
||||
}); err != nil {
|
||||
return "", fmt.Errorf("migRollingUpdateStart() failed with last error: %v", errLast)
|
||||
}
|
||||
return id, nil
|
||||
}
|
||||
|
||||
// migRollingUpdatePoll (CKE/GKE-only) polls the progress of the MIG rolling
|
||||
// update with ID id until it is complete. It returns an error if this takes
|
||||
// longer than nt times the number of nodes.
|
||||
func migRollingUpdatePoll(id string, nt time.Duration) error {
|
||||
// Two keys and a val.
|
||||
status, progress, done := "status", "statusMessage", "ROLLED_OUT"
|
||||
start, timeout := time.Now(), nt*time.Duration(testContext.CloudConfig.NumNodes)
|
||||
var errLast error
|
||||
Logf("Waiting up to %v for MIG rolling update to complete.", timeout)
|
||||
if wait.Poll(restartPoll, timeout, func() (bool, error) {
|
||||
o, err := exec.Command("gcloud", "preview", "rolling-updates",
|
||||
fmt.Sprintf("--project=%s", testContext.CloudConfig.ProjectID),
|
||||
fmt.Sprintf("--zone=%s", testContext.CloudConfig.Zone),
|
||||
"describe",
|
||||
id).CombinedOutput()
|
||||
if err != nil {
|
||||
errLast = fmt.Errorf("Error calling rolling-updates describe %s: %v", id, err)
|
||||
Logf("%v", errLast)
|
||||
return false, nil
|
||||
}
|
||||
output := string(o)
|
||||
|
||||
// The 'describe' call probably succeeded; parse the output and try to
|
||||
// find the line that looks like "status: <status>" and see whether it's
|
||||
// done.
|
||||
Logf("Waiting for MIG rolling update: %s (%v elapsed)",
|
||||
parseKVLines(output, progress), time.Since(start))
|
||||
if st := parseKVLines(output, status); st == done {
|
||||
return true, nil
|
||||
}
|
||||
return false, nil
|
||||
}) != nil {
|
||||
return fmt.Errorf("timeout waiting %v for MIG rolling update to complete. Last error: %v", timeout, errLast)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// parseKVLines parses output that looks like lines containing "<key>: <val>"
|
||||
// and returns <val> if <key> is found. Otherwise, it returns the empty string.
|
||||
func parseKVLines(output, key string) string {
|
||||
delim := ":"
|
||||
key = key + delim
|
||||
for _, line := range strings.Split(output, "\n") {
|
||||
pieces := strings.SplitAfterN(line, delim, 2)
|
||||
if len(pieces) != 2 {
|
||||
continue
|
||||
}
|
||||
k, v := pieces[0], pieces[1]
|
||||
if k == key {
|
||||
return strings.TrimSpace(v)
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
139
test/e2e/util.go
139
test/e2e/util.go
|
@ -42,6 +42,7 @@ import (
|
|||
"github.com/GoogleCloudPlatform/kubernetes/pkg/labels"
|
||||
"github.com/GoogleCloudPlatform/kubernetes/pkg/runtime"
|
||||
"github.com/GoogleCloudPlatform/kubernetes/pkg/util"
|
||||
"github.com/GoogleCloudPlatform/kubernetes/pkg/util/wait"
|
||||
"github.com/GoogleCloudPlatform/kubernetes/pkg/watch"
|
||||
|
||||
"code.google.com/p/go-uuid/uuid"
|
||||
|
@ -60,15 +61,28 @@ const (
|
|||
// String used to mark pod deletion
|
||||
nonExist = "NonExist"
|
||||
|
||||
// How often to poll pods.
|
||||
podPoll = 5 * time.Second
|
||||
// How often to poll pods and nodes.
|
||||
poll = 5 * time.Second
|
||||
|
||||
// service accounts are provisioned after namespace creation
|
||||
// a service account is required to support pod creation in a namespace as part of admission control
|
||||
serviceAccountProvisionTimeout = 2 * time.Minute
|
||||
|
||||
// How often to poll for service accounts
|
||||
serviceAccountPoll = 5 * time.Second
|
||||
// How long to try single API calls (like 'get' or 'list'). Used to prevent
|
||||
// transient failures from failing tests.
|
||||
singleCallTimeout = 30 * time.Second
|
||||
|
||||
// How long nodes have to be "ready" when a test begins. They should already
|
||||
// be "ready" before the test starts, so this is small.
|
||||
nodeReadyInitialTimeout = 20 * time.Second
|
||||
|
||||
// How long pods have to be "ready" when a test begins. They should already
|
||||
// be "ready" before the test starts, so this is small.
|
||||
podReadyBeforeTimeout = 20 * time.Second
|
||||
|
||||
// How wide to print pod names, by default. Useful for aligning printing to
|
||||
// quickly scan through output.
|
||||
podPrintWidth = 55
|
||||
)
|
||||
|
||||
type CloudConfig struct {
|
||||
|
@ -218,7 +232,6 @@ func podRunningReady(p *api.Pod) (bool, error) {
|
|||
if !podReady(p) {
|
||||
return false, fmt.Errorf("pod '%s' on '%s' didn't have condition {%v %v}; conditions: %v",
|
||||
p.ObjectMeta.Name, p.Spec.NodeName, api.PodReady, api.ConditionTrue, p.Status.Conditions)
|
||||
|
||||
}
|
||||
return true, nil
|
||||
}
|
||||
|
@ -233,15 +246,16 @@ func waitForPodsRunningReady(ns string, minPods int, timeout time.Duration) erro
|
|||
if err != nil {
|
||||
return err
|
||||
}
|
||||
start := time.Now()
|
||||
Logf("Waiting up to %v for all pods (need at least %d) in namespace '%s' to be running and ready",
|
||||
timeout, minPods, ns)
|
||||
for start := time.Now(); time.Since(start) < timeout; time.Sleep(podPoll) {
|
||||
if wait.Poll(poll, timeout, func() (bool, error) {
|
||||
// We get the new list of pods in every iteration beause more pods come
|
||||
// online during startup and we want to ensure they are also checked.
|
||||
podList, err := c.Pods(ns).List(labels.Everything(), fields.Everything())
|
||||
if err != nil {
|
||||
Logf("Error getting pods in namespace '%s': %v", ns, err)
|
||||
continue
|
||||
return false, nil
|
||||
}
|
||||
nOk, badPods := 0, []api.Pod{}
|
||||
for _, pod := range podList.Items {
|
||||
|
@ -254,14 +268,17 @@ func waitForPodsRunningReady(ns string, minPods int, timeout time.Duration) erro
|
|||
Logf("%d / %d pods in namespace '%s' are running and ready (%d seconds elapsed)",
|
||||
nOk, len(podList.Items), ns, int(time.Since(start).Seconds()))
|
||||
if nOk == len(podList.Items) && nOk >= minPods {
|
||||
return nil
|
||||
return true, nil
|
||||
}
|
||||
logPodStates(badPods)
|
||||
return false, nil
|
||||
}) != nil {
|
||||
return fmt.Errorf("Not all pods in namespace '%s' running and ready within %v", ns, timeout)
|
||||
}
|
||||
return fmt.Errorf("Not all pods in namespace '%s' running and ready within %v", ns, timeout)
|
||||
return nil
|
||||
}
|
||||
|
||||
func waitForServiceAccountInNamespace(c *client.Client, ns, serviceAccountName string, poll, timeout time.Duration) error {
|
||||
func waitForServiceAccountInNamespace(c *client.Client, ns, serviceAccountName string, timeout time.Duration) error {
|
||||
Logf("Waiting up to %v for service account %s to be provisioned in ns %s", timeout, serviceAccountName, ns)
|
||||
for start := time.Now(); time.Since(start) < timeout; time.Sleep(poll) {
|
||||
_, err := c.ServiceAccounts(ns).Get(serviceAccountName)
|
||||
|
@ -275,20 +292,23 @@ func waitForServiceAccountInNamespace(c *client.Client, ns, serviceAccountName s
|
|||
return fmt.Errorf("Service account %s in namespace %s not ready within %v", serviceAccountName, ns, timeout)
|
||||
}
|
||||
|
||||
func waitForPodCondition(c *client.Client, ns, podName, desc string, poll, timeout time.Duration, condition podCondition) error {
|
||||
Logf("Waiting up to %v for pod %s status to be %s", timeout, podName, desc)
|
||||
func waitForPodCondition(c *client.Client, ns, podName, desc string, timeout time.Duration, condition podCondition) error {
|
||||
Logf("Waiting up to %[1]v for pod %-[2]*[3]s status to be %[4]s", timeout, podPrintWidth, podName, desc)
|
||||
for start := time.Now(); time.Since(start) < timeout; time.Sleep(poll) {
|
||||
pod, err := c.Pods(ns).Get(podName)
|
||||
if err != nil {
|
||||
Logf("Get pod %s in ns %s failed, ignoring for %v: %v", podName, ns, poll, err)
|
||||
// Aligning this text makes it much more readable
|
||||
Logf("Get pod %-[1]*[2]s in namespace '%[3]s' failed, ignoring for %[4]v. Error: %[5]v",
|
||||
podPrintWidth, podName, ns, poll, err)
|
||||
continue
|
||||
}
|
||||
done, err := condition(pod)
|
||||
if done {
|
||||
return err
|
||||
}
|
||||
Logf("Waiting for pod '%s' in namespace '%s' status to be '%q' (found phase: '%q', readiness: %t) (%v)",
|
||||
podName, ns, desc, pod.Status.Phase, podReady(pod), time.Since(start))
|
||||
Logf("Waiting for pod %-[1]*[2]s in namespace '%[3]s' status to be '%[4]s'"+
|
||||
"(found phase: %[5]q, readiness: %[6]t) (%[7]v elapsed)",
|
||||
podPrintWidth, podName, ns, desc, pod.Status.Phase, podReady(pod), time.Since(start))
|
||||
}
|
||||
return fmt.Errorf("gave up waiting for pod '%s' to be '%s' after %v", podName, desc, timeout)
|
||||
}
|
||||
|
@ -297,7 +317,7 @@ func waitForPodCondition(c *client.Client, ns, podName, desc string, poll, timeo
|
|||
// the default service account is what is associated with pods when they do not specify a service account
|
||||
// as a result, pods are not able to be provisioned in a namespace until the service account is provisioned
|
||||
func waitForDefaultServiceAccountInNamespace(c *client.Client, namespace string) error {
|
||||
return waitForServiceAccountInNamespace(c, namespace, "default", serviceAccountPoll, serviceAccountProvisionTimeout)
|
||||
return waitForServiceAccountInNamespace(c, namespace, "default", serviceAccountProvisionTimeout)
|
||||
}
|
||||
|
||||
// createNS should be used by every test, note that we append a common prefix to the provided test name.
|
||||
|
@ -315,7 +335,7 @@ func createTestingNS(baseName string, c *client.Client) (*api.Namespace, error)
|
|||
}
|
||||
|
||||
func waitForPodRunningInNamespace(c *client.Client, podName string, namespace string) error {
|
||||
return waitForPodCondition(c, namespace, podName, "running", podPoll, podStartTimeout, func(pod *api.Pod) (bool, error) {
|
||||
return waitForPodCondition(c, namespace, podName, "running", podStartTimeout, func(pod *api.Pod) (bool, error) {
|
||||
if pod.Status.Phase == api.PodRunning {
|
||||
return true, nil
|
||||
}
|
||||
|
@ -332,7 +352,7 @@ func waitForPodRunning(c *client.Client, podName string) error {
|
|||
|
||||
// waitForPodNotPending returns an error if it took too long for the pod to go out of pending state.
|
||||
func waitForPodNotPending(c *client.Client, ns, podName string) error {
|
||||
return waitForPodCondition(c, ns, podName, "!pending", podPoll, podStartTimeout, func(pod *api.Pod) (bool, error) {
|
||||
return waitForPodCondition(c, ns, podName, "!pending", podStartTimeout, func(pod *api.Pod) (bool, error) {
|
||||
if pod.Status.Phase != api.PodPending {
|
||||
Logf("Saw pod '%s' in namespace '%s' out of pending state (found '%q')", podName, ns, pod.Status.Phase)
|
||||
return true, nil
|
||||
|
@ -343,7 +363,7 @@ func waitForPodNotPending(c *client.Client, ns, podName string) error {
|
|||
|
||||
// waitForPodSuccessInNamespace returns nil if the pod reached state success, or an error if it reached failure or ran too long.
|
||||
func waitForPodSuccessInNamespace(c *client.Client, podName string, contName string, namespace string) error {
|
||||
return waitForPodCondition(c, namespace, podName, "success or failure", podPoll, podStartTimeout, func(pod *api.Pod) (bool, error) {
|
||||
return waitForPodCondition(c, namespace, podName, "success or failure", podStartTimeout, func(pod *api.Pod) (bool, error) {
|
||||
// Cannot use pod.Status.Phase == api.PodSucceeded/api.PodFailed due to #2632
|
||||
ci, ok := api.GetContainerStatus(pod.Status.ContainerStatuses, contName)
|
||||
if !ok {
|
||||
|
@ -986,6 +1006,19 @@ func DeleteRC(c *client.Client, ns, name string) error {
|
|||
return err
|
||||
}
|
||||
|
||||
// Convenient wrapper around listing nodes supporting retries.
|
||||
func listNodes(c *client.Client, label labels.Selector, field fields.Selector) (*api.NodeList, error) {
|
||||
var nodes *api.NodeList
|
||||
var errLast error
|
||||
if wait.Poll(poll, singleCallTimeout, func() (bool, error) {
|
||||
nodes, errLast = c.Nodes().List(label, field)
|
||||
return errLast == nil, nil
|
||||
}) != nil {
|
||||
return nil, fmt.Errorf("listNodes() failed with last error: %v", errLast)
|
||||
}
|
||||
return nodes, nil
|
||||
}
|
||||
|
||||
// FailedContainers inspects all containers in a pod and returns failure
|
||||
// information for containers that have failed or been restarted.
|
||||
// A map is returned where the key is the containerID and the value is a
|
||||
|
@ -1111,6 +1144,74 @@ func getSigner(provider string) (ssh.Signer, error) {
|
|||
return util.MakePrivateKeySigner(key)
|
||||
}
|
||||
|
||||
// checkPodsRunning returns whether all pods whose names are listed in podNames
|
||||
// are running and ready.
|
||||
func checkPodsRunningReady(c *client.Client, podNames []string, timeout time.Duration) bool {
|
||||
np, desc := len(podNames), "running and ready"
|
||||
Logf("Waiting up to %v for the following %d pods to be %s: %s", timeout, np, desc, podNames)
|
||||
result := make(chan bool, len(podNames))
|
||||
for ix := range podNames {
|
||||
// Launch off pod readiness checkers.
|
||||
go func(name string) {
|
||||
err := waitForPodCondition(c, api.NamespaceDefault, name, desc, timeout, podRunningReady)
|
||||
result <- err == nil
|
||||
}(podNames[ix])
|
||||
}
|
||||
// Wait for them all to finish.
|
||||
success := true
|
||||
// TODO(mbforbes): Change to `for range` syntax and remove logging once we
|
||||
// support only Go >= 1.4.
|
||||
for _, podName := range podNames {
|
||||
if !<-result {
|
||||
Logf("Pod %-[1]*[2]s failed to be %[3]s.", podPrintWidth, podName, desc)
|
||||
success = false
|
||||
}
|
||||
}
|
||||
Logf("Wanted all %d pods to be %s. Result: %t. Pods: %v", np, desc, success, podNames)
|
||||
return success
|
||||
}
|
||||
|
||||
// waitForNodeToBeReady returns whether node name is ready within timeout.
|
||||
func waitForNodeToBeReady(c *client.Client, name string, timeout time.Duration) bool {
|
||||
return waitForNodeToBe(c, name, true, timeout)
|
||||
}
|
||||
|
||||
// waitForNodeToBeNotReady returns whether node name is not ready (i.e. the
|
||||
// readiness condition is anything but ready, e.g false or unknown) within
|
||||
// timeout.
|
||||
func waitForNodeToBeNotReady(c *client.Client, name string, timeout time.Duration) bool {
|
||||
return waitForNodeToBe(c, name, false, timeout)
|
||||
}
|
||||
|
||||
// waitForNodeToBe returns whether node name's readiness state matches wantReady
|
||||
// within timeout. If wantReady is true, it will ensure the node is ready; if
|
||||
// it's false, it ensures the node is in any state other than ready (e.g. not
|
||||
// ready or unknown).
|
||||
func waitForNodeToBe(c *client.Client, name string, wantReady bool, timeout time.Duration) bool {
|
||||
Logf("Waiting up to %v for node %s readiness to be %t", timeout, name, wantReady)
|
||||
for start := time.Now(); time.Since(start) < timeout; time.Sleep(poll) {
|
||||
node, err := c.Nodes().Get(name)
|
||||
if err != nil {
|
||||
Logf("Couldn't get node %s", name)
|
||||
continue
|
||||
}
|
||||
|
||||
// Check the node readiness condition (logging all).
|
||||
for i, cond := range node.Status.Conditions {
|
||||
Logf("Node %s condition %d/%d: type: %v, status: %v",
|
||||
name, i+1, len(node.Status.Conditions), cond.Type, cond.Status)
|
||||
// Ensure that the condition type is readiness and the status
|
||||
// matches as desired.
|
||||
if cond.Type == api.NodeReady && (cond.Status == api.ConditionTrue) == wantReady {
|
||||
Logf("Successfully found node %s readiness to be %t", name, wantReady)
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
Logf("Node %s didn't reach desired readiness (%t) within %v", name, wantReady, timeout)
|
||||
return false
|
||||
}
|
||||
|
||||
// LatencyMetrics stores data about request latency at a given quantile
|
||||
// broken down by verb (e.g. GET, PUT, LIST) and resource (e.g. pods, services).
|
||||
type LatencyMetric struct {
|
||||
|
|
Loading…
Reference in New Issue