Node upgrade tests.

pull/6/head
Max Forbes 2015-06-17 00:13:26 -07:00
parent 28197e07d6
commit 2803fbe343
11 changed files with 490 additions and 238 deletions

View File

@ -179,13 +179,13 @@ function get-kubeconfig-bearertoken() {
function set_binary_version() { function set_binary_version() {
if [[ "${1}" == "latest_stable" ]]; then if [[ "${1}" == "latest_stable" ]]; then
KUBE_VERSION=$(gsutil cat gs://kubernetes-release/release/stable.txt) KUBE_VERSION=$(gsutil cat gs://kubernetes-release/release/stable.txt)
echo "Using latest stable version: ${KUBE_VERSION}" echo "Using latest stable version: ${KUBE_VERSION}" >&2
elif [[ "${1}" == "latest_release" ]]; then elif [[ "${1}" == "latest_release" ]]; then
KUBE_VERSION=$(gsutil cat gs://kubernetes-release/release/latest.txt) KUBE_VERSION=$(gsutil cat gs://kubernetes-release/release/latest.txt)
echo "Using latest release version: ${KUBE_VERSION}" echo "Using latest release version: ${KUBE_VERSION}" >&2
elif [[ "${1}" == "latest_ci" ]]; then elif [[ "${1}" == "latest_ci" ]]; then
KUBE_VERSION=$(gsutil cat gs://kubernetes-release/ci/latest.txt) KUBE_VERSION=$(gsutil cat gs://kubernetes-release/ci/latest.txt)
echo "Using latest ci version: ${KUBE_VERSION}" echo "Using latest ci version: ${KUBE_VERSION}" >&2
else else
KUBE_VERSION=${1} KUBE_VERSION=${1}
fi fi

View File

@ -34,10 +34,11 @@ source "${KUBE_ROOT}/cluster/${KUBERNETES_PROVIDER}/util.sh"
function usage() { function usage() {
echo "!!! EXPERIMENTAL !!!" echo "!!! EXPERIMENTAL !!!"
echo "" echo ""
echo "${0} [-M|-N] -l | <release or continuous integration version> | [latest_stable|latest_release|latest_ci]" echo "${0} [-M|-N|-P] -l | <release or continuous integration version> | [latest_stable|latest_release|latest_ci]"
echo " Upgrades master and nodes by default" echo " Upgrades master and nodes by default"
echo " -M: Upgrade master only" echo " -M: Upgrade master only"
echo " -N: Upgrade nodes only" echo " -N: Upgrade nodes only"
echo " -P: Node upgrade prerequisites only (create a new instance template)"
echo " -l: Use local(dev) binaries" echo " -l: Use local(dev) binaries"
echo "" echo ""
echo "(... Fetching current release versions ...)" echo "(... Fetching current release versions ...)"
@ -109,21 +110,27 @@ function prepare-upgrade() {
tars_from_version tars_from_version
} }
# Reads kube-env metadata from master and extracts value from provided key.
# Reads kube-env metadata from first node in MINION_NAMES.
# #
# Assumed vars: # Assumed vars:
# MASTER_NAME # MINION_NAMES
# PROJECT # PROJECT
# ZONE # ZONE
function get-node-env() {
# TODO(mbforbes): Make this more reliable with retries.
gcloud compute --project ${PROJECT} ssh --zone ${ZONE} ${MINION_NAMES[0]} --command \
"curl --fail --silent -H 'Metadata-Flavor: Google' \
'http://metadata/computeMetadata/v1/instance/attributes/kube-env'" 2>/dev/null
}
# Using provided node env, extracts value from provided key.
# #
# Args: # Args:
# $1 env key to use # $1 node env (kube-env of node; result of calling get-node-env)
# $2 env key to use
function get-env-val() { function get-env-val() {
# TODO(mbforbes): Make this more reliable with retries. echo "${1}" | grep ${2} | cut -d : -f 2 | cut -d \' -f 2
gcloud compute --project ${PROJECT} ssh --zone ${ZONE} ${MASTER_NAME} --command \
"curl --fail --silent -H 'Metadata-Flavor: Google' \
'http://metadata/computeMetadata/v1/instance/attributes/kube-env'" 2>/dev/null \
| grep ${1} | cut -d : -f 2 | cut -d \' -f 2
} }
# Assumed vars: # Assumed vars:
@ -132,9 +139,40 @@ function get-env-val() {
# NODE_INSTANCE_PREFIX # NODE_INSTANCE_PREFIX
# PROJECT # PROJECT
# ZONE # ZONE
#
# Vars set:
# KUBELET_TOKEN
# KUBE_PROXY_TOKEN
# CA_CERT_BASE64
# EXTRA_DOCKER_OPTS
# KUBELET_CERT_BASE64
# KUBELET_KEY_BASE64
function upgrade-nodes() { function upgrade-nodes() {
local sanitized_version=$(echo ${KUBE_VERSION} | sed s/"\."/-/g) prepare-node-upgrade
echo "== Upgrading nodes to ${KUBE_VERSION}. ==" do-node-upgrade
}
# prepare-node-upgrade creates a new instance template suitable for upgrading
# to KUBE_VERSION and echos a single line with the name of the new template.
#
# Assumed vars:
# KUBE_VERSION
# MINION_SCOPES
# NODE_INSTANCE_PREFIX
# PROJECT
# ZONE
#
# Vars set:
# SANITIZED_VERSION
# KUBELET_TOKEN
# KUBE_PROXY_TOKEN
# CA_CERT_BASE64
# EXTRA_DOCKER_OPTS
# KUBELET_CERT_BASE64
# KUBELET_KEY_BASE64
function prepare-node-upgrade() {
echo "== Preparing node upgrade (to ${KUBE_VERSION}). ==" >&2
SANITIZED_VERSION=$(echo ${KUBE_VERSION} | sed s/"\."/-/g)
detect-minion-names detect-minion-names
@ -146,34 +184,56 @@ function upgrade-nodes() {
scope_flags=("--no-scopes") scope_flags=("--no-scopes")
fi fi
# Get required node tokens. # Get required node env vars from exiting template.
KUBELET_TOKEN=$(get-env-val "KUBELET_TOKEN") local node_env=$(get-node-env)
KUBE_PROXY_TOKEN=$(get-env-val "KUBE_PROXY_TOKEN") KUBELET_TOKEN=$(get-env-val "${node_env}" "KUBELET_TOKEN")
KUBE_PROXY_TOKEN=$(get-env-val "${node_env}" "KUBE_PROXY_TOKEN")
CA_CERT_BASE64=$(get-env-val "${node_env}" "CA_CERT")
EXTRA_DOCKER_OPTS=$(get-env-val "${node_env}" "EXTRA_DOCKER_OPTS")
KUBELET_CERT_BASE64=$(get-env-val "${node_env}" "KUBELET_CERT")
KUBELET_KEY_BASE64=$(get-env-val "${node_env}" "KUBELET_KEY")
# TODO(mbforbes): How do we ensure kube-env is written in a ${version}- # TODO(mbforbes): How do we ensure kube-env is written in a ${version}-
# compatible way? # compatible way?
write-node-env write-node-env
# TODO(mbforbes): Get configure-vm script from ${version}. (Must plumb this # TODO(mbforbes): Get configure-vm script from ${version}. (Must plumb this
# through all create-node-instance-template implementations). # through all create-node-instance-template implementations).
create-node-instance-template ${sanitized_version} create-node-instance-template ${SANITIZED_VERSION}
# The following is echo'd so that callers can get the template name.
echo "${NODE_INSTANCE_PREFIX}-template-${SANITIZED_VERSION}"
echo "== Finished preparing node upgrade (to ${KUBE_VERSION}). ==" >&2
}
# Prereqs:
# - prepare-node-upgrade should have been called successfully
function do-node-upgrade() {
echo "== Upgrading nodes to ${KUBE_VERSION}. ==" >&2
# Do the actual upgrade. # Do the actual upgrade.
gcloud preview rolling-updates start \ # NOTE(mbforbes): If you are changing this gcloud command, update
--group "${NODE_INSTANCE_PREFIX}-group" \ # test/e2e/restart.go to match this EXACTLY.
--max-num-concurrent-instances 1 \ gcloud preview rolling-updates \
--max-num-failed-instances 0 \ --project="${PROJECT}" \
--project "${PROJECT}" \ --zone="${ZONE}" \
--zone "${ZONE}" \ start \
--template "${NODE_INSTANCE_PREFIX}-template-${sanitized_version}" --group="${NODE_INSTANCE_PREFIX}-group" \
--template="${NODE_INSTANCE_PREFIX}-template-${SANITIZED_VERSION}" \
--instance-startup-timeout=300s \
--max-num-concurrent-instances=1 \
--max-num-failed-instances=0 \
--min-instance-update-time=0s
echo "== Done ==" # TODO(mbforbes): Wait for the rolling-update to finish.
echo "== Finished upgrading nodes to ${KUBE_VERSION}. ==" >&2
} }
master_upgrade=true master_upgrade=true
node_upgrade=true node_upgrade=true
node_prereqs=false
local_binaries=false local_binaries=false
while getopts ":MNlh" opt; do while getopts ":MNPlh" opt; do
case ${opt} in case ${opt} in
M) M)
node_upgrade=false node_upgrade=false
@ -181,6 +241,9 @@ while getopts ":MNlh" opt; do
N) N)
master_upgrade=false master_upgrade=false
;; ;;
P)
node_prereqs=true
;;
l) l)
local_binaries=true local_binaries=true
;; ;;
@ -213,6 +276,11 @@ fi
prepare-upgrade prepare-upgrade
if [[ "${node_prereqs}" == "true" ]]; then
prepare-node-upgrade
exit 0
fi
if [[ "${master_upgrade}" == "true" ]]; then if [[ "${master_upgrade}" == "true" ]]; then
upgrade-master upgrade-master
fi fi
@ -220,6 +288,7 @@ fi
if [[ "${node_upgrade}" == "true" ]]; then if [[ "${node_upgrade}" == "true" ]]; then
if [[ "${local_binaries}" == "true" ]]; then if [[ "${local_binaries}" == "true" ]]; then
echo "Upgrading nodes to local binaries is not yet supported." >&2 echo "Upgrading nodes to local binaries is not yet supported." >&2
exit 1
else else
upgrade-nodes upgrade-nodes
fi fi

View File

@ -250,7 +250,7 @@ function detect-minion-names {
MINION_NAMES=($(gcloud preview --project "${PROJECT}" instance-groups \ MINION_NAMES=($(gcloud preview --project "${PROJECT}" instance-groups \
--zone "${ZONE}" instances --group "${NODE_INSTANCE_PREFIX}-group" list \ --zone "${ZONE}" instances --group "${NODE_INSTANCE_PREFIX}-group" list \
| cut -d'/' -f11)) | cut -d'/' -f11))
echo "MINION_NAMES=${MINION_NAMES[*]}" echo "MINION_NAMES=${MINION_NAMES[*]}" >&2
} }
# Waits until the number of running nodes in the instance group is equal to NUM_NODES # Waits until the number of running nodes in the instance group is equal to NUM_NODES
@ -415,8 +415,9 @@ function create-node-template {
fi fi
fi fi
local attempt=0 local attempt=1
while true; do while true; do
echo "Attempt ${attempt} to create ${1}" >&2
if ! gcloud compute instance-templates create "$1" \ if ! gcloud compute instance-templates create "$1" \
--project "${PROJECT}" \ --project "${PROJECT}" \
--machine-type "${MINION_SIZE}" \ --machine-type "${MINION_SIZE}" \
@ -428,12 +429,12 @@ function create-node-template {
--network "${NETWORK}" \ --network "${NETWORK}" \
$2 \ $2 \
--can-ip-forward \ --can-ip-forward \
--metadata-from-file "$3","$4"; then --metadata-from-file "$3","$4" >&2; then
if (( attempt > 5 )); then if (( attempt > 5 )); then
echo -e "${color_red}Failed to create instance template $1 ${color_norm}" >&2 echo -e "${color_red}Failed to create instance template $1 ${color_norm}" >&2
exit 2 exit 2
fi fi
echo -e "${color_yellow}Attempt $(($attempt+1)) failed to create instance template $1. Retrying.${color_norm}" >&2 echo -e "${color_yellow}Attempt ${attempt} failed to create instance template $1. Retrying.${color_norm}" >&2
attempt=$(($attempt+1)) attempt=$(($attempt+1))
else else
break break

View File

@ -142,6 +142,8 @@ func (s *SSHTunnel) Close() error {
return nil return nil
} }
// RunSSHCommand returns the stdout, stderr, and exit code from running cmd on
// host along with any SSH-level error.
func RunSSHCommand(cmd, host string, signer ssh.Signer) (string, string, int, error) { func RunSSHCommand(cmd, host string, signer ssh.Signer) (string, string, int, error) {
// Setup the config, dial the server, and open a session. // Setup the config, dial the server, and open a session.
config := &ssh.ClientConfig{ config := &ssh.ClientConfig{

View File

@ -17,17 +17,16 @@ limitations under the License.
package e2e package e2e
import ( import (
"bytes"
"fmt" "fmt"
"io"
"net/http" "net/http"
"os"
"os/exec" "os/exec"
"path" "path"
"strings"
"sync" "sync"
"time" "time"
"github.com/GoogleCloudPlatform/kubernetes/pkg/api" "github.com/GoogleCloudPlatform/kubernetes/pkg/api"
"github.com/GoogleCloudPlatform/kubernetes/pkg/fields"
"github.com/GoogleCloudPlatform/kubernetes/pkg/labels" "github.com/GoogleCloudPlatform/kubernetes/pkg/labels"
"github.com/GoogleCloudPlatform/kubernetes/pkg/util" "github.com/GoogleCloudPlatform/kubernetes/pkg/util"
"github.com/GoogleCloudPlatform/kubernetes/pkg/util/wait" "github.com/GoogleCloudPlatform/kubernetes/pkg/util/wait"
@ -36,68 +35,176 @@ import (
. "github.com/onsi/gomega" . "github.com/onsi/gomega"
) )
// version applies to upgrades; kube-push always pushes local binaries.
const version = "latest_ci"
// The following upgrade functions are passed into the framework below and used
// to do the actual upgrades.
var masterUpgrade = func() error {
_, _, err := runScript("hack/e2e-internal/e2e-upgrade.sh", "-M", version)
return err
}
var masterPush = func() error {
_, _, err := runScript("hack/e2e-internal/e2e-push.sh", "-m")
return err
}
var nodeUpgrade = func(f Framework, replicas int) error {
Logf("Preparing node upgarde by creating new instance template")
stdout, _, err := runScript("hack/e2e-internal/e2e-upgrade.sh", "-P", version)
if err != nil {
return err
}
tmpl := strings.TrimSpace(stdout)
Logf("Performing a node upgrade to %s; waiting at most %v per node", tmpl, restartPerNodeTimeout)
if err := migRollingUpdate(tmpl, restartPerNodeTimeout); err != nil {
return fmt.Errorf("error doing node upgrade via a migRollingUpdate to %s: %v", tmpl, err)
}
Logf("Waiting up to %v for all nodes to be ready after the upgrade", restartNodeReadyAgainTimeout)
if _, err := checkNodesReady(f.Client, restartNodeReadyAgainTimeout, testContext.CloudConfig.NumNodes); err != nil {
return err
}
Logf("Waiting up to %v for all pods to be running and ready after the upgrade", restartPodReadyAgainTimeout)
return waitForPodsRunningReady(f.Namespace.Name, replicas, restartPodReadyAgainTimeout)
}
var _ = Describe("Skipped", func() { var _ = Describe("Skipped", func() {
Describe("Cluster upgrade", func() { Describe("Cluster upgrade", func() {
svcName := "baz" svcName, replicas := "baz", 2
var podName string var rcName, ip string
framework := Framework{BaseName: "cluster-upgrade"} var ingress api.LoadBalancerIngress
var webserver *WebserverTest f := Framework{BaseName: "cluster-upgrade"}
var w *WebserverTest
BeforeEach(func() { BeforeEach(func() {
framework.beforeEach() By("Setting up the service, RC, and pods")
webserver = NewWebserverTest(framework.Client, framework.Namespace.Name, svcName) f.beforeEach()
pod := webserver.CreateWebserverPod() w = NewWebserverTest(f.Client, f.Namespace.Name, svcName)
podName = pod.Name rc := w.CreateWebserverRC(replicas)
svc := webserver.BuildServiceSpec() rcName = rc.ObjectMeta.Name
svc := w.BuildServiceSpec()
svc.Spec.Type = api.ServiceTypeLoadBalancer svc.Spec.Type = api.ServiceTypeLoadBalancer
webserver.CreateService(svc) w.CreateService(svc)
})
AfterEach(func() { By("Waiting for the service to become reachable")
framework.afterEach() result, err := waitForLoadBalancerIngress(f.Client, svcName, f.Namespace.Name)
webserver.Cleanup()
})
Describe("kube-push", func() {
It("of master should maintain responsive services", func() {
testClusterUpgrade(framework, svcName, podName, func() {
runUpgradeScript("hack/e2e-internal/e2e-push.sh", "-m")
})
})
})
Describe("gce-upgrade", func() {
It("of master should maintain responsive services", func() {
if !providerIs("gce") {
By(fmt.Sprintf("Skippingt test, which is not implemented for %s", testContext.Provider))
return
}
testClusterUpgrade(framework, svcName, podName, func() {
runUpgradeScript("hack/e2e-internal/e2e-upgrade.sh", "-M", "-l")
})
})
})
})
})
func testClusterUpgrade(framework Framework, svcName, podName string, upgrade func()) {
result, err := waitForLoadBalancerIngress(framework.Client, svcName, framework.Namespace.Name)
Expect(err).NotTo(HaveOccurred()) Expect(err).NotTo(HaveOccurred())
ingresses := result.Status.LoadBalancer.Ingress ingresses := result.Status.LoadBalancer.Ingress
if len(ingresses) != 1 { if len(ingresses) != 1 {
Failf("Was expecting only 1 ingress IP but got %d (%v): %v", len(ingresses), ingresses, result) Failf("Was expecting only 1 ingress IP but got %d (%v): %v", len(ingresses), ingresses, result)
} }
ingress := ingresses[0] ingress = ingresses[0]
ip := ingress.IP Logf("Got load balancer ingress point %v", ingress)
ip = ingress.IP
if ip == "" { if ip == "" {
ip = ingress.Hostname ip = ingress.Hostname
} }
By("Waiting for pod to become reachable")
testLoadBalancerReachable(ingress, 80) testLoadBalancerReachable(ingress, 80)
validateClusterUpgrade(framework, svcName, podName)
Logf("starting async validation") // TODO(mbforbes): Add setup, validate, and teardown for:
// - secrets
// - volumes
// - persistent volumes
})
AfterEach(func() {
f.afterEach()
w.Cleanup()
})
Describe("kube-push", func() {
It("of master should maintain responsive services", func() {
By("Validating cluster before master upgrade")
expectNoError(validate(f, svcName, rcName, ingress, replicas))
By("Performing a master upgrade")
testMasterUpgrade(ip, masterPush)
By("Validating cluster after master upgrade")
expectNoError(validate(f, svcName, rcName, ingress, replicas))
})
})
Describe("gce-upgrade-master", func() {
It("should maintain responsive services", func() {
// TODO(mbforbes): Add GKE support.
if !providerIs("gce") {
By(fmt.Sprintf("Skipping upgrade test, which is not implemented for %s", testContext.Provider))
return
}
By("Validating cluster before master upgrade")
expectNoError(validate(f, svcName, rcName, ingress, replicas))
By("Performing a master upgrade")
testMasterUpgrade(ip, masterUpgrade)
By("Validating cluster after master upgrade")
expectNoError(validate(f, svcName, rcName, ingress, replicas))
})
})
Describe("gce-upgrade-cluster", func() {
var tmplBefore, tmplAfter string
BeforeEach(func() {
By("Getting the node template before the upgrade")
var err error
tmplBefore, err = migTemplate()
expectNoError(err)
})
AfterEach(func() {
By("Cleaning up any unused node templates")
var err error
tmplAfter, err = migTemplate()
if err != nil {
Logf("Could not get node template post-upgrade; may have leaked template %s", tmplBefore)
return
}
if tmplBefore == tmplAfter {
// The node upgrade failed so there's no need to delete
// anything.
Logf("Node template %s is still in use; not cleaning up", tmplBefore)
return
}
// TODO(mbforbes): Distinguish between transient failures
// and "cannot delete--in use" errors and retry on the
// former.
Logf("Deleting node template %s", tmplBefore)
o, err := exec.Command("gcloud", "compute", "instance-templates",
fmt.Sprintf("--project=%s", testContext.CloudConfig.ProjectID),
"delete",
tmplBefore).CombinedOutput()
if err != nil {
Logf("gcloud compute instance-templates delete %s call failed with err: %v, output: %s",
tmplBefore, err, string(o))
Logf("May have leaked %s", tmplBefore)
}
})
It("should maintain a functioning cluster", func() {
// TODO(mbforbes): Add GKE support.
if !providerIs("gce") {
By(fmt.Sprintf("Skipping upgrade test, which is not implemented for %s", testContext.Provider))
return
}
By("Validating cluster before master upgrade")
expectNoError(validate(f, svcName, rcName, ingress, replicas))
By("Performing a master upgrade")
testMasterUpgrade(ip, masterUpgrade)
By("Validating cluster after master upgrade")
expectNoError(validate(f, svcName, rcName, ingress, replicas))
By("Performing a node upgrade")
testNodeUpgrade(f, nodeUpgrade, replicas)
By("Validating cluster after node upgrade")
expectNoError(validate(f, svcName, rcName, ingress, replicas))
})
})
})
})
func testMasterUpgrade(ip string, mUp func() error) {
Logf("Starting async validation")
httpClient := http.Client{Timeout: 2 * time.Second} httpClient := http.Client{Timeout: 2 * time.Second}
done := make(chan struct{}, 1) done := make(chan struct{}, 1)
// Let's make sure we've finished the heartbeat before shutting things down. // Let's make sure we've finished the heartbeat before shutting things down.
@ -107,55 +214,94 @@ func testClusterUpgrade(framework Framework, svcName, podName string, upgrade fu
wg.Add(1) wg.Add(1)
defer wg.Done() defer wg.Done()
expectNoError(wait.Poll(poll, singleCallTimeout, func() (bool, error) { if err := wait.Poll(poll, singleCallTimeout, func() (bool, error) {
r, err := httpClient.Get("http://" + ip) r, err := httpClient.Get("http://" + ip)
if err != nil { if err != nil {
Logf("Error reaching %s: %v", ip, err)
return false, nil return false, nil
} }
if r.StatusCode < http.StatusOK || r.StatusCode >= http.StatusNotFound { if r.StatusCode < http.StatusOK || r.StatusCode >= http.StatusNotFound {
Logf("Bad response; status: %d, response: %v", r.StatusCode, r)
return false, nil return false, nil
} }
return true, nil return true, nil
})) }); err != nil {
// We log the error here because the test will fail at the very end
// because this validation runs in another goroutine. Without this,
// a failure is very confusing to track down because from the logs
// everything looks fine.
msg := fmt.Sprintf("Failed to contact service during master upgrade: %v", err)
Logf(msg)
Failf(msg)
}
}, 200*time.Millisecond, done) }, 200*time.Millisecond, done)
By("Starting upgrade") Logf("Starting master upgrade")
upgrade() expectNoError(mUp())
done <- struct{}{} done <- struct{}{}
Logf("Stopping async validation") Logf("Stopping async validation")
wg.Wait() wg.Wait()
Logf("Upgrade complete.") Logf("Master upgrade complete")
By("Validating post upgrade state")
validateClusterUpgrade(framework, svcName, podName)
} }
func runUpgradeScript(scriptPath string, args ...string) { func testNodeUpgrade(f Framework, nUp func(f Framework, n int) error, replicas int) {
cmd := exec.Command(path.Join(testContext.RepoRoot, scriptPath), args...) Logf("Starting node upgrade")
upgradeLogPath := path.Join(testContext.OutputDir, "upgrade-"+string(util.NewUUID())+".log") expectNoError(nUp(f, replicas))
Logf("Writing upgrade logs to %s", upgradeLogPath) Logf("Node upgrade complete")
upgradeLog, err := os.Create(upgradeLogPath)
expectNoError(err) // TODO(mbforbes): Validate that:
// - the node software version truly changed
cmd.Stdout = io.MultiWriter(os.Stdout, upgradeLog)
cmd.Stderr = io.MultiWriter(os.Stderr, upgradeLog)
if err := cmd.Run(); err != nil {
Failf("Upgrade failed: %v", err)
}
} }
func validateClusterUpgrade(framework Framework, svcName, podName string) { // runScript runs script on testContext.RepoRoot using args and returns
pods, err := framework.Client.Pods(framework.Namespace.Name).List(labels.Everything(), fields.Everything()) // stdout, stderr, and error.
Expect(err).NotTo(HaveOccurred()) func runScript(script string, args ...string) (string, string, error) {
Expect(len(pods.Items) == 1).Should(BeTrue()) Logf("Running %s %v", script, args)
if podName != pods.Items[0].Name { var bout, berr bytes.Buffer
Failf("pod name should not have changed") cmd := exec.Command(path.Join(testContext.RepoRoot, script), args...)
cmd.Stdout, cmd.Stderr = &bout, &berr
err := cmd.Run()
stdout, stderr := bout.String(), berr.String()
if err != nil {
return "", "", fmt.Errorf("error running %s %v; got error %v, stdout %q, stderr %q",
script, args, err, stdout, stderr)
} }
_, err = podRunningReady(&pods.Items[0]) Logf("stdout: %s", stdout)
Expect(err).NotTo(HaveOccurred()) Logf("stderr: %s", stderr)
svc, err := framework.Client.Services(framework.Namespace.Name).Get(svcName) return stdout, stderr, nil
Expect(err).NotTo(HaveOccurred())
if svcName != svc.Name {
Failf("service name should not have changed")
} }
func validate(f Framework, svcNameWant, rcNameWant string, ingress api.LoadBalancerIngress, podsWant int) error {
Logf("Beginning cluster validation")
// Verify RC.
rcs, err := f.Client.ReplicationControllers(f.Namespace.Name).List(labels.Everything())
if err != nil {
return fmt.Errorf("error listing RCs: %v", err)
}
if len(rcs.Items) != 1 {
return fmt.Errorf("wanted 1 RC with name %s, got %d", rcNameWant, len(rcs.Items))
}
if got := rcs.Items[0].Name; got != rcNameWant {
return fmt.Errorf("wanted RC name %q, got %q", rcNameWant, got)
}
// Verify pods.
if err := verifyPods(f.Client, f.Namespace.Name, rcNameWant, false, podsWant); err != nil {
return fmt.Errorf("failed to find %d %q pods: %v", podsWant, rcNameWant, err)
}
// Verify service.
svc, err := f.Client.Services(f.Namespace.Name).Get(svcNameWant)
if err != nil {
return fmt.Errorf("error getting service %s: %v", svcNameWant, err)
}
if svcNameWant != svc.Name {
return fmt.Errorf("wanted service name %q, got %q", svcNameWant, svc.Name)
}
// TODO(mbforbes): Make testLoadBalancerReachable return an error.
testLoadBalancerReachable(ingress, 80)
Logf("Cluster validation succeeded")
return nil
} }

View File

@ -141,7 +141,7 @@ func ServeImageOrFail(c *client.Client, test string, image string) {
By("Trying to dial each unique pod") By("Trying to dial each unique pod")
retryTimeout := 2 * time.Minute retryTimeout := 2 * time.Minute
retryInterval := 5 * time.Second retryInterval := 5 * time.Second
err = wait.Poll(retryInterval, retryTimeout, podResponseChecker{c, ns, label, name, pods}.checkAllResponses) err = wait.Poll(retryInterval, retryTimeout, podResponseChecker{c, ns, label, name, true, pods}.checkAllResponses)
if err != nil { if err != nil {
Failf("Did not get expected responses within the timeout period of %.2f seconds.", retryTimeout.Seconds()) Failf("Did not get expected responses within the timeout period of %.2f seconds.", retryTimeout.Seconds())
} }

View File

@ -155,7 +155,8 @@ func issueSSHCommand(node *api.Node, provider, cmd string) error {
// failed step, it will return false through result and not run the rest. // failed step, it will return false through result and not run the rest.
func rebootNode(c *client.Client, provider, name, rebootCmd string, result chan bool) { func rebootNode(c *client.Client, provider, name, rebootCmd string, result chan bool) {
// Setup // Setup
ps := newPodStore(c, api.NamespaceDefault, labels.Everything(), fields.OneTermEqualSelector(client.PodHost, name)) ns := api.NamespaceDefault
ps := newPodStore(c, ns, labels.Everything(), fields.OneTermEqualSelector(client.PodHost, name))
defer ps.Stop() defer ps.Stop()
// Get the node initially. // Get the node initially.
@ -183,7 +184,7 @@ func rebootNode(c *client.Client, provider, name, rebootCmd string, result chan
// For each pod, we do a sanity check to ensure it's running / healthy // For each pod, we do a sanity check to ensure it's running / healthy
// now, as that's what we'll be checking later. // now, as that's what we'll be checking later.
if !checkPodsRunningReady(c, podNames, podReadyBeforeTimeout) { if !checkPodsRunningReady(c, ns, podNames, podReadyBeforeTimeout) {
result <- false result <- false
return return
} }
@ -209,7 +210,7 @@ func rebootNode(c *client.Client, provider, name, rebootCmd string, result chan
// Ensure all of the pods that we found on this node before the reboot are // Ensure all of the pods that we found on this node before the reboot are
// running / healthy. // running / healthy.
if !checkPodsRunningReady(c, podNames, rebootPodReadyAgainTimeout) { if !checkPodsRunningReady(c, ns, podNames, rebootPodReadyAgainTimeout) {
result <- false result <- false
return return
} }

View File

@ -24,6 +24,7 @@ import (
"time" "time"
"github.com/GoogleCloudPlatform/kubernetes/pkg/api" "github.com/GoogleCloudPlatform/kubernetes/pkg/api"
"github.com/GoogleCloudPlatform/kubernetes/pkg/api/latest"
"github.com/GoogleCloudPlatform/kubernetes/pkg/client" "github.com/GoogleCloudPlatform/kubernetes/pkg/client"
"github.com/GoogleCloudPlatform/kubernetes/pkg/fields" "github.com/GoogleCloudPlatform/kubernetes/pkg/fields"
"github.com/GoogleCloudPlatform/kubernetes/pkg/labels" "github.com/GoogleCloudPlatform/kubernetes/pkg/labels"
@ -34,9 +35,9 @@ import (
. "github.com/onsi/gomega" . "github.com/onsi/gomega"
) )
var serveHostnameImage string = "gcr.io/google_containers/serve_hostname:1.1" const serveHostnameImage = "gcr.io/google_containers/serve_hostname:1.1"
func resizeNodeInstanceGroup(size int) error { func resizeGroup(size int) error {
// TODO: make this hit the compute API directly instread of shelling out to gcloud. // TODO: make this hit the compute API directly instread of shelling out to gcloud.
output, err := exec.Command("gcloud", "preview", "managed-instance-groups", "--project="+testContext.CloudConfig.ProjectID, "--zone="+testContext.CloudConfig.Zone, output, err := exec.Command("gcloud", "preview", "managed-instance-groups", "--project="+testContext.CloudConfig.ProjectID, "--zone="+testContext.CloudConfig.Zone,
"resize", testContext.CloudConfig.NodeInstanceGroup, fmt.Sprintf("--new-size=%v", size)).CombinedOutput() "resize", testContext.CloudConfig.NodeInstanceGroup, fmt.Sprintf("--new-size=%v", size)).CombinedOutput()
@ -46,7 +47,7 @@ func resizeNodeInstanceGroup(size int) error {
return err return err
} }
func nodeInstanceGroupSize() (int, error) { func groupSize() (int, error) {
// TODO: make this hit the compute API directly instread of shelling out to gcloud. // TODO: make this hit the compute API directly instread of shelling out to gcloud.
output, err := exec.Command("gcloud", "preview", "managed-instance-groups", "--project="+testContext.CloudConfig.ProjectID, output, err := exec.Command("gcloud", "preview", "managed-instance-groups", "--project="+testContext.CloudConfig.ProjectID,
"--zone="+testContext.CloudConfig.Zone, "describe", testContext.CloudConfig.NodeInstanceGroup).CombinedOutput() "--zone="+testContext.CloudConfig.Zone, "describe", testContext.CloudConfig.NodeInstanceGroup).CombinedOutput()
@ -71,9 +72,9 @@ func nodeInstanceGroupSize() (int, error) {
return currentSize, nil return currentSize, nil
} }
func waitForNodeInstanceGroupSize(size int) error { func waitForGroupSize(size int) error {
for start := time.Now(); time.Since(start) < 4*time.Minute; time.Sleep(5 * time.Second) { for start := time.Now(); time.Since(start) < 4*time.Minute; time.Sleep(5 * time.Second) {
currentSize, err := nodeInstanceGroupSize() currentSize, err := groupSize()
if err != nil { if err != nil {
Logf("Failed to get node instance group size: %v", err) Logf("Failed to get node instance group size: %v", err)
continue continue
@ -104,7 +105,7 @@ func waitForClusterSize(c *client.Client, size int) error {
return fmt.Errorf("timeout waiting for cluster size to be %d", size) return fmt.Errorf("timeout waiting for cluster size to be %d", size)
} }
func newServiceWithNameSelector(name string) *api.Service { func svcByName(name string) *api.Service {
return &api.Service{ return &api.Service{
ObjectMeta: api.ObjectMeta{ ObjectMeta: api.ObjectMeta{
Name: "test-service", Name: "test-service",
@ -121,12 +122,12 @@ func newServiceWithNameSelector(name string) *api.Service {
} }
} }
func createServiceWithNameSelector(c *client.Client, ns, name string) error { func newSVCByName(c *client.Client, ns, name string) error {
_, err := c.Services(ns).Create(newServiceWithNameSelector(name)) _, err := c.Services(ns).Create(svcByName(name))
return err return err
} }
func newPodOnNode(podName, nodeName string, image string) *api.Pod { func podOnNode(podName, nodeName string, image string) *api.Pod {
return &api.Pod{ return &api.Pod{
ObjectMeta: api.ObjectMeta{ ObjectMeta: api.ObjectMeta{
Name: podName, Name: podName,
@ -148,18 +149,39 @@ func newPodOnNode(podName, nodeName string, image string) *api.Pod {
} }
} }
func createServeHostnamePodOnNode(c *client.Client, namespace, podName, nodeName string) error { func newPodOnNode(c *client.Client, namespace, podName, nodeName string) error {
pod, err := c.Pods(namespace).Create(newPodOnNode(podName, nodeName, serveHostnameImage)) pod, err := c.Pods(namespace).Create(podOnNode(podName, nodeName, serveHostnameImage))
if err == nil { if err == nil {
Logf("Created pod %s on node %s", pod.ObjectMeta.Name, nodeName) Logf("Created pod %s on node %s", pod.ObjectMeta.Name, nodeName)
} else { } else {
Logf("Failed to create pod %s on node %s: %s", podName, nodeName, err) Logf("Failed to create pod %s on node %s: %v", podName, nodeName, err)
} }
return err return err
} }
func newReplicationControllerWithNameSelector(name string, replicas int, image string) *api.ReplicationController { func rcByName(name string, replicas int, image string, labels map[string]string) *api.ReplicationController {
return rcByNameContainer(name, replicas, image, labels, api.Container{
Name: name,
Image: image,
})
}
func rcByNamePort(name string, replicas int, image string, port int, labels map[string]string) *api.ReplicationController {
return rcByNameContainer(name, replicas, image, labels, api.Container{
Name: name,
Image: image,
Ports: []api.ContainerPort{{ContainerPort: port}},
})
}
func rcByNameContainer(name string, replicas int, image string, labels map[string]string, c api.Container) *api.ReplicationController {
// Add "name": name to the labels, overwriting if it exists.
labels["name"] = name
return &api.ReplicationController{ return &api.ReplicationController{
TypeMeta: api.TypeMeta{
Kind: "ReplicationController",
APIVersion: latest.Version,
},
ObjectMeta: api.ObjectMeta{ ObjectMeta: api.ObjectMeta{
Name: name, Name: name,
}, },
@ -170,28 +192,24 @@ func newReplicationControllerWithNameSelector(name string, replicas int, image s
}, },
Template: &api.PodTemplateSpec{ Template: &api.PodTemplateSpec{
ObjectMeta: api.ObjectMeta{ ObjectMeta: api.ObjectMeta{
Labels: map[string]string{"name": name}, Labels: labels,
}, },
Spec: api.PodSpec{ Spec: api.PodSpec{
Containers: []api.Container{ Containers: []api.Container{c},
{
Name: name,
Image: image,
Ports: []api.ContainerPort{{ContainerPort: 9376}},
},
},
}, },
}, },
}, },
} }
} }
func createServeHostnameReplicationController(c *client.Client, ns, name string, replicas int) (*api.ReplicationController, error) { // newRCByName creates a replication controller with a selector by name of name.
func newRCByName(c *client.Client, ns, name string, replicas int) (*api.ReplicationController, error) {
By(fmt.Sprintf("creating replication controller %s", name)) By(fmt.Sprintf("creating replication controller %s", name))
return c.ReplicationControllers(ns).Create(newReplicationControllerWithNameSelector(name, replicas, serveHostnameImage)) return c.ReplicationControllers(ns).Create(rcByNamePort(
name, replicas, serveHostnameImage, 9376, map[string]string{}))
} }
func resizeReplicationController(c *client.Client, ns, name string, replicas int) error { func resizeRC(c *client.Client, ns, name string, replicas int) error {
rc, err := c.ReplicationControllers(ns).Get(name) rc, err := c.ReplicationControllers(ns).Get(name)
if err != nil { if err != nil {
return err return err
@ -201,7 +219,7 @@ func resizeReplicationController(c *client.Client, ns, name string, replicas int
return err return err
} }
func waitForPodsCreated(c *client.Client, ns, name string, replicas int) (*api.PodList, error) { func podsCreated(c *client.Client, ns, name string, replicas int) (*api.PodList, error) {
// List the pods, making sure we observe all the replicas. // List the pods, making sure we observe all the replicas.
label := labels.SelectorFromSet(labels.Set(map[string]string{"name": name})) label := labels.SelectorFromSet(labels.Set(map[string]string{"name": name}))
for start := time.Now(); time.Since(start) < time.Minute; time.Sleep(5 * time.Second) { for start := time.Now(); time.Since(start) < time.Minute; time.Sleep(5 * time.Second) {
@ -218,7 +236,7 @@ func waitForPodsCreated(c *client.Client, ns, name string, replicas int) (*api.P
return nil, fmt.Errorf("Pod name %s: Gave up waiting for %d pods to come up", name, replicas) return nil, fmt.Errorf("Pod name %s: Gave up waiting for %d pods to come up", name, replicas)
} }
func waitForPodsRunning(c *client.Client, pods *api.PodList) []error { func podsRunning(c *client.Client, pods *api.PodList) []error {
// Wait for the pods to enter the running state. Waiting loops until the pods // Wait for the pods to enter the running state. Waiting loops until the pods
// are running so non-running pods cause a timeout for this test. // are running so non-running pods cause a timeout for this test.
By("ensuring each pod is running") By("ensuring each pod is running")
@ -233,24 +251,24 @@ func waitForPodsRunning(c *client.Client, pods *api.PodList) []error {
return e return e
} }
func verifyPodsResponding(c *client.Client, ns, name string, pods *api.PodList) error { func podsResponding(c *client.Client, ns, name string, wantName bool, pods *api.PodList) error {
By("trying to dial each unique pod") By("trying to dial each unique pod")
retryTimeout := 2 * time.Minute retryTimeout := 2 * time.Minute
retryInterval := 5 * time.Second retryInterval := 5 * time.Second
label := labels.SelectorFromSet(labels.Set(map[string]string{"name": name})) label := labels.SelectorFromSet(labels.Set(map[string]string{"name": name}))
return wait.Poll(retryInterval, retryTimeout, podResponseChecker{c, ns, label, name, pods}.checkAllResponses) return wait.Poll(retryInterval, retryTimeout, podResponseChecker{c, ns, label, name, wantName, pods}.checkAllResponses)
} }
func waitForPodsCreatedRunningResponding(c *client.Client, ns, name string, replicas int) error { func verifyPods(c *client.Client, ns, name string, wantName bool, replicas int) error {
pods, err := waitForPodsCreated(c, ns, name, replicas) pods, err := podsCreated(c, ns, name, replicas)
if err != nil { if err != nil {
return err return err
} }
e := waitForPodsRunning(c, pods) e := podsRunning(c, pods)
if len(e) > 0 { if len(e) > 0 {
return fmt.Errorf("Failed to wait for pods running: %v", e) return fmt.Errorf("Failed to wait for pods running: %v", e)
} }
err = verifyPodsResponding(c, ns, name, pods) err = podsResponding(c, ns, name, wantName, pods)
if err != nil { if err != nil {
return err return err
} }
@ -331,7 +349,7 @@ func performTemporaryNetworkFailure(c *client.Client, ns, rcName string, replica
waitForRCPodToDisappear(c, ns, rcName, podNameToDisappear) waitForRCPodToDisappear(c, ns, rcName, podNameToDisappear)
By("verifying whether the pod from the unreachable node is recreated") By("verifying whether the pod from the unreachable node is recreated")
err := waitForPodsCreatedRunningResponding(c, ns, rcName, replicas) err := verifyPods(c, ns, rcName, true, replicas)
Expect(err).NotTo(HaveOccurred()) Expect(err).NotTo(HaveOccurred())
// network traffic is unblocked in a defered function // network traffic is unblocked in a defered function
@ -372,10 +390,10 @@ var _ = Describe("Nodes", func() {
return return
} }
By("restoring the original node instance group size") By("restoring the original node instance group size")
if err := resizeNodeInstanceGroup(testContext.CloudConfig.NumNodes); err != nil { if err := resizeGroup(testContext.CloudConfig.NumNodes); err != nil {
Failf("Couldn't restore the original node instance group size: %v", err) Failf("Couldn't restore the original node instance group size: %v", err)
} }
if err := waitForNodeInstanceGroupSize(testContext.CloudConfig.NumNodes); err != nil { if err := waitForGroupSize(testContext.CloudConfig.NumNodes); err != nil {
Failf("Couldn't restore the original node instance group size: %v", err) Failf("Couldn't restore the original node instance group size: %v", err)
} }
if err := waitForClusterSize(c, testContext.CloudConfig.NumNodes); err != nil { if err := waitForClusterSize(c, testContext.CloudConfig.NumNodes); err != nil {
@ -396,20 +414,20 @@ var _ = Describe("Nodes", func() {
// The source for the Docker containter kubernetes/serve_hostname is in contrib/for-demos/serve_hostname // The source for the Docker containter kubernetes/serve_hostname is in contrib/for-demos/serve_hostname
name := "my-hostname-delete-node" name := "my-hostname-delete-node"
replicas := testContext.CloudConfig.NumNodes replicas := testContext.CloudConfig.NumNodes
createServeHostnameReplicationController(c, ns, name, replicas) newRCByName(c, ns, name, replicas)
err := waitForPodsCreatedRunningResponding(c, ns, name, replicas) err := verifyPods(c, ns, name, true, replicas)
Expect(err).NotTo(HaveOccurred()) Expect(err).NotTo(HaveOccurred())
By(fmt.Sprintf("decreasing cluster size to %d", replicas-1)) By(fmt.Sprintf("decreasing cluster size to %d", replicas-1))
err = resizeNodeInstanceGroup(replicas - 1) err = resizeGroup(replicas - 1)
Expect(err).NotTo(HaveOccurred()) Expect(err).NotTo(HaveOccurred())
err = waitForNodeInstanceGroupSize(replicas - 1) err = waitForGroupSize(replicas - 1)
Expect(err).NotTo(HaveOccurred()) Expect(err).NotTo(HaveOccurred())
err = waitForClusterSize(c, replicas-1) err = waitForClusterSize(c, replicas-1)
Expect(err).NotTo(HaveOccurred()) Expect(err).NotTo(HaveOccurred())
By("verifying whether the pods from the removed node are recreated") By("verifying whether the pods from the removed node are recreated")
err = waitForPodsCreatedRunningResponding(c, ns, name, replicas) err = verifyPods(c, ns, name, true, replicas)
Expect(err).NotTo(HaveOccurred()) Expect(err).NotTo(HaveOccurred())
}) })
@ -426,24 +444,24 @@ var _ = Describe("Nodes", func() {
// Create a replication controller for a service that serves its hostname. // Create a replication controller for a service that serves its hostname.
// The source for the Docker containter kubernetes/serve_hostname is in contrib/for-demos/serve_hostname // The source for the Docker containter kubernetes/serve_hostname is in contrib/for-demos/serve_hostname
name := "my-hostname-add-node" name := "my-hostname-add-node"
createServiceWithNameSelector(c, ns, name) newSVCByName(c, ns, name)
replicas := testContext.CloudConfig.NumNodes replicas := testContext.CloudConfig.NumNodes
createServeHostnameReplicationController(c, ns, name, replicas) newRCByName(c, ns, name, replicas)
err := waitForPodsCreatedRunningResponding(c, ns, name, replicas) err := verifyPods(c, ns, name, true, replicas)
Expect(err).NotTo(HaveOccurred()) Expect(err).NotTo(HaveOccurred())
By(fmt.Sprintf("increasing cluster size to %d", replicas+1)) By(fmt.Sprintf("increasing cluster size to %d", replicas+1))
err = resizeNodeInstanceGroup(replicas + 1) err = resizeGroup(replicas + 1)
Expect(err).NotTo(HaveOccurred()) Expect(err).NotTo(HaveOccurred())
err = waitForNodeInstanceGroupSize(replicas + 1) err = waitForGroupSize(replicas + 1)
Expect(err).NotTo(HaveOccurred()) Expect(err).NotTo(HaveOccurred())
err = waitForClusterSize(c, replicas+1) err = waitForClusterSize(c, replicas+1)
Expect(err).NotTo(HaveOccurred()) Expect(err).NotTo(HaveOccurred())
By(fmt.Sprintf("increasing size of the replication controller to %d and verifying all pods are running", replicas+1)) By(fmt.Sprintf("increasing size of the replication controller to %d and verifying all pods are running", replicas+1))
err = resizeReplicationController(c, ns, name, replicas+1) err = resizeRC(c, ns, name, replicas+1)
Expect(err).NotTo(HaveOccurred()) Expect(err).NotTo(HaveOccurred())
err = waitForPodsCreatedRunningResponding(c, ns, name, replicas+1) err = verifyPods(c, ns, name, true, replicas+1)
Expect(err).NotTo(HaveOccurred()) Expect(err).NotTo(HaveOccurred())
}) })
}) })
@ -472,10 +490,10 @@ var _ = Describe("Nodes", func() {
// Create a replication controller for a service that serves its hostname. // Create a replication controller for a service that serves its hostname.
// The source for the Docker containter kubernetes/serve_hostname is in contrib/for-demos/serve_hostname // The source for the Docker containter kubernetes/serve_hostname is in contrib/for-demos/serve_hostname
name := "my-hostname-net" name := "my-hostname-net"
createServiceWithNameSelector(c, ns, name) newSVCByName(c, ns, name)
replicas := testContext.CloudConfig.NumNodes replicas := testContext.CloudConfig.NumNodes
createServeHostnameReplicationController(c, ns, name, replicas) newRCByName(c, ns, name, replicas)
err := waitForPodsCreatedRunningResponding(c, ns, name, replicas) err := verifyPods(c, ns, name, true, replicas)
Expect(err).NotTo(HaveOccurred(), "Each pod should start running and responding") Expect(err).NotTo(HaveOccurred(), "Each pod should start running and responding")
By("choose a node with at least one pod - we will block some network traffic on this node") By("choose a node with at least one pod - we will block some network traffic on this node")
@ -496,9 +514,9 @@ var _ = Describe("Nodes", func() {
// increasing the RC size is not a valid way to test this // increasing the RC size is not a valid way to test this
// since we have no guarantees the pod will be scheduled on our node. // since we have no guarantees the pod will be scheduled on our node.
additionalPod := "additionalpod" additionalPod := "additionalpod"
err = createServeHostnamePodOnNode(c, ns, additionalPod, node.Name) err = newPodOnNode(c, ns, additionalPod, node.Name)
Expect(err).NotTo(HaveOccurred()) Expect(err).NotTo(HaveOccurred())
err = waitForPodsCreatedRunningResponding(c, ns, additionalPod, 1) err = verifyPods(c, ns, additionalPod, true, 1)
Expect(err).NotTo(HaveOccurred()) Expect(err).NotTo(HaveOccurred())
// verify that it is really on the requested node // verify that it is really on the requested node

View File

@ -86,7 +86,8 @@ var _ = Describe("Restart", func() {
for i, p := range pods { for i, p := range pods {
podNamesBefore[i] = p.ObjectMeta.Name podNamesBefore[i] = p.ObjectMeta.Name
} }
if !checkPodsRunningReady(c, podNamesBefore, podReadyBeforeTimeout) { ns := api.NamespaceDefault
if !checkPodsRunningReady(c, ns, podNamesBefore, podReadyBeforeTimeout) {
Failf("At least one pod wasn't running and ready at test start.") Failf("At least one pod wasn't running and ready at test start.")
} }
@ -115,7 +116,7 @@ var _ = Describe("Restart", func() {
podNamesAfter, err := waitForNPods(ps, len(podNamesBefore), restartPodReadyAgainTimeout) podNamesAfter, err := waitForNPods(ps, len(podNamesBefore), restartPodReadyAgainTimeout)
Expect(err).NotTo(HaveOccurred()) Expect(err).NotTo(HaveOccurred())
remaining := restartPodReadyAgainTimeout - time.Since(podCheckStart) remaining := restartPodReadyAgainTimeout - time.Since(podCheckStart)
if !checkPodsRunningReady(c, podNamesAfter, remaining) { if !checkPodsRunningReady(c, ns, podNamesAfter, remaining) {
Failf("At least one pod wasn't running and ready after the restart.") Failf("At least one pod wasn't running and ready after the restart.")
} }
}) })
@ -166,7 +167,8 @@ func checkNodesReady(c *client.Client, nt time.Duration, expect int) ([]string,
return false, nil return false, nil
} }
if len(nodeList.Items) != expect { if len(nodeList.Items) != expect {
errLast = fmt.Errorf("expected to find %d nodes but found only %d", expect, len(nodeList.Items)) errLast = fmt.Errorf("expected to find %d nodes but found only %d (%v elapsed)",
expect, len(nodeList.Items), time.Since(start))
Logf("%v", errLast) Logf("%v", errLast)
return false, nil return false, nil
} }
@ -180,6 +182,7 @@ func checkNodesReady(c *client.Client, nt time.Duration, expect int) ([]string,
return nodeNames, fmt.Errorf("couldn't find %d nodes within %v; last error: %v", return nodeNames, fmt.Errorf("couldn't find %d nodes within %v; last error: %v",
expect, nt, errLast) expect, nt, errLast)
} }
Logf("Successfully found %d nodes", expect)
// Next, ensure in parallel that all the nodes are ready. We subtract the // Next, ensure in parallel that all the nodes are ready. We subtract the
// time we spent waiting above. // time we spent waiting above.
@ -209,28 +212,32 @@ func checkNodesReady(c *client.Client, nt time.Duration, expect int) ([]string,
func restartNodes(provider string, nt time.Duration) error { func restartNodes(provider string, nt time.Duration) error {
switch provider { switch provider {
case "gce": case "gce":
return migRollingUpdate(nt) return migRollingUpdateSelf(nt)
default: default:
return fmt.Errorf("restartNodes(...) not implemented for %s", provider) return fmt.Errorf("restartNodes(...) not implemented for %s", provider)
} }
} }
// migRollingUpdate starts a MIG rolling update and waits up to nt times the func migRollingUpdateSelf(nt time.Duration) error {
// nubmer of nodes for it to complete.
func migRollingUpdate(nt time.Duration) error {
By("getting the name of the template for the managed instance group") By("getting the name of the template for the managed instance group")
templ, err := migTemplate() tmpl, err := migTemplate()
if err != nil { if err != nil {
return fmt.Errorf("couldn't get MIG template name: %v", err) return fmt.Errorf("couldn't get MIG template name: %v", err)
} }
return migRollingUpdate(tmpl, nt)
}
By("starting the managed instance group rolling update") // migRollingUpdate starts a MIG rolling update, upgrading the nodes to a new
id, err := migRollingUpdateStart(templ, nt) // instance template named tmpl, and waits up to nt times the nubmer of nodes
// for it to complete.
func migRollingUpdate(tmpl string, nt time.Duration) error {
By(fmt.Sprintf("starting the MIG rolling update to %s", tmpl))
id, err := migRollingUpdateStart(tmpl, nt)
if err != nil { if err != nil {
return fmt.Errorf("couldn't start the MIG rolling update: %v", err) return fmt.Errorf("couldn't start the MIG rolling update: %v", err)
} }
By("polling the managed instance group rolling update until it completes") By(fmt.Sprintf("polling the MIG rolling update (%s) until it completes", id))
if err := migRollingUpdatePoll(id, nt); err != nil { if err := migRollingUpdatePoll(id, nt); err != nil {
return fmt.Errorf("err waiting until update completed: %v", err) return fmt.Errorf("err waiting until update completed: %v", err)
} }
@ -285,6 +292,8 @@ func migRollingUpdateStart(templ string, nt time.Duration) (string, error) {
if err := wait.Poll(poll, singleCallTimeout, func() (bool, error) { if err := wait.Poll(poll, singleCallTimeout, func() (bool, error) {
// TODO(mbforbes): make this hit the compute API directly instead of // TODO(mbforbes): make this hit the compute API directly instead of
// shelling out to gcloud. // shelling out to gcloud.
// NOTE(mbforbes): If you are changing this gcloud command, update
// cluster/gce/upgrade.sh to match this EXACTLY.
o, err := exec.Command("gcloud", "preview", "rolling-updates", o, err := exec.Command("gcloud", "preview", "rolling-updates",
fmt.Sprintf("--project=%s", testContext.CloudConfig.ProjectID), fmt.Sprintf("--project=%s", testContext.CloudConfig.ProjectID),
fmt.Sprintf("--zone=%s", testContext.CloudConfig.Zone), fmt.Sprintf("--zone=%s", testContext.CloudConfig.Zone),
@ -361,6 +370,7 @@ func migRollingUpdatePoll(id string, nt time.Duration) error {
}) != nil { }) != nil {
return fmt.Errorf("timeout waiting %v for MIG rolling update to complete. Last error: %v", timeout, errLast) return fmt.Errorf("timeout waiting %v for MIG rolling update to complete. Last error: %v", timeout, errLast)
} }
Logf("MIG rolling update complete after %v", time.Since(start))
return nil return nil
} }

View File

@ -22,13 +22,10 @@ import (
"math/rand" "math/rand"
"net/http" "net/http"
"sort" "sort"
"strconv"
"strings" "strings"
"sync/atomic"
"time" "time"
"github.com/GoogleCloudPlatform/kubernetes/pkg/api" "github.com/GoogleCloudPlatform/kubernetes/pkg/api"
"github.com/GoogleCloudPlatform/kubernetes/pkg/api/latest"
"github.com/GoogleCloudPlatform/kubernetes/pkg/client" "github.com/GoogleCloudPlatform/kubernetes/pkg/client"
"github.com/GoogleCloudPlatform/kubernetes/pkg/fields" "github.com/GoogleCloudPlatform/kubernetes/pkg/fields"
"github.com/GoogleCloudPlatform/kubernetes/pkg/labels" "github.com/GoogleCloudPlatform/kubernetes/pkg/labels"
@ -277,7 +274,7 @@ var _ = Describe("Services", func() {
} }
By("creating pod to be part of service " + serviceName) By("creating pod to be part of service " + serviceName)
t.CreateWebserverPod() t.CreateWebserverRC(1)
By("hitting the pod through the service's NodePort") By("hitting the pod through the service's NodePort")
testReachable(pickMinionIP(c), port.NodePort) testReachable(pickMinionIP(c), port.NodePort)
@ -324,7 +321,7 @@ var _ = Describe("Services", func() {
} }
By("creating pod to be part of service " + serviceName) By("creating pod to be part of service " + serviceName)
t.CreateWebserverPod() t.CreateWebserverRC(1)
By("hitting the pod through the service's NodePort") By("hitting the pod through the service's NodePort")
ip := pickMinionIP(c) ip := pickMinionIP(c)
@ -365,7 +362,7 @@ var _ = Describe("Services", func() {
} }
By("creating pod to be part of service " + t.ServiceName) By("creating pod to be part of service " + t.ServiceName)
t.CreateWebserverPod() t.CreateWebserverRC(1)
By("changing service " + serviceName + " to type=NodePort") By("changing service " + serviceName + " to type=NodePort")
service.Spec.Type = api.ServiceTypeNodePort service.Spec.Type = api.ServiceTypeNodePort
@ -515,7 +512,7 @@ var _ = Describe("Services", func() {
Expect(err).NotTo(HaveOccurred()) Expect(err).NotTo(HaveOccurred())
By("creating pod to be part of service " + t.ServiceName) By("creating pod to be part of service " + t.ServiceName)
t.CreateWebserverPod() t.CreateWebserverRC(1)
if service.Spec.Type != api.ServiceTypeLoadBalancer { if service.Spec.Type != api.ServiceTypeLoadBalancer {
Failf("got unexpected Spec.Type for LoadBalancer service: %v", service) Failf("got unexpected Spec.Type for LoadBalancer service: %v", service)
@ -1011,11 +1008,12 @@ func testReachable(ip string, port int) {
Failf("Got port==0 for reachability check (%s)", url) Failf("Got port==0 for reachability check (%s)", url)
} }
By(fmt.Sprintf("Checking reachability of %s", url)) By(fmt.Sprintf("Waiting up to %v for %s to be reachable", podStartTimeout, url))
start := time.Now()
expectNoError(wait.Poll(poll, podStartTimeout, func() (bool, error) { expectNoError(wait.Poll(poll, podStartTimeout, func() (bool, error) {
resp, err := httpGetNoConnectionPool(url) resp, err := httpGetNoConnectionPool(url)
if err != nil { if err != nil {
Logf("Got error waiting for reachability of %s: %v", url, err) Logf("Got error waiting for reachability of %s: %v (%v)", url, err, time.Since(start))
return false, nil return false, nil
} }
defer resp.Body.Close() defer resp.Body.Close()
@ -1044,7 +1042,7 @@ func testNotReachable(ip string, port int) {
Failf("Got port==0 for non-reachability check (%s)", url) Failf("Got port==0 for non-reachability check (%s)", url)
} }
By(fmt.Sprintf("Checking that %s is not reachable", url)) By(fmt.Sprintf("Waiting up to %v for %s to be *not* reachable", podStartTimeout, url))
expectNoError(wait.Poll(poll, podStartTimeout, func() (bool, error) { expectNoError(wait.Poll(poll, podStartTimeout, func() (bool, error) {
resp, err := httpGetNoConnectionPool(url) resp, err := httpGetNoConnectionPool(url)
if err != nil { if err != nil {
@ -1086,11 +1084,10 @@ type WebserverTest struct {
TestId string TestId string
Labels map[string]string Labels map[string]string
pods map[string]bool rcs map[string]bool
services map[string]bool services map[string]bool
name string
// Used for generating e.g. unique pod names image string
sequence int32
} }
func NewWebserverTest(client *client.Client, namespace string, serviceName string) *WebserverTest { func NewWebserverTest(client *client.Client, namespace string, serviceName string) *WebserverTest {
@ -1103,15 +1100,13 @@ func NewWebserverTest(client *client.Client, namespace string, serviceName strin
"testid": t.TestId, "testid": t.TestId,
} }
t.pods = make(map[string]bool) t.rcs = make(map[string]bool)
t.services = make(map[string]bool) t.services = make(map[string]bool)
return t t.name = "webserver"
} t.image = "gcr.io/google_containers/test-webserver"
func (t *WebserverTest) SequenceNext() int { return t
n := atomic.AddInt32(&t.sequence, 1)
return int(n)
} }
// Build default config for a service (which can then be changed) // Build default config for a service (which can then be changed)
@ -1131,43 +1126,27 @@ func (t *WebserverTest) BuildServiceSpec() *api.Service {
return service return service
} }
// Create a pod with the well-known webserver configuration, and record it for cleanup // CreateWebserverRC creates rc-backed pods with the well-known webserver
func (t *WebserverTest) CreateWebserverPod() *api.Pod { // configuration and records it for cleanup.
name := t.ServiceName + "-" + strconv.Itoa(t.SequenceNext()) func (t *WebserverTest) CreateWebserverRC(replicas int) *api.ReplicationController {
pod := &api.Pod{ rcSpec := rcByName(t.name, replicas, t.image, t.Labels)
TypeMeta: api.TypeMeta{ rcAct, err := t.createRC(rcSpec)
Kind: "Pod",
APIVersion: latest.Version,
},
ObjectMeta: api.ObjectMeta{
Name: name,
Labels: t.Labels,
},
Spec: api.PodSpec{
Containers: []api.Container{
{
Name: "webserver",
Image: "gcr.io/google_containers/test-webserver",
},
},
},
}
pod, err := t.CreatePod(pod)
if err != nil { if err != nil {
Failf("Failed to create pod %s: %v", pod.Name, err) Failf("Failed to create rc %s: %v", rcSpec.Name, err)
} }
expectNoError(waitForPodRunningInNamespace(t.Client, pod.Name, t.Namespace)) if err := verifyPods(t.Client, t.Namespace, t.name, false, replicas); err != nil {
return pod Failf("Failed to create %d pods with name %s: %v", replicas, t.name, err)
}
return rcAct
} }
// Create a pod, and record it for cleanup // createRC creates a replication controller and records it for cleanup.
func (t *WebserverTest) CreatePod(pod *api.Pod) (*api.Pod, error) { func (t *WebserverTest) createRC(rc *api.ReplicationController) (*api.ReplicationController, error) {
podClient := t.Client.Pods(t.Namespace) rc, err := t.Client.ReplicationControllers(t.Namespace).Create(rc)
result, err := podClient.Create(pod)
if err == nil { if err == nil {
t.pods[pod.Name] = true t.rcs[rc.Name] = true
} }
return result, err return rc, err
} }
// Create a service, and record it for cleanup // Create a service, and record it for cleanup
@ -1190,14 +1169,23 @@ func (t *WebserverTest) DeleteService(serviceName string) error {
func (t *WebserverTest) Cleanup() []error { func (t *WebserverTest) Cleanup() []error {
var errs []error var errs []error
for rcName := range t.rcs {
for podName := range t.pods { By("stopping RC " + rcName + " in namespace " + t.Namespace)
podClient := t.Client.Pods(t.Namespace) // First, resize the RC to 0.
By("deleting pod " + podName + " in namespace " + t.Namespace) old, err := t.Client.ReplicationControllers(t.Namespace).Get(rcName)
err := podClient.Delete(podName, nil)
if err != nil { if err != nil {
errs = append(errs, err) errs = append(errs, err)
} }
old.Spec.Replicas = 0
if _, err := t.Client.ReplicationControllers(t.Namespace).Update(old); err != nil {
errs = append(errs, err)
}
// TODO(mbforbes): Wait.
// Then, delete the RC altogether.
if err := t.Client.ReplicationControllers(t.Namespace).Delete(rcName); err != nil {
errs = append(errs, err)
}
} }
for serviceName := range t.services { for serviceName := range t.services {

View File

@ -514,6 +514,7 @@ type podResponseChecker struct {
ns string ns string
label labels.Selector label labels.Selector
controllerName string controllerName string
respondName bool // Whether the pod should respond with its own name.
pods *api.PodList pods *api.PodList
} }
@ -535,16 +536,32 @@ func (r podResponseChecker) checkAllResponses() (done bool, err error) {
Do(). Do().
Raw() Raw()
if err != nil { if err != nil {
Logf("Controller %s: Failed to GET from replica %d (%s): %v:", r.controllerName, i+1, pod.Name, err) Logf("Controller %s: Failed to GET from replica %d [%s]: %v:", r.controllerName, i+1, pod.Name, err)
continue continue
} }
// The body should be the pod name. // The response checker expects the pod's name unless !respondName, in
if string(body) != pod.Name { // which case it just checks for a non-empty response.
Logf("Controller %s: Replica %d expected response %s but got %s", r.controllerName, i+1, pod.Name, string(body)) got := string(body)
what := ""
if r.respondName {
what = "expected"
want := pod.Name
if got != want {
Logf("Controller %s: Replica %d [%s] expected response %q but got %q",
r.controllerName, i+1, pod.Name, want, got)
continue continue
} }
} else {
what = "non-empty"
if len(got) == 0 {
Logf("Controller %s: Replica %d [%s] expected non-empty response",
r.controllerName, i+1, pod.Name)
continue
}
}
successes++ successes++
Logf("Controller %s: Got expected result from replica %d: %s, %d of %d required successes so far", r.controllerName, i+1, string(body), successes, len(r.pods.Items)) Logf("Controller %s: Got %s result from replica %d [%s]: %q, %d of %d required successes so far",
r.controllerName, what, i+1, pod.Name, got, successes, len(r.pods.Items))
} }
if successes < len(r.pods.Items) { if successes < len(r.pods.Items) {
return false, nil return false, nil
@ -1171,15 +1188,15 @@ func getSigner(provider string) (ssh.Signer, error) {
} }
// checkPodsRunning returns whether all pods whose names are listed in podNames // checkPodsRunning returns whether all pods whose names are listed in podNames
// are running and ready. // in namespace ns are running and ready, using c and waiting at most timeout.
func checkPodsRunningReady(c *client.Client, podNames []string, timeout time.Duration) bool { func checkPodsRunningReady(c *client.Client, ns string, podNames []string, timeout time.Duration) bool {
np, desc := len(podNames), "running and ready" np, desc := len(podNames), "running and ready"
Logf("Waiting up to %v for the following %d pods to be %s: %s", timeout, np, desc, podNames) Logf("Waiting up to %v for the following %d pods to be %s: %s", timeout, np, desc, podNames)
result := make(chan bool, len(podNames)) result := make(chan bool, len(podNames))
for ix := range podNames { for ix := range podNames {
// Launch off pod readiness checkers. // Launch off pod readiness checkers.
go func(name string) { go func(name string) {
err := waitForPodCondition(c, api.NamespaceDefault, name, desc, timeout, podRunningReady) err := waitForPodCondition(c, ns, name, desc, timeout, podRunningReady)
result <- err == nil result <- err == nil
}(podNames[ix]) }(podNames[ix])
} }