/*
Copyright 2014 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package e2e

import (
	"bytes"
	"fmt"
	"io/ioutil"
	"math"
	"math/rand"
	"os"
	"os/exec"
	"path/filepath"
	"sort"
	"strconv"
	"strings"
	"time"

	"code.google.com/p/go-uuid/uuid"

	"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/client"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/client/clientcmd"
	clientcmdapi "github.com/GoogleCloudPlatform/kubernetes/pkg/client/clientcmd/api"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/cloudprovider"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/fields"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/kubectl"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/labels"
	"github.com/GoogleCloudPlatform/kubernetes/pkg/util"

	"golang.org/x/crypto/ssh"

	. "github.com/onsi/ginkgo"
	. "github.com/onsi/gomega"
)

const (
	// Initial pod start can be delayed O(minutes) by slow docker pulls
	// TODO: Make this 30 seconds once #4566 is resolved.
	podStartTimeout = 5 * time.Minute

	// String used to mark pod deletion
	nonExist = "NonExist"

	// How often to poll pods.
	podPoll = 5 * time.Second
)

type CloudConfig struct {
	ProjectID         string
	Zone              string
	MasterName        string
	NodeInstanceGroup string
	NumNodes          int

	Provider cloudprovider.Interface
}

type TestContextType struct {
	KubeConfig  string
	KubeContext string
	CertDir     string
	Host        string
	RepoRoot    string
	Provider    string
	CloudConfig CloudConfig
	KubectlPath string
}

var testContext TestContextType

type ContainerFailures struct {
	status   *api.ContainerStateTerminated
	restarts int
}

func Logf(format string, a ...interface{}) {
	fmt.Fprintf(GinkgoWriter, "INFO: "+format+"\n", a...)
}

func Failf(format string, a ...interface{}) {
	Fail(fmt.Sprintf(format, a...), 1)
}

// providerIs returns true if the test provider matches (case-insensitively)
// any of the given provider names.
func providerIs(providers ...string) bool {
	if testContext.Provider == "" {
		Fail("testContext.Provider is not defined")
	}
	for _, provider := range providers {
		if strings.EqualFold(provider, testContext.Provider) {
			return true
		}
	}
	return false
}

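// Usage sketch (hypothetical test body): a provider-specific test can guard
// itself like this. The skip message is illustrative, not a real test here.
//
//	if !providerIs("gce", "gke") {
//		By(fmt.Sprintf("Skipping test unsupported on provider %s", testContext.Provider))
//		return
//	}
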
type podCondition func(pod *api.Pod) (bool, error)

// podReady returns whether pod has a condition of Ready with a status of true.
func podReady(pod *api.Pod) bool {
	for _, cond := range pod.Status.Conditions {
		if cond.Type == api.PodReady && cond.Status == api.ConditionTrue {
			return true
		}
	}
	return false
}

// logPodStates logs basic info of provided pods for debugging.
func logPodStates(pods []api.Pod) {
	// Find maximum widths for pod, node, and phase strings for column printing.
	maxPodW, maxNodeW, maxPhaseW := len("POD"), len("NODE"), len("PHASE")
	for _, pod := range pods {
		if len(pod.ObjectMeta.Name) > maxPodW {
			maxPodW = len(pod.ObjectMeta.Name)
		}
		if len(pod.Spec.NodeName) > maxNodeW {
			maxNodeW = len(pod.Spec.NodeName)
		}
		if len(pod.Status.Phase) > maxPhaseW {
			maxPhaseW = len(pod.Status.Phase)
		}
	}
	// Increase widths by one to separate by a single space.
	maxPodW++
	maxNodeW++
	maxPhaseW++
	// Log pod info. * does space padding, - makes them left-aligned.
	Logf("%-[1]*[2]s %-[3]*[4]s %-[5]*[6]s %[7]s",
		maxPodW, "POD", maxNodeW, "NODE", maxPhaseW, "PHASE", "CONDITIONS")
	for _, pod := range pods {
		Logf("%-[1]*[2]s %-[3]*[4]s %-[5]*[6]s %[7]s",
			maxPodW, pod.ObjectMeta.Name, maxNodeW, pod.Spec.NodeName, maxPhaseW, pod.Status.Phase, pod.Status.Conditions)
	}
	Logf("") // Final empty line helps for readability.
}

// podRunningReady checks whether pod p's phase is running and it has a ready
// condition of status true.
func podRunningReady(p *api.Pod) (bool, error) {
	// Check the phase is running.
	if p.Status.Phase != api.PodRunning {
		return false, fmt.Errorf("want pod '%s' on '%s' to be '%v' but was '%v'",
			p.ObjectMeta.Name, p.Spec.NodeName, api.PodRunning, p.Status.Phase)
	}
	// Check the ready condition is true.
	if !podReady(p) {
		return false, fmt.Errorf("pod '%s' on '%s' didn't have condition {%v %v}; conditions: %v",
			p.ObjectMeta.Name, p.Spec.NodeName, api.PodReady, api.ConditionTrue, p.Status.Conditions)
	}
	return true, nil
}

// waitForPodsRunningReady waits up to timeout to ensure that all pods in
// namespace ns are running and ready, requiring that it finds at least minPods.
// It has separate behavior from other 'wait for' pods functions in that it re-
// queries the list of pods on every iteration. This is useful, for example, in
// cluster startup, because the number of pods increases while waiting.
func waitForPodsRunningReady(ns string, minPods int, timeout time.Duration) error {
	c, err := loadClient()
	if err != nil {
		return err
	}
	Logf("Waiting up to %v for all pods (need at least %d) in namespace '%s' to be running and ready",
		timeout, minPods, ns)
	for start := time.Now(); time.Since(start) < timeout; time.Sleep(podPoll) {
		// We get the new list of pods in every iteration because more pods come
		// online during startup and we want to ensure they are also checked.
		podList, err := c.Pods(ns).List(labels.Everything(), fields.Everything())
		if err != nil {
			Logf("Error getting pods in namespace '%s': %v", ns, err)
			continue
		}
		nOk, badPods := 0, []api.Pod{}
		for _, pod := range podList.Items {
			if res, err := podRunningReady(&pod); res && err == nil {
				nOk++
			} else {
				badPods = append(badPods, pod)
			}
		}
		Logf("%d / %d pods in namespace '%s' are running and ready (%d seconds elapsed)",
			nOk, len(podList.Items), ns, int(time.Since(start).Seconds()))
		if nOk == len(podList.Items) && nOk >= minPods {
			return nil
		}
		logPodStates(badPods)
	}
	return fmt.Errorf("Not all pods in namespace '%s' running and ready within %v", ns, timeout)
}

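// Usage sketch (hypothetical): a cluster-startup check might wait for system
// pods to settle before running tests. The namespace and counts below are
// illustrative assumptions, not values used elsewhere in this file.
//
//	if err := waitForPodsRunningReady(api.NamespaceDefault, 1, podStartTimeout); err != nil {
//		Failf("Pods never reached running/ready: %v", err)
//	}
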
// waitForPodCondition waits for pod podName in namespace ns to satisfy the
// given condition, polling every poll interval, for up to timeout.
func waitForPodCondition(c *client.Client, ns, podName, desc string, poll, timeout time.Duration, condition podCondition) error {
	Logf("Waiting up to %v for pod %s status to be %s", timeout, podName, desc)
	for start := time.Now(); time.Since(start) < timeout; time.Sleep(poll) {
		pod, err := c.Pods(ns).Get(podName)
		if err != nil {
			Logf("Get pod %s in ns %s failed, ignoring for %v: %v", podName, ns, poll, err)
			continue
		}
		done, err := condition(pod)
		if done {
			return err
		}
		Logf("Waiting for pod '%s' in namespace '%s' status to be '%q' (found phase: '%q', readiness: %t) (%v)",
			podName, ns, desc, pod.Status.Phase, podReady(pod), time.Since(start))
	}
	return fmt.Errorf("gave up waiting for pod '%s' to be '%s' after %v", podName, desc, timeout)
}

// createTestingNS should be used by every test; note that we append a common prefix to the provided test name.
func createTestingNS(baseName string, c *client.Client) (*api.Namespace, error) {
	namespaceObj := &api.Namespace{
		ObjectMeta: api.ObjectMeta{
			Name:      fmt.Sprintf("e2e-tests-%v-%v", baseName, uuid.New()),
			Namespace: "",
		},
		Status: api.NamespaceStatus{},
	}
	_, err := c.Namespaces().Create(namespaceObj)
	return namespaceObj, err
}

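// Usage sketch (hypothetical test setup): create a namespace up front and tear
// it down when the test finishes. The "mytest" base name is illustrative, and
// the deferred delete assumes the client's namespace Delete call.
//
//	ns, err := createTestingNS("mytest", c)
//	expectNoError(err)
//	defer c.Namespaces().Delete(ns.Name)
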
func waitForPodRunningInNamespace(c *client.Client, podName string, namespace string) error {
	return waitForPodCondition(c, namespace, podName, "running", podPoll, podStartTimeout, func(pod *api.Pod) (bool, error) {
		if pod.Status.Phase == api.PodRunning {
			return true, nil
		}
		if pod.Status.Phase == api.PodFailed {
			return true, fmt.Errorf("Giving up; pod went into failed status: \n%#v", pod.Status)
		}
		return false, nil
	})
}

func waitForPodRunning(c *client.Client, podName string) error {
	return waitForPodRunningInNamespace(c, podName, api.NamespaceDefault)
}

// waitForPodNotPending returns an error if it took too long for the pod to go out of pending state.
func waitForPodNotPending(c *client.Client, ns, podName string) error {
	return waitForPodCondition(c, ns, podName, "!pending", podPoll, podStartTimeout, func(pod *api.Pod) (bool, error) {
		if pod.Status.Phase != api.PodPending {
			Logf("Saw pod '%s' in namespace '%s' out of pending state (found '%q')", podName, ns, pod.Status.Phase)
			return true, nil
		}
		return false, nil
	})
}

// waitForPodSuccessInNamespace returns nil if the pod reached state success, or an error if it reached failure or ran too long.
func waitForPodSuccessInNamespace(c *client.Client, podName string, contName string, namespace string) error {
	return waitForPodCondition(c, namespace, podName, "success or failure", podPoll, podStartTimeout, func(pod *api.Pod) (bool, error) {
		// Cannot use pod.Status.Phase == api.PodSucceeded/api.PodFailed due to #2632
		ci, ok := api.GetContainerStatus(pod.Status.ContainerStatuses, contName)
		if !ok {
			Logf("No Status.Info for container '%s' in pod '%s' yet", contName, podName)
		} else {
			if ci.State.Termination != nil {
				if ci.State.Termination.ExitCode == 0 {
					By("Saw pod success")
					return true, nil
				}
				return true, fmt.Errorf("pod '%s' terminated with failure: %+v", podName, ci.State.Termination)
			}
			Logf("Nil State.Termination for container '%s' in pod '%s' in namespace '%s' so far", contName, podName, namespace)
		}
		return false, nil
	})
}

// waitForPodSuccess returns nil if the pod reached state success, or an error if it reached failure or ran too long.
// The default namespace is used to identify pods.
func waitForPodSuccess(c *client.Client, podName string, contName string) error {
	return waitForPodSuccessInNamespace(c, podName, contName, api.NamespaceDefault)
}

// podResponseChecker checks pod responses by issuing GETs to the pods and
// verifying that each replies with its own pod name.
type podResponseChecker struct {
	c              *client.Client
	ns             string
	label          labels.Selector
	controllerName string
	pods           *api.PodList
}

// checkAllResponses issues GETs to all pods in the context and verifies they reply with their own pod name.
func (r podResponseChecker) checkAllResponses() (done bool, err error) {
	successes := 0
	currentPods, err := r.c.Pods(r.ns).List(r.label, fields.Everything())
	Expect(err).NotTo(HaveOccurred())
	for i, pod := range r.pods.Items {
		// Check that the replica list remains unchanged, otherwise we have problems.
		if !isElementOf(pod.UID, currentPods) {
			return false, fmt.Errorf("pod with UID %s is no longer a member of the replica set. Must have been restarted for some reason. Current replica set: %v", pod.UID, currentPods)
		}
		body, err := r.c.Get().
			Prefix("proxy").
			Namespace(r.ns).
			Resource("pods").
			Name(string(pod.Name)).
			Do().
			Raw()
		if err != nil {
			Logf("Controller %s: Failed to GET from replica %d (%s): %v:", r.controllerName, i+1, pod.Name, err)
			continue
		}
		// The body should be the pod name.
		if string(body) != pod.Name {
			Logf("Controller %s: Replica %d expected response %s but got %s", r.controllerName, i+1, pod.Name, string(body))
			continue
		}
		successes++
		Logf("Controller %s: Got expected result from replica %d: %s, %d of %d required successes so far", r.controllerName, i+1, string(body), successes, len(r.pods.Items))
	}
	if successes < len(r.pods.Items) {
		return false, nil
	}
	return true, nil
}

func loadConfig() (*client.Config, error) {
	switch {
	case testContext.KubeConfig != "":
		fmt.Printf(">>> testContext.KubeConfig: %s\n", testContext.KubeConfig)
		c, err := clientcmd.LoadFromFile(testContext.KubeConfig)
		if err != nil {
			return nil, fmt.Errorf("error loading KubeConfig: %v", err.Error())
		}
		if testContext.KubeContext != "" {
			fmt.Printf(">>> testContext.KubeContext: %s\n", testContext.KubeContext)
			c.CurrentContext = testContext.KubeContext
		}
		return clientcmd.NewDefaultClientConfig(*c, &clientcmd.ConfigOverrides{ClusterInfo: clientcmdapi.Cluster{Server: testContext.Host}}).ClientConfig()
	default:
		return nil, fmt.Errorf("KubeConfig must be specified to load client config")
	}
}

func loadClient() (*client.Client, error) {
	config, err := loadConfig()
	if err != nil {
		return nil, fmt.Errorf("error loading config: %v", err.Error())
	}
	c, err := client.New(config)
	if err != nil {
		return nil, fmt.Errorf("error creating client: %v", err.Error())
	}
	return c, nil
}

// randomSuffix provides a random string to append to pods, services, and rcs.
// TODO: Allow service names to have the same form as names
//       for pods and replication controllers so we don't
//       need to use such a function and can instead
//       use the UUID utility function.
func randomSuffix() string {
	r := rand.New(rand.NewSource(time.Now().UnixNano()))
	return strconv.Itoa(r.Int() % 10000)
}

func expectNoError(err error, explain ...interface{}) {
	ExpectWithOffset(1, err).NotTo(HaveOccurred(), explain...)
}

// cleanup stops everything from filePath in namespace ns and checks that
// everything matching the given selectors in that namespace has been correctly stopped.
func cleanup(filePath string, ns string, selectors ...string) {
	By("using stop to clean up resources")
	var nsArg string
	if ns != "" {
		nsArg = fmt.Sprintf("--namespace=%s", ns)
	}
	runKubectl("stop", "-f", filePath, nsArg)
	for _, selector := range selectors {
		resources := runKubectl("get", "pods,rc,se", "-l", selector, "--no-headers", nsArg)
		if resources != "" {
			Failf("Resources left running after stop:\n%s", resources)
		}
	}
}

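// Usage sketch (hypothetical): tear down everything created from a spec file
// and verify nothing matching its label survived. The file path and selector
// below are illustrative.
//
//	cleanup("examples/guestbook/redis-master-controller.yaml", ns, "name=redis-master")
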
// validatorFn is the function which individual tests will implement.
// we may want it to return more than just an error, at some point.
type validatorFn func(c *client.Client, podID string) error

// validateController is a generic mechanism for testing RC's that are running.
// It takes a container name, a test name, and a validator function which is plugged in by a specific test.
// "containername": this is grepped for.
// "containerImage": this is the name of the image we expect to be launched. Not to be confused with the images (kitten.jpg) which are validated.
// "testname": which gets bubbled up to the logging/failure messages if errors happen.
// "validator" function: This function is given a podID and a client, and it can do some specific validations that way.
func validateController(c *client.Client, containerImage string, replicas int, containername string, testname string, validator validatorFn, ns string) {
	getPodsTemplate := "--template={{range.items}}{{.metadata.name}} {{end}}"
	// NB: kubectl adds the "exists" function to the standard template functions.
	// This lets us check to see if the "running" entry exists for each of the containers
	// we care about. Exists will never return an error and it's safe to check a chain of
	// things, any one of which may not exist. In the below template, all of info,
	// containername, and running might be nil, so the normal index function isn't very
	// helpful.
	// This template is unit-tested in kubectl, so if you change it, update the unit test.
	// You can read about the syntax here: http://golang.org/pkg/text/template/.
	getContainerStateTemplate := fmt.Sprintf(`--template={{if (exists . "status" "containerStatuses")}}{{range .status.containerStatuses}}{{if (and (eq .name "%s") (exists . "state" "running"))}}true{{end}}{{end}}{{end}}`, containername)

	getImageTemplate := fmt.Sprintf(`--template={{if (exists . "status" "containerStatuses")}}{{range .status.containerStatuses}}{{if eq .name "%s"}}{{.image}}{{end}}{{end}}{{end}}`, containername)

	By(fmt.Sprintf("waiting for all containers in %s pods to come up.", testname)) //testname should be selector
	for start := time.Now(); time.Since(start) < podStartTimeout; time.Sleep(5 * time.Second) {
		getPodsOutput := runKubectl("get", "pods", "-o", "template", getPodsTemplate, "--api-version=v1beta3", "-l", testname, fmt.Sprintf("--namespace=%v", ns))
		pods := strings.Fields(getPodsOutput)
		if numPods := len(pods); numPods != replicas {
			By(fmt.Sprintf("Replicas for %s: expected=%d actual=%d", testname, replicas, numPods))
			continue
		}
		var runningPods []string
		for _, podID := range pods {
			running := runKubectl("get", "pods", podID, "-o", "template", getContainerStateTemplate, "--api-version=v1beta3", fmt.Sprintf("--namespace=%v", ns))
			if running != "true" {
				Logf("%s is created but not running", podID)
				continue
			}
			currentImage := runKubectl("get", "pods", podID, "-o", "template", getImageTemplate, "--api-version=v1beta3", fmt.Sprintf("--namespace=%v", ns))
			if currentImage != containerImage {
				Logf("%s is created but running wrong image; expected: %s, actual: %s", podID, containerImage, currentImage)
				continue
			}
			// Call the generic validator function here.
			// This might validate for example, that (1) getting a url works and (2) url is serving correct content.
			if err := validator(c, podID); err != nil {
				Logf("%s is running right image but validator function failed: %v", podID, err)
				continue
			}
			Logf("%s is verified up and running", podID)
			runningPods = append(runningPods, podID)
		}
		// If we reach here, then all our checks passed.
		if len(runningPods) == replicas {
			return
		}
	}
	// Reaching here means that one or more checks failed multiple times. Assuming it's not a race condition, something is broken.
	Failf("Timed out after %v seconds waiting for %s pods to reach valid state", podStartTimeout.Seconds(), testname)
}

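// Usage sketch (hypothetical): validate that an RC's pods run the expected
// image and pass a custom check. The image, names, selector, and no-op
// validator below are illustrative only.
//
//	validateController(c, "kubernetes/serve_hostname:1.1", 2, "serve-hostname", "name=serve-hostname",
//		func(c *client.Client, podID string) error { return nil }, ns)
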
// kubectlCmd runs the kubectl executable through the wrapper script.
func kubectlCmd(args ...string) *exec.Cmd {
	defaultArgs := []string{}

	// Reference a --server option so tests can run anywhere.
	if testContext.Host != "" {
		defaultArgs = append(defaultArgs, "--"+clientcmd.FlagAPIServer+"="+testContext.Host)
	}
	if testContext.KubeConfig != "" {
		defaultArgs = append(defaultArgs, "--"+clientcmd.RecommendedConfigPathFlag+"="+testContext.KubeConfig)

		// Reference the KubeContext
		if testContext.KubeContext != "" {
			defaultArgs = append(defaultArgs, "--"+clientcmd.FlagContext+"="+testContext.KubeContext)
		}
	} else {
		if testContext.CertDir != "" {
			defaultArgs = append(defaultArgs,
				fmt.Sprintf("--certificate-authority=%s", filepath.Join(testContext.CertDir, "ca.crt")),
				fmt.Sprintf("--client-certificate=%s", filepath.Join(testContext.CertDir, "kubecfg.crt")),
				fmt.Sprintf("--client-key=%s", filepath.Join(testContext.CertDir, "kubecfg.key")))
		}
	}
	kubectlArgs := append(defaultArgs, args...)

	// We allow users to specify the path to kubectl, so you can test either
	// "kubectl" or "cluster/kubectl.sh" and so on.
	cmd := exec.Command(testContext.KubectlPath, kubectlArgs...)

	// Caller will invoke this and wait on it.
	return cmd
}

func runKubectl(args ...string) string {
	var stdout, stderr bytes.Buffer
	cmd := kubectlCmd(args...)
	cmd.Stdout, cmd.Stderr = &stdout, &stderr
	Logf("Running '%s %s'", cmd.Path, strings.Join(cmd.Args, " "))
	if err := cmd.Run(); err != nil {
		Failf("Error running %v:\nCommand stdout:\n%v\nstderr:\n%v\n", cmd, cmd.Stdout, cmd.Stderr)
		return ""
	}
	Logf(stdout.String())
	// TODO: trimspace should be unnecessary after switching to use kubectl binary directly
	return strings.TrimSpace(stdout.String())
}

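// Usage sketch (hypothetical): runKubectl takes the same arguments you would
// pass to kubectl on the command line; the pod name below is illustrative.
//
//	output := runKubectl("get", "pods", "my-pod", fmt.Sprintf("--namespace=%v", ns))
//	Logf("kubectl output: %s", output)
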
// testContainerOutput runs testContainerOutputInNamespace with the default namespace.
func testContainerOutput(scenarioName string, c *client.Client, pod *api.Pod, expectedOutput []string) {
	testContainerOutputInNamespace(scenarioName, c, pod, expectedOutput, api.NamespaceDefault)
}

// testContainerOutputInNamespace runs the given pod in the given namespace and waits
// for the first container in the podSpec to move into the 'Success' status. It retrieves
// the container log and searches for lines of expected output.
func testContainerOutputInNamespace(scenarioName string, c *client.Client, pod *api.Pod, expectedOutput []string, ns string) {
	By(fmt.Sprintf("Creating a pod to test %v", scenarioName))
	defer c.Pods(ns).Delete(pod.Name, nil)
	if _, err := c.Pods(ns).Create(pod); err != nil {
		Failf("Failed to create pod: %v", err)
	}
	containerName := pod.Spec.Containers[0].Name
	// Wait for client pod to complete.
	expectNoError(waitForPodSuccessInNamespace(c, pod.Name, containerName, ns))
	// Grab its logs. Get host first.
	podStatus, err := c.Pods(ns).Get(pod.Name)
	if err != nil {
		Failf("Failed to get pod status: %v", err)
	}
	By(fmt.Sprintf("Trying to get logs from node %s pod %s container %s: %v",
		podStatus.Spec.NodeName, podStatus.Name, containerName, err))
	var logs []byte
	start := time.Now()
	// Sometimes the actual containers take a second to get started, try to get logs for 60s
	for time.Now().Sub(start) < (60 * time.Second) {
		logs, err = c.Get().
			Prefix("proxy").
			Resource("nodes").
			Name(podStatus.Spec.NodeName).
			Suffix("containerLogs", ns, podStatus.Name, containerName).
			Do().
			Raw()
		By(fmt.Sprintf("pod logs:%v\n", string(logs)))
		if strings.Contains(string(logs), "Internal Error") {
			By(fmt.Sprintf("Failed to get logs from node %q pod %q container %q: %v",
				podStatus.Spec.NodeName, podStatus.Name, containerName, string(logs)))
			time.Sleep(5 * time.Second)
			continue
		}
		break
	}
	for _, m := range expectedOutput {
		Expect(string(logs)).To(ContainSubstring(m), "%q in container output", m)
	}
}

// podInfo contains pod information useful for debugging e2e tests.
type podInfo struct {
	oldHostname string
	oldPhase    string
	hostname    string
	phase       string
}

// PodDiff is a map of pod name to podInfos
type PodDiff map[string]*podInfo

// Print formats and prints the given PodDiff.
func (p PodDiff) Print(ignorePhases util.StringSet) {
	for name, info := range p {
		if ignorePhases.Has(info.phase) {
			continue
		}
		if info.phase == nonExist {
			Logf("Pod %v was deleted, had phase %v and host %v", name, info.phase, info.hostname)
			continue
		}
		phaseChange, hostChange := false, false
		msg := fmt.Sprintf("Pod %v ", name)
		if info.oldPhase != info.phase {
			phaseChange = true
			if info.oldPhase == nonExist {
				msg += fmt.Sprintf("in phase %v ", info.phase)
			} else {
				msg += fmt.Sprintf("went from phase: %v -> %v ", info.oldPhase, info.phase)
			}
		}
		if info.oldHostname != info.hostname {
			hostChange = true
			if info.oldHostname == nonExist || info.oldHostname == "" {
				msg += fmt.Sprintf("assigned host %v ", info.hostname)
			} else {
				msg += fmt.Sprintf("went from host: %v -> %v ", info.oldHostname, info.hostname)
			}
		}
		if phaseChange || hostChange {
			Logf(msg)
		}
	}
}

// Diff computes a PodDiff given 2 lists of pods.
func Diff(oldPods *api.PodList, curPods *api.PodList) PodDiff {
	podInfoMap := PodDiff{}

	// New pods will show up in the curPods list but not in oldPods. They have oldhostname/phase == nonexist.
	for _, pod := range curPods.Items {
		podInfoMap[pod.Name] = &podInfo{hostname: pod.Spec.NodeName, phase: string(pod.Status.Phase), oldHostname: nonExist, oldPhase: nonExist}
	}

	// Deleted pods will show up in the oldPods list but not in curPods. They have a hostname/phase == nonexist.
	for _, pod := range oldPods.Items {
		if info, ok := podInfoMap[pod.Name]; ok {
			info.oldHostname, info.oldPhase = pod.Spec.NodeName, string(pod.Status.Phase)
		} else {
			podInfoMap[pod.Name] = &podInfo{hostname: nonExist, phase: nonExist, oldHostname: pod.Spec.NodeName, oldPhase: string(pod.Status.Phase)}
		}
	}
	return podInfoMap
}

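// Usage sketch: log which pods changed between two list calls, ignoring pods
// that are still Running (this mirrors how RunRC below uses Diff).
//
//	Diff(oldPods, currentPods).Print(util.NewStringSet(string(api.PodRunning)))
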
// RunRC launches (and verifies the correctness of) a replication controller.
// It waits for all of the pods it spawns to become "Running".
// It's the caller's responsibility to clean up externally (i.e. use the
// namespace lifecycle for handling cleanup).
func RunRC(c *client.Client, name string, ns, image string, replicas int) error {
	var last int
	maxContainerFailures := int(math.Max(1.0, float64(replicas)*.01))
	current := 0
	same := 0

	By(fmt.Sprintf("Creating replication controller %s", name))
	rc := &api.ReplicationController{
		ObjectMeta: api.ObjectMeta{
			Name: name,
		},
		Spec: api.ReplicationControllerSpec{
			Replicas: replicas,
			Selector: map[string]string{
				"name": name,
			},
			Template: &api.PodTemplateSpec{
				ObjectMeta: api.ObjectMeta{
					Labels: map[string]string{"name": name},
				},
				Spec: api.PodSpec{
					Containers: []api.Container{
						{
							Name:  name,
							Image: image,
							Ports: []api.ContainerPort{{ContainerPort: 80}},
						},
					},
				},
			},
		},
	}
	_, err := c.ReplicationControllers(ns).Create(rc)
	if err != nil {
		return fmt.Errorf("Error creating replication controller: %v", err)
	}
	Logf("Created replication controller with name: %v, namespace: %v, replica count: %v", rc.Name, ns, rc.Spec.Replicas)

	By(fmt.Sprintf("Making sure all %d replicas of rc %s in namespace %s exist", replicas, name, ns))
	label := labels.SelectorFromSet(labels.Set(map[string]string{"name": name}))
	pods, err := listPods(c, ns, label, fields.Everything())
	if err != nil {
		return fmt.Errorf("Error listing pods: %v", err)
	}
	current = len(pods.Items)
	failCount := 5
	for same < failCount && current < replicas {
		Logf("Controller %s: Found %d pods out of %d", name, current, replicas)
		if last < current {
			same = 0
		} else if last == current {
			same++
		} else if current < last {
			return fmt.Errorf("Controller %s: Number of submitted pods dropped from %d to %d", name, last, current)
		}

		if same >= failCount {
			return fmt.Errorf("Controller %s: No pods submitted for the last %d checks", name, failCount)
		}

		last = current
		time.Sleep(5 * time.Second)
		pods, err = listPods(c, ns, label, fields.Everything())
		if err != nil {
			return fmt.Errorf("Error listing pods: %v", err)
		}
		current = len(pods.Items)
	}
	if current != replicas {
		return fmt.Errorf("Controller %s: Only found %d replicas out of %d", name, current, replicas)
	}
	Logf("Controller %s in ns %s: Found %d pods out of %d", name, ns, current, replicas)

	By(fmt.Sprintf("Waiting for all %d replicas to be running with a max container failures of %d", replicas, maxContainerFailures))
	same = 0
	last = 0
	failCount = 10
	current = 0
	oldPods := &api.PodList{}
	for same < failCount && current < replicas {
		current = 0
		waiting := 0
		pending := 0
		unknown := 0
		inactive := 0
		failedContainers := 0
		time.Sleep(10 * time.Second)

		// TODO: Use a reflector both to put less strain on the cluster and
		// for more clarity.
		currentPods, err := listPods(c, ns, label, fields.Everything())
		if err != nil {
			return fmt.Errorf("Error listing pods: %v", err)
		}
		for _, p := range currentPods.Items {
			if p.Status.Phase == api.PodRunning {
				current++
				for _, v := range FailedContainers(p) {
					failedContainers = failedContainers + v.restarts
				}
			} else if p.Status.Phase == api.PodPending {
				if p.Spec.NodeName == "" {
					waiting++
				} else {
					pending++
				}
			} else if p.Status.Phase == api.PodSucceeded || p.Status.Phase == api.PodFailed {
				inactive++
			} else if p.Status.Phase == api.PodUnknown {
				unknown++
			}
		}
		Logf("Pod States: %d running, %d pending, %d waiting, %d inactive, %d unknown ", current, pending, waiting, inactive, unknown)

		if len(currentPods.Items) != len(pods.Items) {
			// This failure mode includes:
			// kubelet is dead, so node controller deleted pods and rc creates more
			//	- diagnose by noting the pod diff below.
			// pod is unhealthy, so replication controller creates another to take its place
			//	- diagnose by comparing the previous "2 Pod states" lines for inactive pods
			errorStr := fmt.Sprintf("Number of reported pods changed: %d vs %d", len(currentPods.Items), len(pods.Items))
			Logf("%v, pods that changed since the last iteration:", errorStr)
			Diff(oldPods, currentPods).Print(util.NewStringSet())
			return fmt.Errorf(errorStr)
		}
		if last < current {
			same = 0
		} else if last == current {
			same++
		} else if current < last {
			// The pod failed or succeeded, or was somehow pushed out of running by the kubelet.
			errorStr := fmt.Sprintf("Number of running pods dropped from %d to %d", last, current)
			Logf("%v, pods that changed since the last iteration:", errorStr)
			Diff(oldPods, currentPods).Print(util.NewStringSet())
			return fmt.Errorf(errorStr)
		}
		if same >= failCount {
			// Most times this happens because a few nodes have kubelet problems, and their pods are
			// stuck in pending.
			errorStr := fmt.Sprintf("No pods started for the last %d checks", failCount)
			Logf("%v, pods currently in pending:", errorStr)
			Diff(currentPods, &api.PodList{}).Print(util.NewStringSet(string(api.PodRunning)))
			return fmt.Errorf(errorStr)
		}
		last = current
		oldPods = currentPods

		if failedContainers > maxContainerFailures {
			return fmt.Errorf("%d containers failed which is more than allowed %d", failedContainers, maxContainerFailures)
		}
	}
	if current != replicas {
		return fmt.Errorf("Only %d pods started out of %d", current, replicas)
	}
	return nil
}

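// Usage sketch (hypothetical): start an RC, scale it, then tear it down. The
// name, image, and replica counts below are illustrative.
//
//	expectNoError(RunRC(c, "my-hostname", ns, "kubernetes/serve_hostname:1.1", 5))
//	expectNoError(ScaleRC(c, ns, "my-hostname", 10))
//	expectNoError(DeleteRC(c, ns, "my-hostname"))
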
func ScaleRC(c *client.Client, ns, name string, size uint) error {
	By(fmt.Sprintf("Scaling replication controller %s in namespace %s to %d", name, ns, size))
	scaler, err := kubectl.ScalerFor("ReplicationController", kubectl.NewScalerClient(c))
	if err != nil {
		return err
	}
	waitForReplicas := kubectl.NewRetryParams(5*time.Second, 5*time.Minute)
	if err = scaler.Scale(ns, name, size, nil, nil, waitForReplicas); err != nil {
		return err
	}
	return waitForRCPodsRunning(c, ns, name)
}

// Wait up to 10 minutes for pods to become Running.
func waitForRCPodsRunning(c *client.Client, ns, rcName string) error {
	running := false
	label := labels.SelectorFromSet(labels.Set(map[string]string{"name": rcName}))
	for start := time.Now(); time.Since(start) < 10*time.Minute; time.Sleep(5 * time.Second) {
		pods, err := listPods(c, ns, label, fields.Everything())
		if err != nil {
			Logf("Error listing pods: %v", err)
			continue
		}
		// Keep polling until every pod in the list is running.
		allRunning := true
		for _, p := range pods.Items {
			if p.Status.Phase != api.PodRunning {
				allRunning = false
				break
			}
		}
		if !allRunning {
			continue
		}
		running = true
		break
	}
	if !running {
		return fmt.Errorf("Timeout while waiting for replication controller %s pods to be running", rcName)
	}
	return nil
}

// Delete a Replication Controller and all pods it spawned
func DeleteRC(c *client.Client, ns, name string) error {
	By(fmt.Sprintf("Deleting replication controller %s in namespace %s", name, ns))
	reaper, err := kubectl.ReaperForReplicationController(c, 10*time.Minute)
	if err != nil {
		return err
	}
	startTime := time.Now()
	_, err = reaper.Stop(ns, name, api.NewDeleteOptions(0))
	deleteRCTime := time.Now().Sub(startTime)
	Logf("Deleting RC took: %v", deleteRCTime)
	return err
}

// Convenient wrapper around listing pods supporting retries.
func listPods(c *client.Client, namespace string, label labels.Selector, field fields.Selector) (*api.PodList, error) {
	maxRetries := 4
	pods, err := c.Pods(namespace).List(label, field)
	for i := 0; i < maxRetries; i++ {
		if err == nil {
			return pods, nil
		}
		pods, err = c.Pods(namespace).List(label, field)
	}
	return pods, err
}

// FailedContainers inspects all containers in a pod and returns failure
// information for containers that have failed or been restarted.
// A map is returned where the key is the containerID and the value is a
// struct containing the restart and failure information.
func FailedContainers(pod api.Pod) map[string]ContainerFailures {
	var state ContainerFailures
	states := make(map[string]ContainerFailures)

	statuses := pod.Status.ContainerStatuses
	if len(statuses) == 0 {
		return nil
	}
	for _, status := range statuses {
		if status.State.Termination != nil {
			states[status.ContainerID] = ContainerFailures{status: status.State.Termination}
		} else if status.LastTerminationState.Termination != nil {
			states[status.ContainerID] = ContainerFailures{status: status.LastTerminationState.Termination}
		}
		if status.RestartCount > 0 {
			var ok bool
			if state, ok = states[status.ContainerID]; !ok {
				state = ContainerFailures{}
			}
			state.restarts = status.RestartCount
			states[status.ContainerID] = state
		}
	}

	return states
}

// Prints the histogram of the events and returns the number of bad events.
func BadEvents(events []*api.Event) int {
	type histogramKey struct {
		reason string
		source string
	}
	histogram := make(map[histogramKey]int)
	for _, e := range events {
		histogram[histogramKey{reason: e.Reason, source: e.Source.Component}]++
	}
	for key, number := range histogram {
		Logf("- reason: %s, source: %s -> %d", key.reason, key.source, number)
	}
	badPatterns := []string{"kill", "fail"}
	badEvents := 0
	for key, number := range histogram {
		for _, s := range badPatterns {
			if strings.Contains(key.reason, s) {
				Logf("WARNING %d events from %s with reason: %s", number, key.source, key.reason)
				badEvents += number
				break
			}
		}
	}
	return badEvents
}

// NodeSSHHosts returns SSH-able host names for all nodes. It returns an error
// if it can't find an external IP for every node, though it still returns all
// hosts that it found in that case.
func NodeSSHHosts(c *client.Client) ([]string, error) {
	var hosts []string
	nodelist, err := c.Nodes().List(labels.Everything(), fields.Everything())
	if err != nil {
		return hosts, fmt.Errorf("error getting nodes: %v", err)
	}
	for _, n := range nodelist.Items {
		for _, addr := range n.Status.Addresses {
			// Use the first external IP address we find on the node, and
			// use at most one per node.
			// TODO(mbforbes): Use the "preferred" address for the node, once
			// such a thing is defined (#2462).
			if addr.Type == api.NodeExternalIP {
				hosts = append(hosts, addr.Address+":22")
				break
			}
		}
	}
	// Error if any node didn't have an external IP.
	if len(hosts) != len(nodelist.Items) {
		return hosts, fmt.Errorf(
			"only found %d external IPs on nodes, but found %d nodes. Nodelist: %v",
			len(hosts), len(nodelist.Items), nodelist)
	}
	return hosts, nil
}

// SSH synchronously SSHs to a node running on provider and runs cmd. If there
// is no error performing the SSH, the stdout, stderr, and exit code are
// returned.
func SSH(cmd, host, provider string) (string, string, int, error) {
	// Get a signer for the provider.
	signer, err := getSigner(provider)
	if err != nil {
		return "", "", 0, fmt.Errorf("error getting signer for provider %s: '%v'", provider, err)
	}

	// Setup the config, dial the server, and open a session.
	config := &ssh.ClientConfig{
		User: os.Getenv("USER"),
		Auth: []ssh.AuthMethod{ssh.PublicKeys(signer)},
	}
	client, err := ssh.Dial("tcp", host, config)
	if err != nil {
		return "", "", 0, fmt.Errorf("error getting SSH client to host %s: '%v'", host, err)
	}
	session, err := client.NewSession()
	if err != nil {
		return "", "", 0, fmt.Errorf("error creating session to host %s: '%v'", host, err)
	}
	defer session.Close()

	// Run the command.
	code := 0
	var bout, berr bytes.Buffer
	session.Stdout, session.Stderr = &bout, &berr
	if err = session.Run(cmd); err != nil {
		// Check whether the command failed to run or didn't complete.
		if exiterr, ok := err.(*ssh.ExitError); ok {
			// If we got an ExitError and the exit code is nonzero, we'll
			// consider the SSH itself successful (just that the command run
			// errored on the host).
			if code = exiterr.ExitStatus(); code != 0 {
				err = nil
			}
		} else {
			// Some other kind of error happened (e.g. an IOError); consider the
			// SSH unsuccessful.
			err = fmt.Errorf("failed running `%s` on %s: '%v'", cmd, host, err)
		}
	}
	return bout.String(), berr.String(), code, err
}

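// Usage sketch (hypothetical): run a command on the first SSH-able node and
// check both the SSH transport error and the remote exit code. The command is
// illustrative.
//
//	hosts, err := NodeSSHHosts(c)
//	expectNoError(err)
//	stdout, stderr, code, err := SSH("uptime", hosts[0], testContext.Provider)
//	if err != nil || code != 0 {
//		Failf("SSH failed: err=%v, code=%d, stderr=%q", err, code, stderr)
//	}
//	Logf("uptime: %s", stdout)
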
// getSigner returns an ssh.Signer for the provider ("gce", etc.) that can be
// used to SSH to their nodes.
func getSigner(provider string) (ssh.Signer, error) {
	// Get the directory in which SSH keys are located.
	keydir := filepath.Join(os.Getenv("HOME"), ".ssh")

	// Select the key itself to use. When implementing more providers here,
	// please also add them to any SSH tests that are disabled because of signer
	// support.
	keyfile := ""
	switch provider {
	case "gce", "gke":
		keyfile = "google_compute_engine"
	default:
		return nil, fmt.Errorf("getSigner(...) not implemented for %s", provider)
	}
	key := filepath.Join(keydir, keyfile)
	Logf("Using SSH key: %s", key)

	// Create an actual signer.
	file, err := os.Open(key)
	if err != nil {
		return nil, fmt.Errorf("error opening SSH key %s: '%v'", key, err)
	}
	defer file.Close()
	buffer, err := ioutil.ReadAll(file)
	if err != nil {
		return nil, fmt.Errorf("error reading SSH key %s: '%v'", key, err)
	}
	signer, err := ssh.ParsePrivateKey(buffer)
	if err != nil {
		return nil, fmt.Errorf("error parsing SSH key %s: '%v'", key, err)
	}
	return signer, nil
}

// LatencyMetric stores data about request latency at a given quantile
// broken down by verb (e.g. GET, PUT, LIST) and resource (e.g. pods, services).
type LatencyMetric struct {
	Verb     string
	Resource string
	// 0 <= quantile <=1, e.g. 0.95 is 95%tile, 0.5 is median.
	Quantile float64
	Latency  time.Duration
}

// LatencyMetricByLatency implements sort.Interface for []LatencyMetric based on
// the latency field.
type LatencyMetricByLatency []LatencyMetric

func (a LatencyMetricByLatency) Len() int           { return len(a) }
func (a LatencyMetricByLatency) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
func (a LatencyMetricByLatency) Less(i, j int) bool { return a[i].Latency < a[j].Latency }

func ReadLatencyMetrics(c *client.Client) ([]LatencyMetric, error) {
	body, err := c.Get().AbsPath("/metrics").DoRaw()
	if err != nil {
		return nil, err
	}
	metrics := make([]LatencyMetric, 0)
	for _, line := range strings.Split(string(body), "\n") {
		if strings.HasPrefix(line, "apiserver_request_latencies_summary{") {
			// Example line:
			// apiserver_request_latencies_summary{resource="namespaces",verb="LIST",quantile="0.99"} 908
			// TODO: This parsing code is long and not readable. We should improve it.
			keyVal := strings.Split(line, " ")
			if len(keyVal) != 2 {
				return nil, fmt.Errorf("Error parsing metric %q", line)
			}
			keyElems := strings.Split(line, "\"")
			if len(keyElems) != 7 {
				return nil, fmt.Errorf("Error parsing metric %q", line)
			}
			resource := keyElems[1]
			verb := keyElems[3]
			quantile, err := strconv.ParseFloat(keyElems[5], 64)
			if err != nil {
				return nil, fmt.Errorf("Error parsing metric %q", line)
			}
			latency, err := strconv.ParseFloat(keyVal[1], 64)
			if err != nil {
				return nil, fmt.Errorf("Error parsing metric %q", line)
			}
			metrics = append(metrics, LatencyMetric{verb, resource, quantile, time.Duration(int64(latency)) * time.Microsecond})
		}
	}
	return metrics, nil
}

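// Parsing sketch: for the example line above, splitting on the space yields the
// key and the value "908", and splitting on '"' yields keyElems[1]="namespaces",
// keyElems[3]="LIST", keyElems[5]="0.99". The value is interpreted as
// microseconds, so the line becomes:
//
//	LatencyMetric{Verb: "LIST", Resource: "namespaces", Quantile: 0.99, Latency: 908 * time.Microsecond}
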
// Prints summary metrics for request types with latency above threshold
// and returns the number of such request types.
func HighLatencyRequests(c *client.Client, threshold time.Duration, ignoredResources util.StringSet) (int, error) {
	ignoredVerbs := util.NewStringSet("WATCHLIST", "PROXY")

	metrics, err := ReadLatencyMetrics(c)
	if err != nil {
		return 0, err
	}
	sort.Sort(sort.Reverse(LatencyMetricByLatency(metrics)))
	var badMetrics []LatencyMetric
	top := 5
	for _, metric := range metrics {
		if ignoredResources.Has(metric.Resource) || ignoredVerbs.Has(metric.Verb) {
			continue
		}
		isBad := false
		if metric.Latency > threshold &&
			// We are only interested in 99%tile, but for logging purposes
			// it's useful to have all the offending percentiles.
			metric.Quantile <= 0.99 {
			badMetrics = append(badMetrics, metric)
			isBad = true
		}
		if top > 0 || isBad {
			top--
			prefix := ""
			if isBad {
				prefix = "WARNING "
			}
			Logf("%vTop latency metric: %+v", prefix, metric)
		}
	}
	return len(badMetrics), nil
}