/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package autoscaling

import (
	"bytes"
	"fmt"
	"io/ioutil"
	"math"
	"net/http"
	"os/exec"
	"regexp"
	"strconv"
	"strings"
	"time"

	"k8s.io/api/core/v1"
	policy "k8s.io/api/policy/v1beta1"
	"k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/fields"
	"k8s.io/apimachinery/pkg/labels"
	utilerrors "k8s.io/apimachinery/pkg/util/errors"
	"k8s.io/apimachinery/pkg/util/intstr"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/apimachinery/pkg/util/uuid"
	"k8s.io/apimachinery/pkg/util/wait"
	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/kubernetes/pkg/api"
	"k8s.io/kubernetes/test/e2e/framework"
	"k8s.io/kubernetes/test/e2e/scheduling"
	testutils "k8s.io/kubernetes/test/utils"

	"github.com/golang/glog"
	. "github.com/onsi/ginkgo"
	. "github.com/onsi/gomega"
)

const (
	defaultTimeout         = 3 * time.Minute
	resizeTimeout          = 5 * time.Minute
	scaleUpTimeout         = 5 * time.Minute
	scaleUpTriggerTimeout  = 2 * time.Minute
	scaleDownTimeout       = 20 * time.Minute
	podTimeout             = 2 * time.Minute
	nodesRecoverTimeout    = 5 * time.Minute
	rcCreationRetryTimeout = 4 * time.Minute
	rcCreationRetryDelay   = 20 * time.Second
	makeSchedulableTimeout = 10 * time.Minute
	makeSchedulableDelay   = 20 * time.Second
	freshStatusLimit       = 20 * time.Second

	gkeEndpoint        = "https://test-container.sandbox.googleapis.com"
	gkeUpdateTimeout   = 15 * time.Minute
	gkeNodepoolNameKey = "cloud.google.com/gke-nodepool"

	disabledTaint             = "DisabledForAutoscalingTest"
	criticalAddonsOnlyTaint   = "CriticalAddonsOnly"
	newNodesForScaledownTests = 2
	unhealthyClusterThreshold = 4

	caNoScaleUpStatus      = "NoActivity"
	caOngoingScaleUpStatus = "InProgress"

	timestampFormat = "2006-01-02 15:04:05 -0700 MST"
)

var _ = SIGDescribe("Cluster size autoscaling [Slow]", func() {
	f := framework.NewDefaultFramework("autoscaling")
	var c clientset.Interface
	var nodeCount int
	var coreCount int64
	var memAllocatableMb int
	var originalSizes map[string]int

	BeforeEach(func() {
		c = f.ClientSet
		framework.SkipUnlessProviderIs("gce", "gke")

		originalSizes = make(map[string]int)
		sum := 0
		for _, mig := range strings.Split(framework.TestContext.CloudConfig.NodeInstanceGroup, ",") {
			size, err := framework.GroupSize(mig)
			framework.ExpectNoError(err)
			By(fmt.Sprintf("Initial size of %s: %d", mig, size))
			originalSizes[mig] = size
			sum += size
		}
		// Give instances time to spin up
		framework.ExpectNoError(framework.WaitForReadyNodes(c, sum, scaleUpTimeout))

		nodes := framework.GetReadySchedulableNodesOrDie(f.ClientSet)
		nodeCount = len(nodes.Items)
		coreCount = 0
		for _, node := range nodes.Items {
			quantity := node.Status.Capacity[v1.ResourceCPU]
			coreCount += quantity.Value()
		}
		By(fmt.Sprintf("Initial number of schedulable nodes: %v", nodeCount))
		Expect(nodeCount).NotTo(BeZero())
		mem := nodes.Items[0].Status.Allocatable[v1.ResourceMemory]
		memAllocatableMb = int((&mem).Value() / 1024 / 1024)

		Expect(nodeCount).Should(Equal(sum))

		if framework.ProviderIs("gke") {
			val, err := isAutoscalerEnabled(5)
			framework.ExpectNoError(err)
			if !val {
				err = enableAutoscaler("default-pool", 3, 5)
				framework.ExpectNoError(err)
			}
			Expect(getNAPNodePoolsNumber()).Should(Equal(0))
		}
	})

	AfterEach(func() {
		if framework.ProviderIs("gke") {
			By("Remove changes introduced by NAP tests")
			removeNAPNodePools()
			disableAutoprovisioning()
		}
		By("Restoring initial size of the cluster")
		setMigSizes(originalSizes)
		expectedNodes := 0
		for _, size := range originalSizes {
			expectedNodes += size
		}
		framework.ExpectNoError(framework.WaitForReadyNodes(c, expectedNodes, scaleDownTimeout))
		nodes, err := c.CoreV1().Nodes().List(metav1.ListOptions{})
		framework.ExpectNoError(err)

		s := time.Now()
	makeSchedulableLoop:
		for start := time.Now(); time.Since(start) < makeSchedulableTimeout; time.Sleep(makeSchedulableDelay) {
			for _, n := range nodes.Items {
				err = makeNodeSchedulable(c, &n, true)
				switch err.(type) {
				case CriticalAddonsOnlyError:
					continue makeSchedulableLoop
				default:
					framework.ExpectNoError(err)
				}
			}
			break
		}
		glog.Infof("Made nodes schedulable again in %v", time.Since(s).String())
	})

	It("shouldn't increase cluster size if pending pod is too large [Feature:ClusterSizeAutoscalingScaleUp]", func() {
		By("Creating unschedulable pod")
		ReserveMemory(f, "memory-reservation", 1, int(1.1*float64(memAllocatableMb)), false, defaultTimeout)
		defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "memory-reservation")

		By("Waiting for scale up hoping it won't happen")
		// Verify that the appropriate event was generated
		eventFound := false
	EventsLoop:
		for start := time.Now(); time.Since(start) < scaleUpTimeout; time.Sleep(20 * time.Second) {
			By("Waiting for NotTriggerScaleUp event")
			events, err := f.ClientSet.CoreV1().Events(f.Namespace.Name).List(metav1.ListOptions{})
			framework.ExpectNoError(err)
			for _, e := range events.Items {
				if e.InvolvedObject.Kind == "Pod" && e.Reason == "NotTriggerScaleUp" && strings.Contains(e.Message, "it wouldn't fit if a new node is added") {
					By("NotTriggerScaleUp event found")
					eventFound = true
					break EventsLoop
				}
			}
		}
		Expect(eventFound).Should(Equal(true))
		// Verify that cluster size is not changed
		framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
			func(size int) bool { return size <= nodeCount }, time.Second))
	})

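	// simpleScaleUpTest reserves more memory than fits on the existing nodes and
	// expects the cluster to grow by at least one node; the unready argument is how
	// many not-ready nodes are tolerated while waiting for the new size.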
2015-07-21 14:15:55 +00:00
2017-04-12 21:35:50 +00:00
simpleScaleUpTest := func ( unready int ) {
2017-09-18 12:05:44 +00:00
ReserveMemory ( f , "memory-reservation" , 100 , nodeCount * memAllocatableMb , false , 1 * time . Second )
2016-11-18 20:55:17 +00:00
defer framework . DeleteRCAndPods ( f . ClientSet , f . InternalClientset , f . Namespace . Name , "memory-reservation" )
2016-06-06 14:01:05 +00:00
2017-06-20 12:23:38 +00:00
// Verify that cluster size is increased
2017-04-12 21:35:50 +00:00
framework . ExpectNoError ( WaitForClusterSizeFuncWithUnready ( f . ClientSet ,
func ( size int ) bool { return size >= nodeCount + 1 } , scaleUpTimeout , unready ) )
2016-06-29 07:28:11 +00:00
framework . ExpectNoError ( waitForAllCaPodsReadyInNamespace ( f , c ) )
2017-03-17 15:31:07 +00:00
}
It ( "should increase cluster size if pending pods are small [Feature:ClusterSizeAutoscalingScaleUp]" ,
2017-04-12 21:35:50 +00:00
func ( ) { simpleScaleUpTest ( 0 ) } )
2017-03-17 15:31:07 +00:00
It ( "should increase cluster size if pending pods are small and one node is broken [Feature:ClusterSizeAutoscalingScaleUp]" ,
func ( ) {
2017-04-12 21:35:50 +00:00
framework . TestUnderTemporaryNetworkFailure ( c , "default" , getAnyNode ( c ) , func ( ) { simpleScaleUpTest ( 1 ) } )
2017-03-17 15:31:07 +00:00
} )
	It("shouldn't trigger additional scale-ups during processing scale-up [Feature:ClusterSizeAutoscalingScaleUp]", func() {
		// Wait for the situation to stabilize - CA should be running and have up-to-date node readiness info.
		status, err := waitForScaleUpStatus(c, func(s *scaleUpStatus) bool {
			return s.ready == s.target && s.ready <= nodeCount
		}, scaleUpTriggerTimeout)
		framework.ExpectNoError(err)

		unmanagedNodes := nodeCount - status.ready

		By("Schedule more pods than can fit and wait for cluster to scale-up")
		ReserveMemory(f, "memory-reservation", 100, nodeCount*memAllocatableMb, false, 1*time.Second)
		defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "memory-reservation")

		status, err = waitForScaleUpStatus(c, func(s *scaleUpStatus) bool {
			return s.status == caOngoingScaleUpStatus
		}, scaleUpTriggerTimeout)
		framework.ExpectNoError(err)
		target := status.target
		framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))

		By("Expect no more scale-up to be happening after all pods are scheduled")
		status, err = getScaleUpStatus(c)
		framework.ExpectNoError(err)
		if status.target != target {
			glog.Warningf("Final number of nodes (%v) does not match initial scale-up target (%v).", status.target, target)
		}
		Expect(status.timestamp.Add(freshStatusLimit).Before(time.Now())).Should(Equal(false))
		Expect(status.status).Should(Equal(caNoScaleUpStatus))
		Expect(status.ready).Should(Equal(status.target))
		Expect(len(framework.GetReadySchedulableNodesOrDie(f.ClientSet).Items)).Should(Equal(status.target + unmanagedNodes))
	})

2016-06-29 07:28:11 +00:00
It ( "should increase cluster size if pending pods are small and there is another node pool that is not autoscaled [Feature:ClusterSizeAutoscalingScaleUp]" , func ( ) {
framework . SkipUnlessProviderIs ( "gke" )
By ( "Creating new node-pool with one n1-standard-4 machine" )
const extraPoolName = "extra-pool"
2016-07-01 09:02:44 +00:00
addNodePool ( extraPoolName , "n1-standard-4" , 1 )
2016-06-29 10:19:49 +00:00
defer deleteNodePool ( extraPoolName )
2017-08-29 09:42:30 +00:00
framework . ExpectNoError ( framework . WaitForReadyNodes ( c , nodeCount + 1 , resizeTimeout ) )
2016-06-29 07:28:11 +00:00
glog . Infof ( "Not enabling cluster autoscaler for the node pool (on purpose)." )
2017-06-28 14:32:05 +00:00
By ( "Get memory available on new node, so we can account for it when creating RC" )
2017-09-14 10:06:34 +00:00
nodes := getPoolNodes ( f , extraPoolName )
2017-06-28 14:32:05 +00:00
Expect ( len ( nodes ) ) . Should ( Equal ( 1 ) )
2017-09-14 10:06:34 +00:00
extraMem := nodes [ 0 ] . Status . Capacity [ v1 . ResourceMemory ]
2017-06-28 14:32:05 +00:00
extraMemMb := int ( ( & extraMem ) . Value ( ) / 1024 / 1024 )
2017-09-18 12:05:44 +00:00
ReserveMemory ( f , "memory-reservation" , 100 , nodeCount * memAllocatableMb + extraMemMb , false , defaultTimeout )
2016-11-18 20:55:17 +00:00
defer framework . DeleteRCAndPods ( f . ClientSet , f . InternalClientset , f . Namespace . Name , "memory-reservation" )
2016-06-29 07:28:11 +00:00
// Verify, that cluster size is increased
2016-10-18 13:00:38 +00:00
framework . ExpectNoError ( WaitForClusterSizeFunc ( f . ClientSet ,
2017-06-28 14:32:05 +00:00
func ( size int ) bool { return size >= nodeCount + 2 } , scaleUpTimeout ) )
2016-06-13 14:06:06 +00:00
framework . ExpectNoError ( waitForAllCaPodsReadyInNamespace ( f , c ) )
2016-05-23 12:10:40 +00:00
} )
	It("should disable node pool autoscaling [Feature:ClusterSizeAutoscalingScaleUp]", func() {
		framework.SkipUnlessProviderIs("gke")

		By("Creating new node-pool with one n1-standard-4 machine")
		const extraPoolName = "extra-pool"
		addNodePool(extraPoolName, "n1-standard-4", 1)
		defer deleteNodePool(extraPoolName)
		framework.ExpectNoError(framework.WaitForReadyNodes(c, nodeCount+1, resizeTimeout))

		framework.ExpectNoError(enableAutoscaler(extraPoolName, 1, 2))
		framework.ExpectNoError(disableAutoscaler(extraPoolName, 1, 2))
	})

	It("should increase cluster size if pods are pending due to host port conflict [Feature:ClusterSizeAutoscalingScaleUp]", func() {
		scheduling.CreateHostPortPods(f, "host-port", nodeCount+2, false)
		defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "host-port")

		framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
			func(size int) bool { return size >= nodeCount+2 }, scaleUpTimeout))
		framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
	})

	It("should increase cluster size if pods are pending due to pod anti-affinity [Feature:ClusterSizeAutoscalingScaleUp]", func() {
		pods := nodeCount
		newPods := 2
		labels := map[string]string{
			"anti-affinity": "yes",
		}
		By("starting a pod with anti-affinity on each node")
		framework.ExpectNoError(runAntiAffinityPods(f, f.Namespace.Name, pods, "some-pod", labels, labels))
		defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "some-pod")
		framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))

		By("scheduling extra pods with anti-affinity to existing ones")
		framework.ExpectNoError(runAntiAffinityPods(f, f.Namespace.Name, newPods, "extra-pod", labels, labels))
		defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "extra-pod")

		framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
		framework.ExpectNoError(framework.WaitForReadyNodes(c, nodeCount+newPods, scaleUpTimeout))
	})

	It("should increase cluster size if pod requesting EmptyDir volume is pending [Feature:ClusterSizeAutoscalingScaleUp]", func() {
		By("creating pods")
		pods := nodeCount
		newPods := 1
		labels := map[string]string{
			"anti-affinity": "yes",
		}
		framework.ExpectNoError(runAntiAffinityPods(f, f.Namespace.Name, pods, "some-pod", labels, labels))
		defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "some-pod")

		By("waiting for all pods before triggering scale up")
		framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))

		By("creating a pod requesting EmptyDir")
		framework.ExpectNoError(runVolumeAntiAffinityPods(f, f.Namespace.Name, newPods, "extra-pod", labels, labels, emptyDirVolumes))
		defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "extra-pod")

		framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
		framework.ExpectNoError(framework.WaitForReadyNodes(c, nodeCount+newPods, scaleUpTimeout))
	})

	It("should increase cluster size if pod requesting volume is pending [Feature:ClusterSizeAutoscalingScaleUp]", func() {
		framework.SkipUnlessProviderIs("gce", "gke")

		volumeLabels := labels.Set{
			framework.VolumeSelectorKey: f.Namespace.Name,
		}
		selector := metav1.SetAsLabelSelector(volumeLabels)

		By("creating volume & pvc")
		diskName, err := framework.CreatePDWithRetry()
		framework.ExpectNoError(err)
		pvConfig := framework.PersistentVolumeConfig{
			NamePrefix: "gce-",
			Labels:     volumeLabels,
			PVSource: v1.PersistentVolumeSource{
				GCEPersistentDisk: &v1.GCEPersistentDiskVolumeSource{
					PDName:   diskName,
					FSType:   "ext3",
					ReadOnly: false,
				},
			},
			Prebind: nil,
		}
		pvcConfig := framework.PersistentVolumeClaimConfig{
			Annotations: map[string]string{
				v1.BetaStorageClassAnnotation: "",
			},
			Selector: selector,
		}

		pv, pvc, err := framework.CreatePVPVC(c, pvConfig, pvcConfig, f.Namespace.Name, false)
		framework.ExpectNoError(err)
		framework.ExpectNoError(framework.WaitOnPVandPVC(c, f.Namespace.Name, pv, pvc))

		defer func() {
			errs := framework.PVPVCCleanup(c, f.Namespace.Name, pv, pvc)
			if len(errs) > 0 {
				framework.Failf("failed to delete PVC and/or PV. Errors: %v", utilerrors.NewAggregate(errs))
			}
			pv, pvc = nil, nil
			if diskName != "" {
				framework.ExpectNoError(framework.DeletePDWithRetry(diskName))
			}
		}()

		By("creating pods")
		pods := nodeCount
		labels := map[string]string{
			"anti-affinity": "yes",
		}
		framework.ExpectNoError(runAntiAffinityPods(f, f.Namespace.Name, pods, "some-pod", labels, labels))
		defer func() {
			framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "some-pod")
			glog.Infof("RC and pods not using volume deleted")
		}()

		By("waiting for all pods before triggering scale up")
		framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))

		By("creating a pod requesting PVC")
		pvcPodName := "pvc-pod"
		newPods := 1
		volumes := buildVolumes(pv, pvc)
		framework.ExpectNoError(runVolumeAntiAffinityPods(f, f.Namespace.Name, newPods, pvcPodName, labels, labels, volumes))
		defer func() {
			framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, pvcPodName)
			framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
		}()

		framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
		framework.ExpectNoError(framework.WaitForReadyNodes(c, nodeCount+newPods, scaleUpTimeout))
	})

	It("should add node to the particular mig [Feature:ClusterSizeAutoscalingScaleUp]", func() {
		labelKey := "cluster-autoscaling-test.special-node"
		labelValue := "true"

		By("Finding the smallest MIG")
		minMig := ""
		minSize := nodeCount
		for mig, size := range originalSizes {
			if size <= minSize {
				minMig = mig
				minSize = size
			}
		}

		removeLabels := func(nodesToClean sets.String) {
			By("Removing labels from nodes")
			for node := range nodesToClean {
				framework.RemoveLabelOffNode(c, node, labelKey)
			}
		}

		nodes, err := framework.GetGroupNodes(minMig)
		framework.ExpectNoError(err)
		nodesSet := sets.NewString(nodes...)
		defer removeLabels(nodesSet)
		By(fmt.Sprintf("Annotating nodes of the smallest MIG(%s): %v", minMig, nodes))

		for node := range nodesSet {
			framework.AddOrUpdateLabelOnNode(c, node, labelKey, labelValue)
		}

		scheduling.CreateNodeSelectorPods(f, "node-selector", minSize+1, map[string]string{labelKey: labelValue}, false)

		By("Waiting for new node to appear and annotating it")
		framework.WaitForGroupSize(minMig, int32(minSize+1))
		// Verify that cluster size is increased
		framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
			func(size int) bool { return size >= nodeCount+1 }, scaleUpTimeout))

		newNodes, err := framework.GetGroupNodes(minMig)
		framework.ExpectNoError(err)
		newNodesSet := sets.NewString(newNodes...)
		newNodesSet.Delete(nodes...)
		if len(newNodesSet) > 1 {
			By(fmt.Sprintf("Spotted following new nodes in %s: %v", minMig, newNodesSet))
			glog.Infof("Usually only 1 new node is expected, investigating")
			glog.Infof("Kubectl:%s\n", framework.RunKubectlOrDie("get", "nodes", "-o", "json"))
			if output, err := exec.Command("gcloud", "compute", "instances", "list",
				"--project="+framework.TestContext.CloudConfig.ProjectID,
				"--zone="+framework.TestContext.CloudConfig.Zone).Output(); err == nil {
				glog.Infof("Gcloud compute instances list: %s", output)
			} else {
				glog.Errorf("Failed to get instances list: %v", err)
			}

			for newNode := range newNodesSet {
				if output, err := execCmd("gcloud", "compute", "instances", "describe",
					newNode,
					"--project="+framework.TestContext.CloudConfig.ProjectID,
					"--zone="+framework.TestContext.CloudConfig.Zone).Output(); err == nil {
					glog.Infof("Gcloud compute instances describe: %s", output)
				} else {
					glog.Errorf("Failed to get instances describe: %v", err)
				}
			}

			// TODO: possibly remove broken node from newNodesSet to prevent removeLabel from crashing.
			// However at this moment we DO WANT it to crash so that we don't check all test runs for the
			// rare behavior, but only the broken ones.
		}
		By(fmt.Sprintf("New nodes: %v\n", newNodesSet))
		registeredNodes := sets.NewString()
		for nodeName := range newNodesSet {
			node, err := f.ClientSet.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{})
			if err == nil && node != nil {
				registeredNodes.Insert(nodeName)
			} else {
				glog.Errorf("Failed to get node %v: %v", nodeName, err)
			}
		}
		By(fmt.Sprintf("Setting labels for registered new nodes: %v", registeredNodes.List()))
		for node := range registeredNodes {
			framework.AddOrUpdateLabelOnNode(c, node, labelKey, labelValue)
		}

		defer removeLabels(registeredNodes)

		framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
		framework.ExpectNoError(framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "node-selector"))
	})

	It("should scale up correct target pool [Feature:ClusterSizeAutoscalingScaleUp]", func() {
		framework.SkipUnlessProviderIs("gke")

		By("Creating new node-pool with one n1-standard-4 machine")
		const extraPoolName = "extra-pool"
		addNodePool(extraPoolName, "n1-standard-4", 1)
		defer deleteNodePool(extraPoolName)
		framework.ExpectNoError(framework.WaitForReadyNodes(c, nodeCount+1, resizeTimeout))
		framework.ExpectNoError(enableAutoscaler(extraPoolName, 1, 2))
		defer disableAutoscaler(extraPoolName, 1, 2)

		By("Creating rc with 2 pods too big to fit default-pool but fitting extra-pool")
		ReserveMemory(f, "memory-reservation", 2, int(2.5*float64(memAllocatableMb)), false, defaultTimeout)
		defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "memory-reservation")

		// Apparently the GKE master is restarted a couple of minutes after the node pool is added,
		// resetting all the timers in the scale-down code. Adding 5 extra minutes to work around
		// this issue.
		// TODO: Remove the extra time when GKE restart is fixed.
		framework.ExpectNoError(framework.WaitForReadyNodes(c, nodeCount+2, scaleUpTimeout+5*time.Minute))
	})

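	// simpleScaleDownTest manually resizes every MIG up by 2 (+unready), waits for the
	// new nodes to register, and then expects the autoscaler to remove at least one
	// unneeded node within the scale-down timeout.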
	simpleScaleDownTest := func(unready int) {
		cleanup, err := addKubeSystemPdbs(f)
		defer cleanup()
		framework.ExpectNoError(err)

		By("Manually increase cluster size")
		increasedSize := 0
		newSizes := make(map[string]int)
		for key, val := range originalSizes {
			newSizes[key] = val + 2 + unready
			increasedSize += val + 2 + unready
		}
		setMigSizes(newSizes)
		framework.ExpectNoError(WaitForClusterSizeFuncWithUnready(f.ClientSet,
			func(size int) bool { return size >= increasedSize }, scaleUpTimeout, unready))

		By("Some node should be removed")
		framework.ExpectNoError(WaitForClusterSizeFuncWithUnready(f.ClientSet,
			func(size int) bool { return size < increasedSize }, scaleDownTimeout, unready))
	}

	It("should correctly scale down after a node is not needed [Feature:ClusterSizeAutoscalingScaleDown]",
		func() { simpleScaleDownTest(0) })

	It("should correctly scale down after a node is not needed and one node is broken [Feature:ClusterSizeAutoscalingScaleDown]",
		func() {
			framework.TestUnderTemporaryNetworkFailure(c, "default", getAnyNode(c), func() { simpleScaleDownTest(1) })
		})

	It("should correctly scale down after a node is not needed when there is non autoscaled pool[Feature:ClusterSizeAutoscalingScaleDown]", func() {
		framework.SkipUnlessProviderIs("gke")

		increasedSize := manuallyIncreaseClusterSize(f, originalSizes)

		const extraPoolName = "extra-pool"
		addNodePool(extraPoolName, "n1-standard-1", 3)
		defer deleteNodePool(extraPoolName)

		framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
			func(size int) bool { return size >= increasedSize+3 }, scaleUpTimeout))

		By("Some node should be removed")
		// Apparently the GKE master is restarted a couple of minutes after the node pool is added,
		// resetting all the timers in the scale-down code. Adding 10 extra minutes to work around
		// this issue.
		// TODO: Remove the extra time when GKE restart is fixed.
		framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
			func(size int) bool { return size < increasedSize+3 }, scaleDownTimeout+10*time.Minute))
	})

	It("should be able to scale down when rescheduling a pod is required and pdb allows for it[Feature:ClusterSizeAutoscalingScaleDown]", func() {
		runDrainTest(f, originalSizes, f.Namespace.Name, 1, 1, func(increasedSize int) {
			By("Some node should be removed")
			framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
				func(size int) bool { return size < increasedSize }, scaleDownTimeout))
		})
	})

	It("shouldn't be able to scale down when rescheduling a pod is required, but pdb doesn't allow drain[Feature:ClusterSizeAutoscalingScaleDown]", func() {
		runDrainTest(f, originalSizes, f.Namespace.Name, 1, 0, func(increasedSize int) {
			By("No nodes should be removed")
			time.Sleep(scaleDownTimeout)
			nodes := framework.GetReadySchedulableNodesOrDie(f.ClientSet)
			Expect(len(nodes.Items)).Should(Equal(increasedSize))
		})
	})

	It("should be able to scale down by draining multiple pods one by one as dictated by pdb[Feature:ClusterSizeAutoscalingScaleDown]", func() {
		runDrainTest(f, originalSizes, f.Namespace.Name, 2, 1, func(increasedSize int) {
			By("Some node should be removed")
			framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
				func(size int) bool { return size < increasedSize }, scaleDownTimeout))
		})
	})

	It("should be able to scale down by draining system pods with pdb[Feature:ClusterSizeAutoscalingScaleDown]", func() {
		runDrainTest(f, originalSizes, "kube-system", 2, 1, func(increasedSize int) {
			By("Some node should be removed")
			framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
				func(size int) bool { return size < increasedSize }, scaleDownTimeout))
		})
	})

	It("Should be able to scale a node group up from 0[Feature:ClusterSizeAutoscalingScaleUp]", func() {
		// Provider-specific setup
		if framework.ProviderIs("gke") {
			// GKE-specific setup
			By("Add a new node pool with 0 nodes and min size 0")
			const extraPoolName = "extra-pool"
			addNodePool(extraPoolName, "n1-standard-4", 0)
			defer deleteNodePool(extraPoolName)
			framework.ExpectNoError(enableAutoscaler(extraPoolName, 0, 1))
			defer disableAutoscaler(extraPoolName, 0, 1)
		} else {
			// on GCE, run only if there are already at least 2 node groups
			framework.SkipUnlessAtLeast(len(originalSizes), 2, "At least 2 node groups are needed for scale-to-0 tests")

			By("Manually scale smallest node group to 0")
			minMig := ""
			minSize := nodeCount
			for mig, size := range originalSizes {
				if size <= minSize {
					minMig = mig
					minSize = size
				}
			}
			framework.ExpectNoError(framework.ResizeGroup(minMig, int32(0)))
			framework.ExpectNoError(framework.WaitForReadyNodes(c, nodeCount-minSize, resizeTimeout))
		}

		By("Make remaining nodes unschedulable")
		nodes, err := f.ClientSet.CoreV1().Nodes().List(metav1.ListOptions{FieldSelector: fields.Set{
			"spec.unschedulable": "false",
		}.AsSelector().String()})
		framework.ExpectNoError(err)

		for _, node := range nodes.Items {
			err = makeNodeUnschedulable(f.ClientSet, &node)

			defer func(n v1.Node) {
				makeNodeSchedulable(f.ClientSet, &n, false)
			}(node)

			framework.ExpectNoError(err)
		}

		By("Run a scale-up test")
		ReserveMemory(f, "memory-reservation", 1, 100, false, 1*time.Second)
		defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "memory-reservation")

		// Verify that cluster size is increased
		framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
			func(size int) bool { return size >= len(nodes.Items)+1 }, scaleUpTimeout))
		framework.ExpectNoError(waitForAllCaPodsReadyInNamespace(f, c))
	})

	// The scale-to-0 test is split into two functions (one for GKE, one for GCE)
	// because the scenario is exactly the same, but setup & verification use
	// different APIs.
	//
	// Scenario:
	// (GKE only) add an extra node pool with size 1 & enable autoscaling for it
	// (GCE only) find the smallest MIG & resize it to 1
	// manually drain the single node from this node pool/MIG
	// wait for cluster size to decrease
	// verify the targeted node pool/MIG is of size 0
	gkeScaleToZero := func() {
		// GKE-specific setup
		By("Add a new node pool with 1 node and min size 0")
		const extraPoolName = "extra-pool"
		addNodePool(extraPoolName, "n1-standard-4", 1)
		defer deleteNodePool(extraPoolName)
		framework.ExpectNoError(framework.WaitForReadyNodes(c, nodeCount+1, resizeTimeout))
		framework.ExpectNoError(enableAutoscaler(extraPoolName, 0, 1))
		defer disableAutoscaler(extraPoolName, 0, 1)

		ngNodes := getPoolNodes(f, extraPoolName)
		Expect(len(ngNodes) == 1).To(BeTrue())
		node := ngNodes[0]
		By(fmt.Sprintf("Target node for scale-down: %s", node.Name))

		// this part is identical
		drainNode(f, node)
		framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
			func(size int) bool { return size < nodeCount+1 }, scaleDownTimeout))

		// GKE-specific check
		newSize := getPoolSize(f, extraPoolName)
		Expect(newSize).Should(Equal(0))
	}

	gceScaleToZero := func() {
		// non-GKE only
		By("Find smallest node group and manually scale it to a single node")
		minMig := ""
		minSize := nodeCount
		for mig, size := range originalSizes {
			if size <= minSize {
				minMig = mig
				minSize = size
			}
		}
		framework.ExpectNoError(framework.ResizeGroup(minMig, int32(1)))
		framework.ExpectNoError(framework.WaitForReadyNodes(c, nodeCount-minSize+1, resizeTimeout))
		ngNodes, err := framework.GetGroupNodes(minMig)
		framework.ExpectNoError(err)
		Expect(len(ngNodes) == 1).To(BeTrue())
		node, err := f.ClientSet.CoreV1().Nodes().Get(ngNodes[0], metav1.GetOptions{})
		By(fmt.Sprintf("Target node for scale-down: %s", node.Name))
		framework.ExpectNoError(err)

		// this part is identical
		drainNode(f, node)
		framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
			func(size int) bool { return size < nodeCount-minSize+1 }, scaleDownTimeout))

		// non-GKE only
		newSize, err := framework.GroupSize(minMig)
		framework.ExpectNoError(err)
		Expect(newSize).Should(Equal(0))
	}

	It("Should be able to scale a node group down to 0[Feature:ClusterSizeAutoscalingScaleDown]", func() {
		if framework.ProviderIs("gke") { // In GKE, we can just add a node pool
			gkeScaleToZero()
		} else if len(originalSizes) >= 2 {
			gceScaleToZero()
		} else {
			framework.Skipf("At least 2 node groups are needed for scale-to-0 tests")
		}
	})

	It("Shouldn't perform scale up operation and should list unhealthy status if most of the cluster is broken[Feature:ClusterSizeAutoscalingScaleUp]", func() {
		clusterSize := nodeCount
		for clusterSize < unhealthyClusterThreshold+1 {
			clusterSize = manuallyIncreaseClusterSize(f, originalSizes)
		}

		By("Block network connectivity to some nodes to simulate unhealthy cluster")
		nodesToBreakCount := int(math.Floor(math.Max(float64(unhealthyClusterThreshold), 0.5*float64(clusterSize))))
		nodes, err := f.ClientSet.CoreV1().Nodes().List(metav1.ListOptions{FieldSelector: fields.Set{
			"spec.unschedulable": "false",
		}.AsSelector().String()})
		framework.ExpectNoError(err)
		Expect(nodesToBreakCount <= len(nodes.Items)).To(BeTrue())
		nodesToBreak := nodes.Items[:nodesToBreakCount]

		// TestUnderTemporaryNetworkFailure only removes connectivity to a single node,
		// and accepts a func() callback. The loop is therefore expanded into a recursive
		// call to avoid duplicating TestUnderTemporaryNetworkFailure.
		var testFunction func()
		testFunction = func() {
			if len(nodesToBreak) > 0 {
				ntb := &nodesToBreak[0]
				nodesToBreak = nodesToBreak[1:]
				framework.TestUnderTemporaryNetworkFailure(c, "default", ntb, testFunction)
			} else {
				ReserveMemory(f, "memory-reservation", 100, nodeCount*memAllocatableMb, false, defaultTimeout)
				defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, "memory-reservation")
				time.Sleep(scaleUpTimeout)
				currentNodes := framework.GetReadySchedulableNodesOrDie(f.ClientSet)
				framework.Logf("Currently available nodes: %v, nodes available at the start of test: %v, disabled nodes: %v", len(currentNodes.Items), len(nodes.Items), nodesToBreakCount)
				Expect(len(currentNodes.Items)).Should(Equal(len(nodes.Items) - nodesToBreakCount))
				status, err := getClusterwideStatus(c)
				framework.Logf("Clusterwide status: %v", status)
				framework.ExpectNoError(err)
				Expect(status).Should(Equal("Unhealthy"))
			}
		}
		testFunction()
		// Give nodes time to recover from network failure
		framework.ExpectNoError(framework.WaitForReadyNodes(c, len(nodes.Items), nodesRecoverTimeout))
	})

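	// The tests below exercise GKE node autoprovisioning (NAP), which lets the
	// autoscaler create and remove whole node pools on demand.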
	It("should add new node and new node pool on too big pod, scale down to 1 and scale down to 0 [Feature:ClusterSizeAutoscalingScaleWithNAP]", func() {
		framework.SkipUnlessProviderIs("gke")
		framework.ExpectNoError(enableAutoprovisioning(""))

		By("Create first pod")
		cleanupFunc1 := ReserveMemory(f, "memory-reservation1", 1, int(1.1*float64(memAllocatableMb)), true, defaultTimeout)
		defer func() {
			if cleanupFunc1 != nil {
				cleanupFunc1()
			}
		}()
		By("Waiting for scale up")
		// Verify that cluster size increased.
		framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
			func(size int) bool { return size == nodeCount+1 }, defaultTimeout))
		By("Check if NAP group was created")
		Expect(getNAPNodePoolsNumber()).Should(Equal(1))

		By("Create second pod")
		cleanupFunc2 := ReserveMemory(f, "memory-reservation2", 1, int(1.1*float64(memAllocatableMb)), true, defaultTimeout)
		defer func() {
			if cleanupFunc2 != nil {
				cleanupFunc2()
			}
		}()
		By("Waiting for scale up")
		// Verify that cluster size increased.
		framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
			func(size int) bool { return size == nodeCount+2 }, defaultTimeout))

		By("Delete first pod")
		cleanupFunc1()
		cleanupFunc1 = nil
		By("Waiting for scale down to 1")
		// Verify that cluster size decreased.
		framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
			func(size int) bool { return size == nodeCount+1 }, scaleDownTimeout))

		By("Delete second pod")
		cleanupFunc2()
		cleanupFunc2 = nil
		By("Waiting for scale down to 0")
		// Verify that cluster size decreased.
		framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
			func(size int) bool { return size == nodeCount }, scaleDownTimeout))

		By("Waiting for NAP group remove")
		framework.ExpectNoError(waitTillAllNAPNodePoolsAreRemoved())
		By("Check if NAP group was removed")
		Expect(getNAPNodePoolsNumber()).Should(Equal(0))
	})

	It("shouldn't add new node group if not needed [Feature:ClusterSizeAutoscalingScaleWithNAP]", func() {
		framework.SkipUnlessProviderIs("gke")
		framework.ExpectNoError(enableAutoprovisioning(""))

		By("Create pods")
		// Create nodeCount+1 pods allocating 0.7 allocatable on present nodes. One more node will have to be created.
		cleanupFunc := ReserveMemory(f, "memory-reservation", nodeCount+1, int(float64(nodeCount+1)*float64(0.7)*float64(memAllocatableMb)), true, scaleUpTimeout)
		defer cleanupFunc()

		By("Waiting for scale up")
		// Verify that cluster size increased.
		framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
			func(size int) bool { return size >= nodeCount+1 }, scaleUpTimeout))

		By("Check if NAP group was created hoping it didn't happen")
		Expect(getNAPNodePoolsNumber()).Should(Equal(0))
	})

	It("shouldn't scale up if cores limit too low, should scale up after limit is changed [Feature:ClusterSizeAutoscalingScaleWithNAP]", func() {
		framework.SkipUnlessProviderIs("gke")

		By(fmt.Sprintf("Set core limit to %d", coreCount))
		framework.ExpectNoError(enableAutoprovisioning(fmt.Sprintf(`"resource_limits":{"name":"cpu", "minimum":2, "maximum":%d}, "resource_limits":{"name":"memory", "minimum":0, "maximum":10000000}`, coreCount)))

		// Create a pod allocating 1.1 of the allocatable memory of a present node. A bigger node would have to be created.
		cleanupFunc := ReserveMemory(f, "memory-reservation", 1, int(1.1*float64(memAllocatableMb)), false, time.Second)
		defer cleanupFunc()

		By(fmt.Sprintf("Waiting for scale up hoping it won't happen, sleep for %s", scaleUpTimeout.String()))
		time.Sleep(scaleUpTimeout)
		// Verify that cluster size is not changed
		framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
			func(size int) bool { return size == nodeCount }, time.Second))

		By("Change resource limits")
		framework.ExpectNoError(enableAutoprovisioning(fmt.Sprintf(`"resource_limits":{"name":"cpu", "minimum":2, "maximum":%d}, "resource_limits":{"name":"memory", "minimum":0, "maximum":10000000}`, coreCount+5)))

		By("Wait for scale up")
		framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
			func(size int) bool { return size == nodeCount+1 }, scaleUpTimeout))

		By("Check if NAP group was created")
		Expect(getNAPNodePoolsNumber()).Should(Equal(1))
	})

	It("should create new node if there is no node for node selector [Feature:ClusterSizeAutoscalingScaleWithNAP]", func() {
		framework.SkipUnlessProviderIs("gke")
		framework.ExpectNoError(enableAutoprovisioning(""))

		// Create a pod allocating 0.7 of a node's allocatable memory, with a node selector no existing node matches.
		cleanupFunc := ReserveMemoryWithSelector(f, "memory-reservation", 1, int(0.7*float64(memAllocatableMb)), true, scaleUpTimeout, map[string]string{"test": "test"})
		defer cleanupFunc()

		By("Waiting for scale up")
		// Verify that cluster size increased.
		framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet,
			func(size int) bool { return size == nodeCount+1 }, defaultTimeout))

		By("Check if NAP group was created")
		Expect(getNAPNodePoolsNumber()).Should(Equal(1))
	})
})

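// execCmd logs the full command line and returns the corresponding *exec.Cmd,
// ready to be run by the caller.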
func execCmd(args ...string) *exec.Cmd {
	glog.Infof("Executing: %s", strings.Join(args, " "))
	return exec.Command(args[0], args[1:]...)
}

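// runDrainTest covers every schedulable node with reschedulable pods protected by a
// PodDisruptionBudget that allows at most pdbSize of them to be disrupted, then calls
// verifyFunction with the manually increased cluster size.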
func runDrainTest(f *framework.Framework, migSizes map[string]int, namespace string, podsPerNode, pdbSize int, verifyFunction func(int)) {
	increasedSize := manuallyIncreaseClusterSize(f, migSizes)

	nodes, err := f.ClientSet.CoreV1().Nodes().List(metav1.ListOptions{FieldSelector: fields.Set{
		"spec.unschedulable": "false",
	}.AsSelector().String()})
	framework.ExpectNoError(err)

	numPods := len(nodes.Items) * podsPerNode
	testID := string(uuid.NewUUID()) // So that we can label and find pods
	labelMap := map[string]string{"test_id": testID}
	framework.ExpectNoError(runReplicatedPodOnEachNode(f, nodes.Items, namespace, podsPerNode, "reschedulable-pods", labelMap, 0))

	defer framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, namespace, "reschedulable-pods")

	By("Create a PodDisruptionBudget")
	minAvailable := intstr.FromInt(numPods - pdbSize)
	pdb := &policy.PodDisruptionBudget{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "test_pdb",
			Namespace: namespace,
		},
		Spec: policy.PodDisruptionBudgetSpec{
			Selector:     &metav1.LabelSelector{MatchLabels: labelMap},
			MinAvailable: &minAvailable,
		},
	}
	_, err = f.ClientSet.Policy().PodDisruptionBudgets(namespace).Create(pdb)
	defer func() {
		f.ClientSet.Policy().PodDisruptionBudgets(namespace).Delete(pdb.Name, &metav1.DeleteOptions{})
	}()
	framework.ExpectNoError(err)
	verifyFunction(increasedSize)
}

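// getGKEURL builds an authenticated GKE REST URL for the given API version and
// resource suffix, using an access token obtained from gcloud.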
func getGKEURL(apiVersion string, suffix string) string {
	out, err := execCmd("gcloud", "auth", "print-access-token").Output()
	framework.ExpectNoError(err)
	token := strings.Replace(string(out), "\n", "", -1)
	return fmt.Sprintf("%s/%s/%s?access_token=%s",
		gkeEndpoint,
		apiVersion,
		suffix,
		token)
}

func getGKEClusterURL(apiVersion string) string {
	return getGKEURL(apiVersion, fmt.Sprintf("projects/%s/zones/%s/clusters/%s",
		framework.TestContext.CloudConfig.ProjectID,
		framework.TestContext.CloudConfig.Zone,
		framework.TestContext.CloudConfig.Cluster))
}

func getCluster(apiVersion string) (string, error) {
	resp, err := http.Get(getGKEClusterURL(apiVersion))
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return "", err
	}
	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("error: %s %s", resp.Status, body)
	}

	return string(body), nil
}

func isAutoscalerEnabled(expectedMaxNodeCountInTargetPool int) (bool, error) {
	strBody, err := getCluster("v1")
	if err != nil {
		return false, err
	}
	if strings.Contains(strBody, "\"maxNodeCount\": "+strconv.Itoa(expectedMaxNodeCountInTargetPool)) {
		return true, nil
	}
	return false, nil
}

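// enableAutoscaler turns on autoscaling for the given node pool via gcloud and waits
// until the new maxNodeCount is visible in the cluster configuration.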
func enableAutoscaler(nodePool string, minCount, maxCount int) error {
	glog.Infof("Using gcloud to enable autoscaling for pool %s", nodePool)

	output, err := execCmd("gcloud", "container", "clusters", "update", framework.TestContext.CloudConfig.Cluster,
		"--enable-autoscaling",
		"--min-nodes="+strconv.Itoa(minCount),
		"--max-nodes="+strconv.Itoa(maxCount),
		"--node-pool="+nodePool,
		"--project="+framework.TestContext.CloudConfig.ProjectID,
		"--zone="+framework.TestContext.CloudConfig.Zone).CombinedOutput()
	if err != nil {
		glog.Errorf("Failed config update result: %s", output)
		return fmt.Errorf("Failed to enable autoscaling: %v", err)
	}
	glog.Infof("Config update result: %s", output)

	var finalErr error
	for startTime := time.Now(); startTime.Add(gkeUpdateTimeout).After(time.Now()); time.Sleep(30 * time.Second) {
		val, err := isAutoscalerEnabled(maxCount)
		if err == nil && val {
			return nil
		}
		finalErr = err
	}
	return fmt.Errorf("autoscaler not enabled, last error: %v", finalErr)
}

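// disableAutoscaler turns off autoscaling for the given node pool via gcloud and waits
// until the change is reflected in the cluster configuration.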
func disableAutoscaler(nodePool string, minCount, maxCount int) error {
	glog.Infof("Using gcloud to disable autoscaling for pool %s", nodePool)

	output, err := execCmd("gcloud", "container", "clusters", "update", framework.TestContext.CloudConfig.Cluster,
		"--no-enable-autoscaling",
		"--node-pool="+nodePool,
		"--project="+framework.TestContext.CloudConfig.ProjectID,
		"--zone="+framework.TestContext.CloudConfig.Zone).CombinedOutput()
	if err != nil {
		glog.Errorf("Failed config update result: %s", output)
		return fmt.Errorf("Failed to disable autoscaling: %v", err)
	}
	glog.Infof("Config update result: %s", output)

	var finalErr error
	for startTime := time.Now(); startTime.Add(gkeUpdateTimeout).After(time.Now()); time.Sleep(30 * time.Second) {
		val, err := isAutoscalerEnabled(maxCount)
		if err == nil && !val {
			return nil
		}
		finalErr = err
	}
	return fmt.Errorf("autoscaler still enabled, last error: %v", finalErr)
}

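// isAutoprovisioningEnabled checks the v1alpha1 cluster description for the
// enableNodeAutoprovisioning flag.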
func isAutoprovisioningEnabled() (bool, error) {
	strBody, err := getCluster("v1alpha1")
	if err != nil {
		return false, err
	}
	if strings.Contains(strBody, "\"enableNodeAutoprovisioning\": true") {
		return true, nil
	}
	return false, nil
}

func executeHTTPRequest(method string, url string, body string) (string, error) {
	client := &http.Client{}
	req, err := http.NewRequest(method, url, strings.NewReader(body))
	if err != nil {
		By(fmt.Sprintf("Can't create request: %s", err.Error()))
		return "", err
	}
	resp, err := client.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()
	respBody, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return "", err
	}
	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("error: %s %s", resp.Status, string(respBody))
	}

	return string(respBody), nil
}

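// enableAutoprovisioning enables node autoprovisioning through the GKE v1alpha1 API,
// using the given resource limits (or permissive defaults when the argument is empty),
// and waits until the cluster reports it as enabled.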
func enableAutoprovisioning(resourceLimits string) error {
	By("Using API to enable autoprovisioning.")
	var body string
	if resourceLimits != "" {
		body = fmt.Sprintf(`{"update": {"desired_cluster_autoscaling": {"enable_node_autoprovisioning": true, %s}}}`, resourceLimits)
	} else {
		body = `{"update": {"desired_cluster_autoscaling": {"enable_node_autoprovisioning": true, "resource_limits":{"name":"cpu", "minimum":0, "maximum":100}, "resource_limits":{"name":"memory", "minimum":0, "maximum":10000000}}}}`
	}
	_, err := executeHTTPRequest(http.MethodPut, getGKEClusterURL("v1alpha1"), body)
	if err != nil {
		glog.Errorf("Request error: %s", err.Error())
		return err
	}
	glog.Infof("Wait for enabling autoprovisioning.")
	for start := time.Now(); time.Since(start) < gkeUpdateTimeout; time.Sleep(30 * time.Second) {
		enabled, err := isAutoprovisioningEnabled()
		if err != nil {
			glog.Errorf("Error: %s", err.Error())
			return err
		}
		if enabled {
			By("Autoprovisioning enabled.")
			return nil
		}
		glog.Infof("Waiting for enabling autoprovisioning")
	}
	return fmt.Errorf("autoprovisioning wasn't enabled (timeout)")
}

func disableAutoprovisioning() error {
	enabled, err := isAutoprovisioningEnabled()
	if err != nil {
		glog.Errorf("Error: %s", err.Error())
		return err
	}
	if !enabled {
		By("Autoprovisioning disabled.")
		return nil
	}
	By("Using API to disable autoprovisioning.")
	_, err = executeHTTPRequest(http.MethodPut, getGKEClusterURL("v1alpha1"), "{\"update\": {\"desired_cluster_autoscaling\": {}}}")
	if err != nil {
		glog.Errorf("Request error: %s", err.Error())
		return err
	}
	By("Wait for disabling autoprovisioning.")
	for start := time.Now(); time.Since(start) < gkeUpdateTimeout; time.Sleep(30 * time.Second) {
		enabled, err := isAutoprovisioningEnabled()
		if err != nil {
			glog.Errorf("Error: %s", err.Error())
			return err
		}
		if !enabled {
			By("Autoprovisioning disabled.")
			return nil
		}
		By("Waiting for disabling autoprovisioning")
	}
	return fmt.Errorf("autoprovisioning wasn't disabled (timeout)")
}

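// getNAPNodePools lists the node pools created by node autoprovisioning, identified by
// their "nap" name prefix in the gcloud node-pool listing.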

func getNAPNodePools() ([]string, error) {
	if framework.ProviderIs("gke") {
		output, err := exec.Command("gcloud", "container", "node-pools", "list",
			"--project="+framework.TestContext.CloudConfig.ProjectID,
			"--zone="+framework.TestContext.CloudConfig.Zone,
			"--cluster="+framework.TestContext.CloudConfig.Cluster).CombinedOutput()
		if err != nil {
			glog.Errorf("Failed to get instance groups: %v", string(output))
			return nil, err
		}
		re := regexp.MustCompile("nap.* ")
		lines := re.FindAllString(string(output), -1)
		for i, line := range lines {
			lines[i] = line[:strings.Index(line, " ")]
		}
		return lines, nil
	}
	return nil, fmt.Errorf("provider does not support NAP")
}

func removeNAPNodePools() error {
	By("Remove NAP node pools")
	pools, err := getNAPNodePools()
	if err != nil {
		return err
	}
	for _, pool := range pools {
		By("Remove node pool: " + pool)
		suffix := fmt.Sprintf("projects/%s/zones/%s/clusters/%s/nodePools/%s",
			framework.TestContext.CloudConfig.ProjectID,
			framework.TestContext.CloudConfig.Zone,
			framework.TestContext.CloudConfig.Cluster,
			pool)
		_, err := executeHTTPRequest(http.MethodDelete, getGKEURL("v1alpha1", suffix), "")
		if err != nil {
			glog.Errorf("Request error: %s", err.Error())
			return err
		}
	}
	err = waitTillAllNAPNodePoolsAreRemoved()
	if err != nil {
		glog.Errorf("Couldn't remove NAP groups: %s", err.Error())
	}
	return err
}

func getNAPNodePoolsNumber() int {
	groups, err := getNAPNodePools()
	framework.ExpectNoError(err)
	return len(groups)
}

func waitTillAllNAPNodePoolsAreRemoved() error {
	By("Wait till all NAP node pools are removed")
	err := wait.PollImmediate(5*time.Second, defaultTimeout, func() (bool, error) {
		return getNAPNodePoolsNumber() == 0, nil
	})
	return err
}
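
// napLifecycleSketch is a minimal, illustrative sketch (not called by the tests
// above) of how the NAP helpers are meant to be combined: enable autoprovisioning,
// create load that needs a new node pool, wait for a "nap" pool to appear, then
// tear everything down. The reservation id and sizes below are arbitrary
// assumptions for the example.
func napLifecycleSketch(f *framework.Framework) {
	framework.ExpectNoError(enableAutoprovisioning(""))
	defer func() {
		framework.ExpectNoError(removeNAPNodePools())
		framework.ExpectNoError(disableAutoprovisioning())
	}()

	// Reserve memory that does not fit on the existing nodes so NAP has to act.
	cleanup := ReserveMemory(f, "nap-memory-reservation", 2, 2048, false, defaultTimeout)
	defer func() {
		framework.ExpectNoError(cleanup())
	}()

	// Wait for node autoprovisioning to create at least one NAP node pool.
	framework.ExpectNoError(wait.PollImmediate(30*time.Second, scaleUpTimeout, func() (bool, error) {
		return getNAPNodePoolsNumber() > 0, nil
	}))
}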

func addNodePool(name string, machineType string, numNodes int) {
	output, err := execCmd("gcloud", "alpha", "container", "node-pools", "create", name, "--quiet",
		"--machine-type="+machineType,
		"--num-nodes="+strconv.Itoa(numNodes),
		"--project="+framework.TestContext.CloudConfig.ProjectID,
		"--zone="+framework.TestContext.CloudConfig.Zone,
		"--cluster="+framework.TestContext.CloudConfig.Cluster).CombinedOutput()
	glog.Infof("Creating node-pool %s: %s", name, output)
	framework.ExpectNoError(err)
}

func deleteNodePool(name string) {
	glog.Infof("Deleting node pool %s", name)
	output, err := execCmd("gcloud", "alpha", "container", "node-pools", "delete", name, "--quiet",
		"--project="+framework.TestContext.CloudConfig.ProjectID,
		"--zone="+framework.TestContext.CloudConfig.Zone,
		"--cluster="+framework.TestContext.CloudConfig.Cluster).CombinedOutput()
	if err != nil {
		glog.Infof("Error: %v", err)
	}
	glog.Infof("Node-pool deletion output: %s", output)
}

func getPoolNodes(f *framework.Framework, poolName string) []*v1.Node {
	nodes := make([]*v1.Node, 0, 1)
	nodeList := framework.GetReadySchedulableNodesOrDie(f.ClientSet)
	for i := range nodeList.Items {
		// Take the address of the slice element, not of the loop variable,
		// so each appended pointer refers to a distinct node.
		node := &nodeList.Items[i]
		if node.Labels[gkeNodepoolNameKey] == poolName {
			nodes = append(nodes, node)
		}
	}
	return nodes
}

func getPoolSize(f *framework.Framework, poolName string) int {
	size := 0
	nodeList := framework.GetReadySchedulableNodesOrDie(f.ClientSet)
	for _, node := range nodeList.Items {
		if node.Labels[gkeNodepoolNameKey] == poolName {
			size++
		}
	}
	return size
}

func doPut(url, content string) (string, error) {
	req, err := http.NewRequest("PUT", url, bytes.NewBuffer([]byte(content)))
	if err != nil {
		return "", err
	}
	req.Header.Set("Content-Type", "application/json")
	client := &http.Client{}
	resp, err := client.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return "", err
	}
	strBody := string(body)
	return strBody, nil
}

// ReserveMemoryWithSelector creates a replication controller whose pods use the
// given node selector and, in summation, request the specified amount of memory.
func ReserveMemoryWithSelector(f *framework.Framework, id string, replicas, megabytes int, expectRunning bool, timeout time.Duration, selector map[string]string) func() error {
	By(fmt.Sprintf("Running RC which reserves %v MB of memory", megabytes))
	request := int64(1024 * 1024 * megabytes / replicas)
	config := &testutils.RCConfig{
		Client:         f.ClientSet,
		InternalClient: f.InternalClientset,
		Name:           id,
		Namespace:      f.Namespace.Name,
		Timeout:        timeout,
		Image:          framework.GetPauseImageName(f.ClientSet),
		Replicas:       replicas,
		MemRequest:     request,
		NodeSelector:   selector,
	}
	for start := time.Now(); time.Since(start) < rcCreationRetryTimeout; time.Sleep(rcCreationRetryDelay) {
		err := framework.RunRC(*config)
		if err != nil && strings.Contains(err.Error(), "Error creating replication controller") {
			glog.Warningf("Failed to create memory reservation: %v", err)
			continue
		}
		if expectRunning {
			framework.ExpectNoError(err)
		}
		return func() error {
			return framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, f.Namespace.Name, id)
		}
	}
	framework.Failf("Failed to reserve memory within timeout")
	return nil
}

// ReserveMemory creates a replication controller with pods that, in summation,
// request the specified amount of memory.
func ReserveMemory(f *framework.Framework, id string, replicas, megabytes int, expectRunning bool, timeout time.Duration) func() error {
	return ReserveMemoryWithSelector(f, id, replicas, megabytes, expectRunning, timeout, nil)
}
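
// reserveMemoryUsageSketch is a minimal, illustrative sketch (not used by the
// tests above) of the intended calling pattern for ReserveMemory: the returned
// closure deletes the RC and its pods, so it is typically deferred. The id,
// replica count, and size here are arbitrary assumptions for the example.
func reserveMemoryUsageSketch(f *framework.Framework) {
	// Reserve ~1 GiB across 10 pods and expect them all to start running.
	cleanup := ReserveMemory(f, "memory-reservation", 10, 1024, true, scaleUpTimeout)
	defer func() {
		framework.ExpectNoError(cleanup())
	}()
	// ... the test body would assert on cluster size here ...
}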

// WaitForClusterSizeFunc waits until the number of schedulable, ready nodes satisfies the given size predicate.
func WaitForClusterSizeFunc(c clientset.Interface, sizeFunc func(int) bool, timeout time.Duration) error {
	return WaitForClusterSizeFuncWithUnready(c, sizeFunc, timeout, 0)
}

// WaitForClusterSizeFuncWithUnready waits until the number of schedulable nodes satisfies the given size predicate,
// expecting exactly expectedUnready of them to be not ready.
func WaitForClusterSizeFuncWithUnready(c clientset.Interface, sizeFunc func(int) bool, timeout time.Duration, expectedUnready int) error {
	for start := time.Now(); time.Since(start) < timeout; time.Sleep(20 * time.Second) {
		nodes, err := c.CoreV1().Nodes().List(metav1.ListOptions{FieldSelector: fields.Set{
			"spec.unschedulable": "false",
		}.AsSelector().String()})
		if err != nil {
			glog.Warningf("Failed to list nodes: %v", err)
			continue
		}
		numNodes := len(nodes.Items)
		// Filter out not-ready nodes.
		framework.FilterNodes(nodes, func(node v1.Node) bool {
			return framework.IsNodeConditionSetAsExpected(&node, v1.NodeReady, true)
		})
		numReady := len(nodes.Items)
		if numNodes == numReady+expectedUnready && sizeFunc(numNodes) {
			glog.Infof("Cluster has reached the desired size")
			return nil
		}
		glog.Infof("Waiting for cluster with func, current size %d, not ready nodes %d", numNodes, numNodes-numReady)
	}
	return fmt.Errorf("timeout waiting %v for appropriate cluster size", timeout)
}
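
// waitForOriginalSizeSketch is a minimal, illustrative sketch (not called by the
// tests above) showing the predicate-closure pattern WaitForClusterSizeFunc
// expects; originalNodeCount is an assumed value captured by the caller before
// a scale-down test runs.
func waitForOriginalSizeSketch(c clientset.Interface, originalNodeCount int) {
	framework.ExpectNoError(WaitForClusterSizeFunc(c,
		func(size int) bool { return size <= originalNodeCount }, scaleDownTimeout))
}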

func waitForCaPodsReadyInNamespace(f *framework.Framework, c clientset.Interface, tolerateUnreadyCount int) error {
	var notready []string
	for start := time.Now(); time.Now().Before(start.Add(scaleUpTimeout)); time.Sleep(20 * time.Second) {
		pods, err := c.CoreV1().Pods(f.Namespace.Name).List(metav1.ListOptions{})
		if err != nil {
			return fmt.Errorf("failed to get pods: %v", err)
		}
		notready = make([]string, 0)
		for _, pod := range pods.Items {
			ready := false
			for _, cond := range pod.Status.Conditions {
				if cond.Type == v1.PodReady && cond.Status == v1.ConditionTrue {
					ready = true
				}
			}
			// Failed pods in this context generally mean that they have been
			// double scheduled onto a node, but then failed a constraint check.
			if pod.Status.Phase == v1.PodFailed {
				glog.Warningf("Pod has failed: %v", pod)
			}
			if !ready && pod.Status.Phase != v1.PodFailed {
				notready = append(notready, pod.Name)
			}
		}
		if len(notready) <= tolerateUnreadyCount {
			glog.Infof("sufficient number of pods ready. Tolerating %d unready", tolerateUnreadyCount)
			return nil
		}
		glog.Infof("Too many pods are not ready yet: %v", notready)
	}
	glog.Info("Timeout on waiting for pods being ready")
	glog.Info(framework.RunKubectlOrDie("get", "pods", "-o", "json", "--all-namespaces"))
	glog.Info(framework.RunKubectlOrDie("get", "nodes", "-o", "json"))

	// Some pods are still not running.
	return fmt.Errorf("too many pods are still not running: %v", notready)
}

func waitForAllCaPodsReadyInNamespace(f *framework.Framework, c clientset.Interface) error {
	return waitForCaPodsReadyInNamespace(f, c, 0)
}

func getAnyNode(c clientset.Interface) *v1.Node {
	nodes, err := c.CoreV1().Nodes().List(metav1.ListOptions{FieldSelector: fields.Set{
		"spec.unschedulable": "false",
	}.AsSelector().String()})
	if err != nil {
		glog.Errorf("Failed to get node list: %v", err)
		return nil
	}
	if len(nodes.Items) == 0 {
		glog.Errorf("No nodes")
		return nil
	}
	return &nodes.Items[0]
}

func setMigSizes(sizes map[string]int) bool {
	madeChanges := false
	for mig, desiredSize := range sizes {
		currentSize, err := framework.GroupSize(mig)
		framework.ExpectNoError(err)
		if desiredSize != currentSize {
			By(fmt.Sprintf("Setting size of %s to %d", mig, desiredSize))
			err = framework.ResizeGroup(mig, int32(desiredSize))
			framework.ExpectNoError(err)
			madeChanges = true
		}
	}
	return madeChanges
}

func drainNode(f *framework.Framework, node *v1.Node) {
	By("Make the single node unschedulable")
	makeNodeUnschedulable(f.ClientSet, node)
	By("Manually drain the single node")
	podOpts := metav1.ListOptions{FieldSelector: fields.OneTermEqualSelector(api.PodHostField, node.Name).String()}
	pods, err := f.ClientSet.CoreV1().Pods(metav1.NamespaceAll).List(podOpts)
	framework.ExpectNoError(err)
	for _, pod := range pods.Items {
		err = f.ClientSet.CoreV1().Pods(pod.Namespace).Delete(pod.Name, metav1.NewDeleteOptions(0))
		framework.ExpectNoError(err)
	}
}

func makeNodeUnschedulable(c clientset.Interface, node *v1.Node) error {
	By(fmt.Sprintf("Taint node %s", node.Name))
	for j := 0; j < 3; j++ {
		freshNode, err := c.CoreV1().Nodes().Get(node.Name, metav1.GetOptions{})
		if err != nil {
			return err
		}
		for _, taint := range freshNode.Spec.Taints {
			if taint.Key == disabledTaint {
				return nil
			}
		}
		freshNode.Spec.Taints = append(freshNode.Spec.Taints, v1.Taint{
			Key:    disabledTaint,
			Value:  "DisabledForTest",
			Effect: v1.TaintEffectNoSchedule,
		})
		_, err = c.CoreV1().Nodes().Update(freshNode)
		if err == nil {
			return nil
		}
		if !errors.IsConflict(err) {
			return err
		}
		glog.Warningf("Got 409 conflict when trying to taint node, retries left: %v", 3-j)
	}
	return fmt.Errorf("Failed to taint node in allowed number of retries")
}

// CriticalAddonsOnlyError implements the `error` interface, and signifies the
// presence of the `CriticalAddonsOnly` taint on the node.
type CriticalAddonsOnlyError struct{}

func (CriticalAddonsOnlyError) Error() string {
	return "CriticalAddonsOnly taint found on node"
}

func makeNodeSchedulable(c clientset.Interface, node *v1.Node, failOnCriticalAddonsOnly bool) error {
	By(fmt.Sprintf("Remove taint from node %s", node.Name))
	for j := 0; j < 3; j++ {
		freshNode, err := c.CoreV1().Nodes().Get(node.Name, metav1.GetOptions{})
		if err != nil {
			return err
		}
		var newTaints []v1.Taint
		for _, taint := range freshNode.Spec.Taints {
			if failOnCriticalAddonsOnly && taint.Key == criticalAddonsOnlyTaint {
				return CriticalAddonsOnlyError{}
			}
			if taint.Key != disabledTaint {
				newTaints = append(newTaints, taint)
			}
		}
		if len(newTaints) == len(freshNode.Spec.Taints) {
			return nil
		}
		freshNode.Spec.Taints = newTaints
		_, err = c.CoreV1().Nodes().Update(freshNode)
		if err == nil {
			return nil
		}
		if !errors.IsConflict(err) {
			return err
		}
		glog.Warningf("Got 409 conflict when trying to remove taint from node, retries left: %v", 3-j)
	}
	return fmt.Errorf("Failed to remove taint from node in allowed number of retries")
}
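
// makeSchedulableSketch is a minimal, illustrative sketch (not called by the
// tests above) of how callers can distinguish the sentinel CriticalAddonsOnlyError
// from other failures when untainting a node; the node value is assumed to come
// from the caller.
func makeSchedulableSketch(c clientset.Interface, node *v1.Node) {
	err := makeNodeSchedulable(c, node, true)
	if _, isCriticalAddonsOnly := err.(CriticalAddonsOnlyError); isCriticalAddonsOnly {
		// The node is reserved for critical addons; treat this as expected and skip it.
		glog.Infof("Node %s has the CriticalAddonsOnly taint, skipping", node.Name)
		return
	}
	framework.ExpectNoError(err)
}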

// Create an RC running a given number of pods with anti-affinity
func runAntiAffinityPods(f *framework.Framework, namespace string, pods int, id string, podLabels, antiAffinityLabels map[string]string) error {
	config := &testutils.RCConfig{
		Affinity:       buildAntiAffinity(antiAffinityLabels),
		Client:         f.ClientSet,
		InternalClient: f.InternalClientset,
		Name:           id,
		Namespace:      namespace,
		Timeout:        scaleUpTimeout,
		Image:          framework.GetPauseImageName(f.ClientSet),
		Replicas:       pods,
		Labels:         podLabels,
	}
	err := framework.RunRC(*config)
	if err != nil {
		return err
	}
	_, err = f.ClientSet.CoreV1().ReplicationControllers(namespace).Get(id, metav1.GetOptions{})
	if err != nil {
		return err
	}
	return nil
}

func runVolumeAntiAffinityPods(f *framework.Framework, namespace string, pods int, id string, podLabels, antiAffinityLabels map[string]string, volumes []v1.Volume) error {
	config := &testutils.RCConfig{
		Affinity:       buildAntiAffinity(antiAffinityLabels),
		Volumes:        volumes,
		Client:         f.ClientSet,
		InternalClient: f.InternalClientset,
		Name:           id,
		Namespace:      namespace,
		Timeout:        scaleUpTimeout,
		Image:          framework.GetPauseImageName(f.ClientSet),
		Replicas:       pods,
		Labels:         podLabels,
	}
	err := framework.RunRC(*config)
	if err != nil {
		return err
	}
	_, err = f.ClientSet.CoreV1().ReplicationControllers(namespace).Get(id, metav1.GetOptions{})
	if err != nil {
		return err
	}
	return nil
}

var emptyDirVolumes = []v1.Volume{
	{
		Name: "empty-volume",
		VolumeSource: v1.VolumeSource{
			EmptyDir: &v1.EmptyDirVolumeSource{},
		},
	},
}

func buildVolumes(pv *v1.PersistentVolume, pvc *v1.PersistentVolumeClaim) []v1.Volume {
	return []v1.Volume{
		{
			Name: pv.Name,
			VolumeSource: v1.VolumeSource{
				PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{
					ClaimName: pvc.Name,
					ReadOnly:  false,
				},
			},
		},
	}
}

func buildAntiAffinity(labels map[string]string) *v1.Affinity {
	return &v1.Affinity{
		PodAntiAffinity: &v1.PodAntiAffinity{
			RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{
				{
					LabelSelector: &metav1.LabelSelector{
						MatchLabels: labels,
					},
					TopologyKey: "kubernetes.io/hostname",
				},
			},
		},
	}
}

// Create an RC running a given number of pods on each node without adding any constraint forcing
// such pod distribution. This is meant to create a bunch of underutilized (but not unused) nodes
// with pods that can be rescheduled on different nodes.
// This is achieved using the following method:
// 1. disable scheduling on each node
// 2. create an empty RC
// 3. for each node:
// 3a. enable scheduling on that node
// 3b. increase number of replicas in RC by podsPerNode
func runReplicatedPodOnEachNode(f *framework.Framework, nodes []v1.Node, namespace string, podsPerNode int, id string, labels map[string]string, memRequest int64) error {
	By("Run a pod on each node")
	for _, node := range nodes {
		err := makeNodeUnschedulable(f.ClientSet, &node)
		defer func(n v1.Node) {
			makeNodeSchedulable(f.ClientSet, &n, false)
		}(node)
		if err != nil {
			return err
		}
	}
	config := &testutils.RCConfig{
		Client:         f.ClientSet,
		InternalClient: f.InternalClientset,
		Name:           id,
		Namespace:      namespace,
		Timeout:        defaultTimeout,
		Image:          framework.GetPauseImageName(f.ClientSet),
		Replicas:       0,
		Labels:         labels,
		MemRequest:     memRequest,
	}
	err := framework.RunRC(*config)
	if err != nil {
		return err
	}
	rc, err := f.ClientSet.CoreV1().ReplicationControllers(namespace).Get(id, metav1.GetOptions{})
	if err != nil {
		return err
	}
	for i, node := range nodes {
		err = makeNodeSchedulable(f.ClientSet, &node, false)
		if err != nil {
			return err
		}
		// Update replicas count, to create new pods that will be allocated on node
		// (we retry 409 errors in case rc reference got out of sync)
		for j := 0; j < 3; j++ {
			*rc.Spec.Replicas = int32((i + 1) * podsPerNode)
			rc, err = f.ClientSet.CoreV1().ReplicationControllers(namespace).Update(rc)
			if err == nil {
				break
			}
			if !errors.IsConflict(err) {
				return err
			}
			glog.Warningf("Got 409 conflict when trying to scale RC, retries left: %v", 3-j)
			rc, err = f.ClientSet.CoreV1().ReplicationControllers(namespace).Get(id, metav1.GetOptions{})
			if err != nil {
				return err
			}
		}

		err = wait.PollImmediate(5*time.Second, podTimeout, func() (bool, error) {
			rc, err = f.ClientSet.CoreV1().ReplicationControllers(namespace).Get(id, metav1.GetOptions{})
			if err != nil || rc.Status.ReadyReplicas < int32((i+1)*podsPerNode) {
				return false, nil
			}
			return true, nil
		})
		if err != nil {
			return fmt.Errorf("failed to coerce RC into spawning a pod on node %s within timeout", node.Name)
		}
		err = makeNodeUnschedulable(f.ClientSet, &node)
		if err != nil {
			return err
		}
	}
	return nil
}

// runReplicatedPodOnEachNodeWithCleanup wraps runReplicatedPodOnEachNode and returns a cleanup function.
func runReplicatedPodOnEachNodeWithCleanup(f *framework.Framework, nodes []v1.Node, namespace string, podsPerNode int, id string, labels map[string]string, memRequest int64) (func(), error) {
	err := runReplicatedPodOnEachNode(f, nodes, namespace, podsPerNode, id, labels, memRequest)
	return func() {
		framework.DeleteRCAndPods(f.ClientSet, f.InternalClientset, namespace, id)
	}, err
}

// Increase cluster size by newNodesForScaledownTests to create some unused nodes
// that can be later removed by cluster autoscaler.
func manuallyIncreaseClusterSize(f *framework.Framework, originalSizes map[string]int) int {
	By("Manually increase cluster size")
	increasedSize := 0
	newSizes := make(map[string]int)
	for key, val := range originalSizes {
		newSizes[key] = val + newNodesForScaledownTests
		increasedSize += val + newNodesForScaledownTests
	}
	setMigSizes(newSizes)
	checkClusterSize := func(size int) bool {
		if size >= increasedSize {
			return true
		}
		resized := setMigSizes(newSizes)
		if resized {
			glog.Warning("Unexpected node group size while waiting for cluster resize. Setting size to target again.")
		}
		return false
	}
	framework.ExpectNoError(WaitForClusterSizeFunc(f.ClientSet, checkClusterSize, scaleUpTimeout))
	return increasedSize
}

// Try to get clusterwide health from CA status configmap.
// Status configmap is not parsing-friendly, so evil regexpery follows.
func getClusterwideStatus(c clientset.Interface) (string, error) {
	configMap, err := c.CoreV1().ConfigMaps("kube-system").Get("cluster-autoscaler-status", metav1.GetOptions{})
	if err != nil {
		return "", err
	}
	status, ok := configMap.Data["status"]
	if !ok {
		return "", fmt.Errorf("Status information not found in configmap")
	}
	matcher, err := regexp.Compile("Cluster-wide:\\s*\n\\s*Health:\\s*([A-Za-z]+)")
	if err != nil {
		return "", err
	}
	result := matcher.FindStringSubmatch(status)
	if len(result) < 2 {
		return "", fmt.Errorf("Failed to parse CA status configmap, raw status: %v", status)
	}
	return result[1], nil
}
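
// For reference, the regexp above expects the configmap's "status" text to
// contain a cluster-wide block shaped roughly like the following (an assumed,
// illustrative fragment, not captured from a real cluster), from which the
// health value ("Healthy" here) is extracted:
//
//	Cluster-wide:
//	  Health:      Healthy (...)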

type scaleUpStatus struct {
	status    string
	ready     int
	target    int
	timestamp time.Time
}

// Try to get timestamp from status.
// Status configmap is not parsing-friendly, so evil regexpery follows.
func getStatusTimestamp(status string) (time.Time, error) {
	timestampMatcher, err := regexp.Compile("Cluster-autoscaler status at \\s*([0-9\\-]+ [0-9]+:[0-9]+:[0-9]+\\.[0-9]+ \\+[0-9]+ [A-Za-z]+):")
	if err != nil {
		return time.Time{}, err
	}
	timestampMatch := timestampMatcher.FindStringSubmatch(status)
	if len(timestampMatch) < 2 {
		return time.Time{}, fmt.Errorf("Failed to parse CA status timestamp, raw status: %v", status)
	}
	timestamp, err := time.Parse(timestampFormat, timestampMatch[1])
	if err != nil {
		return time.Time{}, err
	}
	return timestamp, nil
}

// Try to get scaleup statuses of all node groups.
// Status configmap is not parsing-friendly, so evil regexpery follows.
func getScaleUpStatus(c clientset.Interface) (*scaleUpStatus, error) {
	configMap, err := c.CoreV1().ConfigMaps("kube-system").Get("cluster-autoscaler-status", metav1.GetOptions{})
	if err != nil {
		return nil, err
	}
	status, ok := configMap.Data["status"]
	if !ok {
		return nil, fmt.Errorf("Status information not found in configmap")
	}
	timestamp, err := getStatusTimestamp(status)
	if err != nil {
		return nil, err
	}
	matcher, err := regexp.Compile("\\s*ScaleUp:\\s*([A-Za-z]+)\\s*\\(ready=([0-9]+)\\s*cloudProviderTarget=([0-9]+)\\s*\\)")
	if err != nil {
		return nil, err
	}
	matches := matcher.FindAllStringSubmatch(status, -1)
	if len(matches) < 1 {
		return nil, fmt.Errorf("Failed to parse CA status configmap, raw status: %v", status)
	}

	result := scaleUpStatus{
		status:    caNoScaleUpStatus,
		ready:     0,
		target:    0,
		timestamp: timestamp,
	}
	for _, match := range matches {
		if match[1] == caOngoingScaleUpStatus {
			result.status = caOngoingScaleUpStatus
		}
		newReady, err := strconv.Atoi(match[2])
		if err != nil {
			return nil, err
		}
		result.ready += newReady
		newTarget, err := strconv.Atoi(match[3])
		if err != nil {
			return nil, err
		}
		result.target += newTarget
	}
	glog.Infof("Cluster-Autoscaler scale-up status: %v (%v, %v)", result.status, result.ready, result.target)
	return &result, nil
}

func waitForScaleUpStatus(c clientset.Interface, cond func(s *scaleUpStatus) bool, timeout time.Duration) (*scaleUpStatus, error) {
	var finalErr error
	var status *scaleUpStatus
	err := wait.PollImmediate(5*time.Second, timeout, func() (bool, error) {
		status, finalErr = getScaleUpStatus(c)
		if finalErr != nil {
			return false, nil
		}
		if status.timestamp.Add(freshStatusLimit).Before(time.Now()) {
			// stale status
			finalErr = fmt.Errorf("Status too old")
			return false, nil
		}
		return cond(status), nil
	})
	if err != nil {
		err = fmt.Errorf("Failed to find expected scale up status: %v, last status: %v, final err: %v", err, status, finalErr)
	}
	return status, err
}
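
// waitForOngoingScaleUpSketch is a minimal, illustrative sketch (not called by
// the tests above) of the condition-closure pattern waitForScaleUpStatus
// expects: block until the status configmap reports an in-progress scale-up.
func waitForOngoingScaleUpSketch(c clientset.Interface) {
	status, err := waitForScaleUpStatus(c, func(s *scaleUpStatus) bool {
		return s.status == caOngoingScaleUpStatus
	}, scaleUpTriggerTimeout)
	framework.ExpectNoError(err)
	glog.Infof("Scale-up in progress: ready=%d target=%d", status.ready, status.target)
}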

// This is a temporary fix to allow CA to migrate some kube-system pods
// TODO: Remove this when the PDB is added for some of those components
func addKubeSystemPdbs(f *framework.Framework) (func(), error) {
	By("Create PodDisruptionBudgets for kube-system components, so they can be migrated if required")

	var newPdbs []string
	cleanup := func() {
		var finalErr error
		for _, newPdbName := range newPdbs {
			By(fmt.Sprintf("Delete PodDisruptionBudget %v", newPdbName))
			err := f.ClientSet.Policy().PodDisruptionBudgets("kube-system").Delete(newPdbName, &metav1.DeleteOptions{})
			if err != nil {
				// log error, but attempt to remove other pdbs
				glog.Errorf("Failed to delete PodDisruptionBudget %v, err: %v", newPdbName, err)
				finalErr = err
			}
		}
		if finalErr != nil {
			framework.Failf("Error during PodDisruptionBudget cleanup: %v", finalErr)
		}
	}

	type pdbInfo struct {
		label        string
		minAvailable int
	}
	pdbsToAdd := []pdbInfo{
		{label: "kube-dns", minAvailable: 1},
		{label: "kube-dns-autoscaler", minAvailable: 0},
		{label: "metrics-server", minAvailable: 0},
		{label: "kubernetes-dashboard", minAvailable: 0},
		{label: "glbc", minAvailable: 0},
	}
	for _, pdbData := range pdbsToAdd {
		By(fmt.Sprintf("Create PodDisruptionBudget for %v", pdbData.label))
		labelMap := map[string]string{"k8s-app": pdbData.label}
		pdbName := fmt.Sprintf("test-pdb-for-%v", pdbData.label)
		minAvailable := intstr.FromInt(pdbData.minAvailable)
		pdb := &policy.PodDisruptionBudget{
			ObjectMeta: metav1.ObjectMeta{
				Name:      pdbName,
				Namespace: "kube-system",
			},
			Spec: policy.PodDisruptionBudgetSpec{
				Selector:     &metav1.LabelSelector{MatchLabels: labelMap},
				MinAvailable: &minAvailable,
			},
		}
		_, err := f.ClientSet.Policy().PodDisruptionBudgets("kube-system").Create(pdb)
		newPdbs = append(newPdbs, pdbName)
		if err != nil {
			return cleanup, err
		}
	}
	return cleanup, nil
}
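
// kubeSystemPdbsSketch is a minimal, illustrative sketch (not part of the tests
// above) of the intended calling pattern for addKubeSystemPdbs: the returned
// cleanup closure is valid even when an error is returned, so it should be
// deferred before the error is checked.
func kubeSystemPdbsSketch(f *framework.Framework) {
	cleanup, err := addKubeSystemPdbs(f)
	defer cleanup()
	framework.ExpectNoError(err)
	// ... scale-down assertions that rely on kube-system pods being movable ...
}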