2015-04-29 22:28:48 +00:00
/ *
2016-06-03 00:25:58 +00:00
Copyright 2015 The Kubernetes Authors .
2015-04-29 22:28:48 +00:00
Licensed under the Apache License , Version 2.0 ( the "License" ) ;
you may not use this file except in compliance with the License .
You may obtain a copy of the License at
http : //www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing , software
distributed under the License is distributed on an "AS IS" BASIS ,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND , either express or implied .
See the License for the specific language governing permissions and
limitations under the License .
* /
package e2e
import (
"fmt"
2015-11-08 05:53:36 +00:00
"sync"
2015-04-29 22:28:48 +00:00
"time"
2015-08-05 22:03:47 +00:00
"k8s.io/kubernetes/pkg/api"
2016-10-19 13:55:39 +00:00
clientset "k8s.io/kubernetes/pkg/client/clientset_generated/internalclientset"
2015-08-13 19:01:50 +00:00
client "k8s.io/kubernetes/pkg/client/unversioned"
2015-08-05 22:03:47 +00:00
"k8s.io/kubernetes/pkg/fields"
"k8s.io/kubernetes/pkg/labels"
2016-01-27 23:20:06 +00:00
"k8s.io/kubernetes/pkg/util/sets"
2016-04-07 17:21:31 +00:00
"k8s.io/kubernetes/test/e2e/framework"
2016-10-12 11:37:37 +00:00
testutils "k8s.io/kubernetes/test/utils"
2015-04-29 22:28:48 +00:00
. "github.com/onsi/ginkgo"
2015-11-08 05:53:36 +00:00
. "github.com/onsi/gomega"
2015-04-29 22:28:48 +00:00
)
const (
	// rebootNodeNotReadyTimeout bounds how long a node may take to go from
	// "Ready" to "NotReady" once a reboot has been issued; exceeding it fails
	// the test.
	rebootNodeNotReadyTimeout = 2 * time.Minute

	// rebootNodeReadyAgainTimeout bounds how long a node that was found
	// "NotReady" after the reboot may take to become "Ready" again; exceeding
	// it fails the test.
	rebootNodeReadyAgainTimeout = 5 * time.Minute

	// rebootPodReadyAgainTimeout bounds how long pods have to be "ready"
	// again after the reboot.
	rebootPodReadyAgainTimeout = 5 * time.Minute
)
2016-04-07 17:21:31 +00:00
var _ = framework . KubeDescribe ( "Reboot [Disruptive] [Feature:Reboot]" , func ( ) {
var f * framework . Framework
2015-05-21 21:32:57 +00:00
2015-04-29 22:28:48 +00:00
BeforeEach ( func ( ) {
2015-07-10 17:47:22 +00:00
// These tests requires SSH to nodes, so the provider check should be identical to there
2016-04-07 17:21:31 +00:00
// (the limiting factor is the implementation of util.go's framework.GetSigner(...)).
2015-06-22 21:14:54 +00:00
// Cluster must support node reboot
2016-04-07 17:21:31 +00:00
framework . SkipUnlessProviderIs ( framework . ProvidersWithSSH ... )
2015-04-29 22:28:48 +00:00
} )
2015-11-08 05:53:36 +00:00
AfterEach ( func ( ) {
if CurrentGinkgoTestDescription ( ) . Failed {
// Most of the reboot tests just make sure that addon/system pods are running, so dump
// events for the kube-system namespace on failures
namespaceName := api . NamespaceSystem
By ( fmt . Sprintf ( "Collecting events from namespace %q." , namespaceName ) )
2015-12-10 09:39:03 +00:00
events , err := f . Client . Events ( namespaceName ) . List ( api . ListOptions { } )
2015-11-08 05:53:36 +00:00
Expect ( err ) . NotTo ( HaveOccurred ( ) )
for _ , e := range events . Items {
2016-04-07 17:21:31 +00:00
framework . Logf ( "event for %v: %v %v: %v" , e . InvolvedObject . Name , e . Source , e . Reason , e . Message )
2015-11-08 05:53:36 +00:00
}
}
2016-01-11 23:30:23 +00:00
// In GKE, our current tunneling setup has the potential to hold on to a broken tunnel (from a
// rebooted/deleted node) for up to 5 minutes before all tunnels are dropped and recreated. Most tests
// make use of some proxy feature to verify functionality. So, if a reboot test runs right before a test
// that tries to get logs, for example, we may get unlucky and try to use a closed tunnel to a node that
2016-04-07 17:21:31 +00:00
// was recently rebooted. There's no good way to framework.Poll for proxies being closed, so we sleep.
2016-01-11 23:30:23 +00:00
//
// TODO(cjcullen) reduce this sleep (#19314)
2016-04-07 17:21:31 +00:00
if framework . ProviderIs ( "gke" ) {
2016-01-11 23:30:23 +00:00
By ( "waiting 5 minutes for all dead tunnels to be dropped" )
time . Sleep ( 5 * time . Minute )
}
2015-11-08 05:53:36 +00:00
} )
2016-04-07 17:21:31 +00:00
f = framework . NewDefaultFramework ( "reboot" )
2015-11-08 05:53:36 +00:00
2015-05-25 14:15:27 +00:00
It ( "each node by ordering clean reboot and ensure they function upon restart" , func ( ) {
// clean shutdown and restart
2015-06-02 06:05:20 +00:00
// We sleep 10 seconds to give some time for ssh command to cleanly finish before the node is rebooted.
2016-10-19 13:55:39 +00:00
testReboot ( f . Client , f . ClientSet , "nohup sh -c 'sleep 10 && sudo reboot' >/dev/null 2>&1 &" )
2015-05-25 14:15:27 +00:00
} )
2015-04-29 22:28:48 +00:00
2015-05-25 14:15:27 +00:00
It ( "each node by ordering unclean reboot and ensure they function upon restart" , func ( ) {
// unclean shutdown and restart
2015-06-02 06:05:20 +00:00
// We sleep 10 seconds to give some time for ssh command to cleanly finish before the node is shutdown.
2016-10-19 13:55:39 +00:00
testReboot ( f . Client , f . ClientSet , "nohup sh -c 'sleep 10 && echo b | sudo tee /proc/sysrq-trigger' >/dev/null 2>&1 &" )
2015-05-25 14:15:27 +00:00
} )
2015-04-29 22:28:48 +00:00
2015-05-25 14:15:27 +00:00
It ( "each node by triggering kernel panic and ensure they function upon restart" , func ( ) {
// kernel panic
2015-06-02 06:05:20 +00:00
// We sleep 10 seconds to give some time for ssh command to cleanly finish before kernel panic is triggered.
2016-10-19 13:55:39 +00:00
testReboot ( f . Client , f . ClientSet , "nohup sh -c 'sleep 10 && echo c | sudo tee /proc/sysrq-trigger' >/dev/null 2>&1 &" )
2015-05-25 14:15:27 +00:00
} )
It ( "each node by switching off the network interface and ensure they function upon switch on" , func ( ) {
// switch the network interface off for a while to simulate a network outage
2015-06-02 06:05:20 +00:00
// We sleep 10 seconds to give some time for ssh command to cleanly finish before network is down.
2016-10-19 13:55:39 +00:00
testReboot ( f . Client , f . ClientSet , "nohup sh -c 'sleep 10 && (sudo ifdown eth0 || sudo ip link set eth0 down) && sleep 120 && (sudo ifup eth0 || sudo ip link set eth0 up)' >/dev/null 2>&1 &" )
2015-05-25 14:15:27 +00:00
} )
2015-06-02 06:05:20 +00:00
It ( "each node by dropping all inbound packets for a while and ensure they function afterwards" , func ( ) {
2015-05-25 14:15:27 +00:00
// tell the firewall to drop all inbound packets for a while
2015-06-02 06:05:20 +00:00
// We sleep 10 seconds to give some time for ssh command to cleanly finish before starting dropping inbound packets.
2015-07-03 08:27:02 +00:00
// We still accept packages send from localhost to prevent monit from restarting kubelet.
2016-10-19 13:55:39 +00:00
testReboot ( f . Client , f . ClientSet , "nohup sh -c 'sleep 10 && sudo iptables -I INPUT 1 -s 127.0.0.1 -j ACCEPT && sudo iptables -I INPUT 2 -j DROP && " +
2015-07-03 08:27:02 +00:00
" sleep 120 && sudo iptables -D INPUT -j DROP && sudo iptables -D INPUT -s 127.0.0.1 -j ACCEPT' >/dev/null 2>&1 &" )
2015-05-25 14:15:27 +00:00
} )
2015-06-02 06:05:20 +00:00
It ( "each node by dropping all outbound packets for a while and ensure they function afterwards" , func ( ) {
2015-05-25 14:15:27 +00:00
// tell the firewall to drop all outbound packets for a while
2015-06-02 06:05:20 +00:00
// We sleep 10 seconds to give some time for ssh command to cleanly finish before starting dropping outbound packets.
2015-07-03 08:27:02 +00:00
// We still accept packages send to localhost to prevent monit from restarting kubelet.
2016-10-19 13:55:39 +00:00
testReboot ( f . Client , f . ClientSet , "nohup sh -c 'sleep 10 && sudo iptables -I OUTPUT 1 -s 127.0.0.1 -j ACCEPT && sudo iptables -I OUTPUT 2 -j DROP && " +
2015-07-03 08:27:02 +00:00
" sleep 120 && sudo iptables -D OUTPUT -j DROP && sudo iptables -D OUTPUT -s 127.0.0.1 -j ACCEPT' >/dev/null 2>&1 &" )
2015-04-29 22:28:48 +00:00
} )
} )
2016-10-19 13:55:39 +00:00
func testReboot ( c * client . Client , cs clientset . Interface , rebootCmd string ) {
2015-05-25 14:15:27 +00:00
// Get all nodes, and kick off the test on each.
2016-10-19 13:55:39 +00:00
nodelist := framework . GetReadySchedulableNodesOrDie ( cs )
2015-11-08 05:53:36 +00:00
result := make ( [ ] bool , len ( nodelist . Items ) )
wg := sync . WaitGroup { }
wg . Add ( len ( nodelist . Items ) )
2015-05-25 14:15:27 +00:00
failed := false
2015-11-08 05:53:36 +00:00
for ix := range nodelist . Items {
go func ( ix int ) {
defer wg . Done ( )
n := nodelist . Items [ ix ]
2016-04-07 17:21:31 +00:00
result [ ix ] = rebootNode ( c , framework . TestContext . Provider , n . ObjectMeta . Name , rebootCmd )
2015-11-08 05:53:36 +00:00
if ! result [ ix ] {
failed = true
}
} ( ix )
2015-05-25 14:15:27 +00:00
}
2015-11-08 05:53:36 +00:00
// Wait for all to finish and check the final result.
wg . Wait ( )
2015-05-25 14:15:27 +00:00
if failed {
2015-11-08 05:53:36 +00:00
for ix := range nodelist . Items {
n := nodelist . Items [ ix ]
if ! result [ ix ] {
2016-04-07 17:21:31 +00:00
framework . Logf ( "Node %s failed reboot test." , n . ObjectMeta . Name )
2015-11-08 05:53:36 +00:00
}
}
2016-04-07 17:21:31 +00:00
framework . Failf ( "Test failed; at least one node failed to reboot in the time given." )
2015-05-25 14:15:27 +00:00
}
}
2016-01-27 23:20:06 +00:00
func printStatusAndLogsForNotReadyPods ( c * client . Client , ns string , podNames [ ] string , pods [ ] * api . Pod ) {
2016-01-25 19:36:44 +00:00
printFn := func ( id , log string , err error , previous bool ) {
prefix := "Retrieving log for container"
if previous {
prefix = "Retrieving log for the last terminated container"
}
if err != nil {
2016-04-07 17:21:31 +00:00
framework . Logf ( "%s %s, err: %v:\n%s\n" , prefix , id , err , log )
2016-01-25 19:36:44 +00:00
} else {
2016-04-07 17:21:31 +00:00
framework . Logf ( "%s %s:\n%s\n" , prefix , id , log )
2016-01-25 19:36:44 +00:00
}
}
2016-01-27 23:20:06 +00:00
podNameSet := sets . NewString ( podNames ... )
for _ , p := range pods {
if p . Namespace != ns {
continue
}
if ! podNameSet . Has ( p . Name ) {
continue
}
2016-10-12 11:37:37 +00:00
if ok , _ := testutils . PodRunningReady ( p ) ; ok {
2016-01-27 23:20:06 +00:00
continue
}
2016-04-07 17:21:31 +00:00
framework . Logf ( "Status for not ready pod %s/%s: %+v" , p . Namespace , p . Name , p . Status )
2016-01-27 23:20:06 +00:00
// Print the log of the containers if pod is not running and ready.
for _ , container := range p . Status . ContainerStatuses {
cIdentifer := fmt . Sprintf ( "%s/%s/%s" , p . Namespace , p . Name , container . Name )
2016-04-07 17:21:31 +00:00
log , err := framework . GetPodLogs ( c , p . Namespace , p . Name , container . Name )
2016-01-27 23:20:06 +00:00
printFn ( cIdentifer , log , err , false )
// Get log from the previous container.
if container . RestartCount > 0 {
printFn ( cIdentifer , log , err , true )
2016-01-25 19:36:44 +00:00
}
}
}
}
2015-04-29 22:28:48 +00:00
// rebootNode takes node name on provider through the following steps using c:
// - ensures the node is ready
// - ensures all pods on the node are running and ready
2015-05-25 14:15:27 +00:00
// - reboots the node (by executing rebootCmd over ssh)
2015-04-29 22:28:48 +00:00
// - ensures the node reaches some non-ready state
// - ensures the node becomes ready again
// - ensures all pods on the node become running and ready again
//
// It returns true through result only if all of the steps pass; at the first
// failed step, it will return false through result and not run the rest.
2015-11-08 05:53:36 +00:00
func rebootNode ( c * client . Client , provider , name , rebootCmd string ) bool {
2015-05-21 21:43:42 +00:00
// Setup
2015-07-10 21:01:45 +00:00
ns := api . NamespaceSystem
2016-10-12 11:37:37 +00:00
ps := testutils . NewPodStore ( c , ns , labels . Everything ( ) , fields . OneTermEqualSelector ( api . PodHostField , name ) )
2015-05-21 21:43:42 +00:00
defer ps . Stop ( )
2015-04-29 22:28:48 +00:00
// Get the node initially.
2016-04-07 17:21:31 +00:00
framework . Logf ( "Getting %s" , name )
2015-04-29 22:28:48 +00:00
node , err := c . Nodes ( ) . Get ( name )
if err != nil {
2016-04-07 17:21:31 +00:00
framework . Logf ( "Couldn't get node %s" , name )
2015-11-08 05:53:36 +00:00
return false
2015-04-29 22:28:48 +00:00
}
// Node sanity check: ensure it is "ready".
2016-04-07 17:21:31 +00:00
if ! framework . WaitForNodeToBeReady ( c , name , framework . NodeReadyInitialTimeout ) {
2015-11-08 05:53:36 +00:00
return false
2015-04-29 22:28:48 +00:00
}
2015-06-30 14:57:44 +00:00
// Get all the pods on the node that don't have liveness probe set.
// Liveness probe may cause restart of a pod during node reboot, and the pod may not be running.
2015-05-21 21:43:42 +00:00
pods := ps . List ( )
2015-06-30 14:57:44 +00:00
podNames := [ ] string { }
for _ , p := range pods {
probe := false
for _ , c := range p . Spec . Containers {
if c . LivenessProbe != nil {
probe = true
break
}
}
if ! probe {
podNames = append ( podNames , p . ObjectMeta . Name )
}
2015-04-29 22:28:48 +00:00
}
2016-05-25 16:29:50 +00:00
framework . Logf ( "Node %s has %d assigned pods with no liveness probes: %v" , name , len ( podNames ) , podNames )
2015-04-29 22:28:48 +00:00
// For each pod, we do a sanity check to ensure it's running / healthy
2016-05-25 16:29:50 +00:00
// or succeeded now, as that's what we'll be checking later.
if ! framework . CheckPodsRunningReadyOrSucceeded ( c , ns , podNames , framework . PodReadyBeforeTimeout ) {
2016-01-27 23:20:06 +00:00
printStatusAndLogsForNotReadyPods ( c , ns , podNames , pods )
2015-11-08 05:53:36 +00:00
return false
2015-04-29 22:28:48 +00:00
}
// Reboot the node.
2016-04-07 17:21:31 +00:00
if err = framework . IssueSSHCommand ( rebootCmd , provider , node ) ; err != nil {
framework . Logf ( "Error while issuing ssh command: %v" , err )
2015-11-08 05:53:36 +00:00
return false
2015-04-29 22:28:48 +00:00
}
// Wait for some kind of "not ready" status.
2016-04-07 17:21:31 +00:00
if ! framework . WaitForNodeToBeNotReady ( c , name , rebootNodeNotReadyTimeout ) {
2015-11-08 05:53:36 +00:00
return false
2015-04-29 22:28:48 +00:00
}
// Wait for some kind of "ready" status.
2016-04-07 17:21:31 +00:00
if ! framework . WaitForNodeToBeReady ( c , name , rebootNodeReadyAgainTimeout ) {
2015-11-08 05:53:36 +00:00
return false
2015-04-29 22:28:48 +00:00
}
// Ensure all of the pods that we found on this node before the reboot are
2016-05-25 16:29:50 +00:00
// running / healthy, or succeeded.
if ! framework . CheckPodsRunningReadyOrSucceeded ( c , ns , podNames , rebootPodReadyAgainTimeout ) {
2016-01-25 19:36:44 +00:00
newPods := ps . List ( )
2016-01-27 23:20:06 +00:00
printStatusAndLogsForNotReadyPods ( c , ns , podNames , newPods )
2015-11-08 05:53:36 +00:00
return false
2015-04-29 22:28:48 +00:00
}
2016-04-07 17:21:31 +00:00
framework . Logf ( "Reboot successful on node %s" , name )
2015-11-08 05:53:36 +00:00
return true
2015-04-29 22:28:48 +00:00
}