2015-04-29 22:28:48 +00:00
|
|
|
/*
|
|
|
|
Copyright 2015 The Kubernetes Authors All rights reserved.
|
|
|
|
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
you may not use this file except in compliance with the License.
|
|
|
|
You may obtain a copy of the License at
|
|
|
|
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
See the License for the specific language governing permissions and
|
|
|
|
limitations under the License.
|
|
|
|
*/
|
|
|
|
|
|
|
|
package e2e
|
|
|
|
|
|
|
|
import (
|
|
|
|
"fmt"
|
|
|
|
"time"
|
|
|
|
|
|
|
|
"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
|
|
|
|
"github.com/GoogleCloudPlatform/kubernetes/pkg/client"
|
|
|
|
"github.com/GoogleCloudPlatform/kubernetes/pkg/fields"
|
|
|
|
"github.com/GoogleCloudPlatform/kubernetes/pkg/labels"
|
|
|
|
|
|
|
|
. "github.com/onsi/ginkgo"
|
|
|
|
. "github.com/onsi/gomega"
|
|
|
|
)
|
|
|
|
|
|
|
|
// Timeouts governing the per-node reboot test in this file.
const (
	// How long a node is allowed to go from "Ready" to "NotReady" after a
	// reboot is issued before the test is considered failed.
	rebootNodeNotReadyTimeout = 2 * time.Minute

	// How long a node is allowed to go from "NotReady" to "Ready" after a
	// reboot is issued and it is found to be "NotReady" before the test is
	// considered failed.
	rebootNodeReadyAgainTimeout = 5 * time.Minute

	// How long pods have to be "ready" after the reboot.
	rebootPodReadyAgainTimeout = 5 * time.Minute
)
|
|
|
|
|
|
|
|
// The "Reboot" suite disrupts every node in the cluster in a different way
// (clean/unclean reboot, kernel panic, temporary network outage, dropped
// inbound/outbound packets) and relies on testReboot to verify that each
// node and its pods recover afterwards.
var _ = Describe("Reboot", func() {
	var c *client.Client

	BeforeEach(func() {
		var err error
		c, err = loadClient()
		Expect(err).NotTo(HaveOccurred())

		// These tests requires SSH, so the provider check should be identical to there
		// (the limiting factor is the implementation of util.go's getSigner(...)).

		// Cluster must support node reboot
		SkipUnlessProviderIs("gce", "aws")
	})

	It("each node by ordering clean reboot and ensure they function upon restart", func() {
		// clean shutdown and restart
		// We sleep 10 seconds to give some time for ssh command to cleanly finish before the node is rebooted.
		testReboot(c, "nohup sh -c 'sleep 10 && sudo reboot' >/dev/null 2>&1 &")
	})

	It("each node by ordering unclean reboot and ensure they function upon restart", func() {
		// unclean shutdown and restart
		// We sleep 10 seconds to give some time for ssh command to cleanly finish before the node is shutdown.
		testReboot(c, "nohup sh -c 'sleep 10 && echo b | sudo tee /proc/sysrq-trigger' >/dev/null 2>&1 &")
	})

	It("each node by triggering kernel panic and ensure they function upon restart", func() {
		// kernel panic
		// We sleep 10 seconds to give some time for ssh command to cleanly finish before kernel panic is triggered.
		testReboot(c, "nohup sh -c 'sleep 10 && echo c | sudo tee /proc/sysrq-trigger' >/dev/null 2>&1 &")
	})

	It("each node by switching off the network interface and ensure they function upon switch on", func() {
		// switch the network interface off for a while to simulate a network outage
		// We sleep 10 seconds to give some time for ssh command to cleanly finish before network is down.
		testReboot(c, "nohup sh -c 'sleep 10 && sudo ifdown eth0 && sleep 120 && sudo ifup eth0' >/dev/null 2>&1 &")
	})

	It("each node by dropping all inbound packets for a while and ensure they function afterwards", func() {
		// tell the firewall to drop all inbound packets for a while
		// We sleep 10 seconds to give some time for ssh command to cleanly finish before starting dropping inbound packets.
		// We still accept packages send from localhost to prevent monit from restarting kubelet.
		testReboot(c, "nohup sh -c 'sleep 10 && sudo iptables -A INPUT -s 127.0.0.1 -j ACCEPT && sudo iptables -A INPUT -j DROP && "+
			" sleep 120 && sudo iptables -D INPUT -j DROP && sudo iptables -D INPUT -s 127.0.0.1 -j ACCEPT' >/dev/null 2>&1 &")
	})

	It("each node by dropping all outbound packets for a while and ensure they function afterwards", func() {
		// tell the firewall to drop all outbound packets for a while
		// We sleep 10 seconds to give some time for ssh command to cleanly finish before starting dropping outbound packets.
		// We still accept packages send to localhost to prevent monit from restarting kubelet.
		testReboot(c, "nohup sh -c 'sleep 10 && sudo iptables -A OUTPUT -s 127.0.0.1 -j ACCEPT && sudo iptables -A OUTPUT -j DROP && "+
			" sleep 120 && sudo iptables -D OUTPUT -j DROP && sudo iptables -D OUTPUT -s 127.0.0.1 -j ACCEPT' >/dev/null 2>&1 &")
	})
})
|
|
|
|
|
2015-05-25 14:15:27 +00:00
|
|
|
func testReboot(c *client.Client, rebootCmd string) {
|
|
|
|
// Get all nodes, and kick off the test on each.
|
2015-05-21 21:43:42 +00:00
|
|
|
nodelist, err := listNodes(c, labels.Everything(), fields.Everything())
|
2015-05-25 14:15:27 +00:00
|
|
|
if err != nil {
|
|
|
|
Failf("Error getting nodes: %v", err)
|
|
|
|
}
|
|
|
|
result := make(chan bool, len(nodelist.Items))
|
|
|
|
for _, n := range nodelist.Items {
|
2015-06-22 21:14:54 +00:00
|
|
|
go rebootNode(c, testContext.Provider, n.ObjectMeta.Name, rebootCmd, result)
|
2015-05-25 14:15:27 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Wait for all to finish and check the final result.
|
|
|
|
failed := false
|
|
|
|
// TODO(mbforbes): Change to `for range` syntax and remove logging once
|
|
|
|
// we support only Go >= 1.4.
|
|
|
|
for _, n := range nodelist.Items {
|
|
|
|
if !<-result {
|
|
|
|
Failf("Node %s failed reboot test.", n.ObjectMeta.Name)
|
|
|
|
failed = true
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if failed {
|
|
|
|
Failf("Test failed; at least one node failed to reboot in the time given.")
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func issueSSHCommand(node *api.Node, provider, cmd string) error {
|
|
|
|
Logf("Getting external IP address for %s", node.Name)
|
|
|
|
host := ""
|
|
|
|
for _, a := range node.Status.Addresses {
|
|
|
|
if a.Type == api.NodeExternalIP {
|
|
|
|
host = a.Address + ":22"
|
|
|
|
break
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if host == "" {
|
|
|
|
return fmt.Errorf("couldn't find external IP address for node %s", node.Name)
|
|
|
|
}
|
|
|
|
Logf("Calling %s on %s", cmd, node.Name)
|
|
|
|
if _, _, code, err := SSH(cmd, host, provider); code != 0 || err != nil {
|
|
|
|
return fmt.Errorf("when running %s on %s, got %d and %v", cmd, node.Name, code, err)
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2015-04-29 22:28:48 +00:00
|
|
|
// rebootNode takes node name on provider through the following steps using c:
|
|
|
|
// - ensures the node is ready
|
|
|
|
// - ensures all pods on the node are running and ready
|
2015-05-25 14:15:27 +00:00
|
|
|
// - reboots the node (by executing rebootCmd over ssh)
|
2015-04-29 22:28:48 +00:00
|
|
|
// - ensures the node reaches some non-ready state
|
|
|
|
// - ensures the node becomes ready again
|
|
|
|
// - ensures all pods on the node become running and ready again
|
|
|
|
//
|
|
|
|
// It returns true through result only if all of the steps pass; at the first
|
|
|
|
// failed step, it will return false through result and not run the rest.
|
2015-05-25 14:15:27 +00:00
|
|
|
func rebootNode(c *client.Client, provider, name, rebootCmd string, result chan bool) {
|
2015-05-21 21:43:42 +00:00
|
|
|
// Setup
|
2015-06-17 07:13:26 +00:00
|
|
|
ns := api.NamespaceDefault
|
|
|
|
ps := newPodStore(c, ns, labels.Everything(), fields.OneTermEqualSelector(client.PodHost, name))
|
2015-05-21 21:43:42 +00:00
|
|
|
defer ps.Stop()
|
|
|
|
|
2015-04-29 22:28:48 +00:00
|
|
|
// Get the node initially.
|
|
|
|
Logf("Getting %s", name)
|
|
|
|
node, err := c.Nodes().Get(name)
|
|
|
|
if err != nil {
|
2015-05-16 02:46:00 +00:00
|
|
|
Logf("Couldn't get node %s", name)
|
2015-04-29 22:28:48 +00:00
|
|
|
result <- false
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
// Node sanity check: ensure it is "ready".
|
|
|
|
if !waitForNodeToBeReady(c, name, nodeReadyInitialTimeout) {
|
|
|
|
result <- false
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
2015-06-30 14:57:44 +00:00
|
|
|
// Get all the pods on the node that don't have liveness probe set.
|
|
|
|
// Liveness probe may cause restart of a pod during node reboot, and the pod may not be running.
|
2015-05-21 21:43:42 +00:00
|
|
|
pods := ps.List()
|
2015-06-30 14:57:44 +00:00
|
|
|
podNames := []string{}
|
|
|
|
for _, p := range pods {
|
|
|
|
probe := false
|
|
|
|
for _, c := range p.Spec.Containers {
|
|
|
|
if c.LivenessProbe != nil {
|
|
|
|
probe = true
|
|
|
|
break
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if !probe {
|
|
|
|
podNames = append(podNames, p.ObjectMeta.Name)
|
|
|
|
}
|
2015-04-29 22:28:48 +00:00
|
|
|
}
|
|
|
|
Logf("Node %s has %d pods: %v", name, len(podNames), podNames)
|
|
|
|
|
|
|
|
// For each pod, we do a sanity check to ensure it's running / healthy
|
|
|
|
// now, as that's what we'll be checking later.
|
2015-06-17 07:13:26 +00:00
|
|
|
if !checkPodsRunningReady(c, ns, podNames, podReadyBeforeTimeout) {
|
2015-04-29 22:28:48 +00:00
|
|
|
result <- false
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
// Reboot the node.
|
2015-05-25 14:15:27 +00:00
|
|
|
if err = issueSSHCommand(node, provider, rebootCmd); err != nil {
|
|
|
|
Logf("Error while issuing ssh command: %v", err)
|
2015-06-02 06:05:20 +00:00
|
|
|
result <- false
|
|
|
|
return
|
2015-04-29 22:28:48 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Wait for some kind of "not ready" status.
|
2015-05-21 21:43:42 +00:00
|
|
|
if !waitForNodeToBeNotReady(c, name, rebootNodeNotReadyTimeout) {
|
2015-04-29 22:28:48 +00:00
|
|
|
result <- false
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
// Wait for some kind of "ready" status.
|
2015-05-21 21:43:42 +00:00
|
|
|
if !waitForNodeToBeReady(c, name, rebootNodeReadyAgainTimeout) {
|
2015-04-29 22:28:48 +00:00
|
|
|
result <- false
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
// Ensure all of the pods that we found on this node before the reboot are
|
|
|
|
// running / healthy.
|
2015-06-17 07:13:26 +00:00
|
|
|
if !checkPodsRunningReady(c, ns, podNames, rebootPodReadyAgainTimeout) {
|
2015-04-29 22:28:48 +00:00
|
|
|
result <- false
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
Logf("Reboot successful on node %s", name)
|
|
|
|
result <- true
|
|
|
|
}
|