From 9e493e169d0c4bf1c0bdc15c695965bb7971a407 Mon Sep 17 00:00:00 2001 From: Maciej Borsz Date: Wed, 21 Nov 2018 15:56:07 +0100 Subject: [PATCH] Implement NodeKiller -- a util to simulate node failures. --- test/e2e/e2e.go | 11 +++++- test/e2e/framework/nodes_util.go | 61 ++++++++++++++++++++++++++++++ test/e2e/framework/test_context.go | 27 +++++++++++++ 3 files changed, 98 insertions(+), 1 deletion(-) diff --git a/test/e2e/e2e.go b/test/e2e/e2e.go index d05483d20c..9b960fe0c8 100644 --- a/test/e2e/e2e.go +++ b/test/e2e/e2e.go @@ -53,7 +53,8 @@ import ( ) var ( - cloudConfig = &framework.TestContext.CloudConfig + cloudConfig = &framework.TestContext.CloudConfig + nodeKillerStopCh = make(chan struct{}) ) // There are certain operations we only want to run once per overall test invocation @@ -136,6 +137,11 @@ var _ = ginkgo.SynchronizedBeforeSuite(func() []byte { // Reference common test to make the import valid. commontest.CurrentSuite = commontest.E2E + if framework.TestContext.NodeKiller.Enabled { + nodeKiller := framework.NewNodeKiller(framework.TestContext.NodeKiller, c, framework.TestContext.Provider) + nodeKillerStopCh = make(chan struct{}) + go nodeKiller.Run(nodeKillerStopCh) + } return nil }, func(data []byte) { @@ -160,6 +166,9 @@ var _ = ginkgo.SynchronizedAfterSuite(func() { framework.Logf("Error gathering metrics: %v", err) } } + if framework.TestContext.NodeKiller.Enabled { + close(nodeKillerStopCh) + } }) func gatherTestSuiteMetrics() error { diff --git a/test/e2e/framework/nodes_util.go b/test/e2e/framework/nodes_util.go index d0731cbff5..1151b02630 100644 --- a/test/e2e/framework/nodes_util.go +++ b/test/e2e/framework/nodes_util.go @@ -22,9 +22,12 @@ import ( "path" "path/filepath" "strings" + "sync" "time" + "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/util/wait" + clientset "k8s.io/client-go/kubernetes" ) func EtcdUpgrade(target_storage, target_version string) error { @@ -331,3 +334,61 @@ func waitForSSHTunnels() { return err == nil, nil }) } + 
+// NodeKiller is a utility to simulate node failures. +type NodeKiller struct { + config NodeKillerConfig + client clientset.Interface + provider string +} + +// NewNodeKiller creates new NodeKiller. +func NewNodeKiller(config NodeKillerConfig, client clientset.Interface, provider string) *NodeKiller { + return &NodeKiller{config, client, provider} +} + +// Run starts NodeKiller until stopCh is closed. +func (k *NodeKiller) Run(stopCh <-chan struct{}) { + wait.JitterUntil(func() { + nodes := k.pickNodes() + k.kill(nodes) + }, k.config.Interval, k.config.JitterFactor, true, stopCh) +} + +func (k *NodeKiller) pickNodes() []v1.Node { + nodes := GetReadySchedulableNodesOrDie(k.client) + numNodes := int(k.config.FailureRatio * float64(len(nodes.Items))) + shuffledNodes := shuffleNodes(nodes.Items) + if len(shuffledNodes) > numNodes { + return shuffledNodes[:numNodes] + } + return shuffledNodes +} + +func (k *NodeKiller) kill(nodes []v1.Node) { + wg := sync.WaitGroup{} + wg.Add(len(nodes)) + for _, node := range nodes { + node := node + go func() { + defer wg.Done() + + Logf("Stopping docker and kubelet on %q to simulate failure", node.Name) + err := IssueSSHCommand("sudo systemctl stop docker kubelet", k.provider, &node) + if err != nil { + Logf("ERROR while stopping node %q: %v", node.Name, err) + return + } + + time.Sleep(k.config.SimulatedDowntime) + + Logf("Rebooting %q to repair the node", node.Name) + err = IssueSSHCommand("sudo reboot", k.provider, &node) + if err != nil { + Logf("ERROR while rebooting node %q: %v", node.Name, err) + return + } + }() + } + wg.Wait() +} diff --git a/test/e2e/framework/test_context.go b/test/e2e/framework/test_context.go index 7ac659465a..4a2f13f0d9 100644 --- a/test/e2e/framework/test_context.go +++ b/test/e2e/framework/test_context.go @@ -148,6 +148,26 @@ type TestContextType struct { // The DNS Domain of the cluster. ClusterDNSDomain string + + // The configuration of NodeKiller. 
+ NodeKiller NodeKillerConfig +} + +// NodeKillerConfig describes configuration of NodeKiller -- a utility to +// simulate node failures. +type NodeKillerConfig struct { + // Enabled determines whether NodeKiller should do anything at all. + // All other options below are ignored if Enabled = false. + Enabled bool + // FailureRatio is a percentage of all nodes that could fail simultaneously. + FailureRatio float64 + // Interval is the time between node failures. + Interval time.Duration + // JitterFactor is a factor used to jitter node failures. + // Node will be killed between [Interval, Interval * (1.0 + JitterFactor)]. + JitterFactor float64 + // SimulatedDowntime is the duration between when a node is killed and when it is recreated. + SimulatedDowntime time.Duration } // NodeTestContextType is part of TestContextType, it is shared by all node e2e test. @@ -281,6 +301,13 @@ func RegisterClusterFlags() { flag.StringVar(&TestContext.IngressUpgradeImage, "ingress-upgrade-image", "", "Image to upgrade to if doing an upgrade test for ingress.") flag.StringVar(&TestContext.GCEUpgradeScript, "gce-upgrade-script", "", "Script to use to upgrade a GCE cluster.") flag.BoolVar(&TestContext.CleanStart, "clean-start", false, "If true, purge all namespaces except default and system before running tests. 
This serves to Cleanup test namespaces from failed/interrupted e2e runs in a long-lived cluster.") + + nodeKiller := &TestContext.NodeKiller + flag.BoolVar(&nodeKiller.Enabled, "node-killer", false, "Whether NodeKiller should kill any nodes.") + flag.Float64Var(&nodeKiller.FailureRatio, "node-killer-failure-ratio", 0.01, "Percentage of nodes to be killed") + flag.DurationVar(&nodeKiller.Interval, "node-killer-interval", 1*time.Minute, "Time between node failures.") + flag.Float64Var(&nodeKiller.JitterFactor, "node-killer-jitter-factor", 60, "Factor used to jitter node failures.") + flag.DurationVar(&nodeKiller.SimulatedDowntime, "node-killer-simulated-downtime", 10*time.Minute, "A delay between node death and recreation") } // Register flags specific to the node e2e test suite.