mirror of https://github.com/k3s-io/k3s
Merge pull request #71320 from mborsz/nodekiller
Introduce NodeKiller -- a utility to simulate node failures in e2e tests
commit ba5f1cbfba

@@ -54,6 +54,7 @@ import (
var (
    cloudConfig = &framework.TestContext.CloudConfig
    nodeKillerStopCh = make(chan struct{})
)

// There are certain operations we only want to run once per overall test invocation

@@ -136,6 +137,11 @@ var _ = ginkgo.SynchronizedBeforeSuite(func() []byte {
    // Reference common test to make the import valid.
    commontest.CurrentSuite = commontest.E2E

    if framework.TestContext.NodeKiller.Enabled {
        nodeKiller := framework.NewNodeKiller(framework.TestContext.NodeKiller, c, framework.TestContext.Provider)
        nodeKillerStopCh = make(chan struct{})
        go nodeKiller.Run(nodeKillerStopCh)
    }
    return nil
}, func(data []byte) {

@@ -160,6 +166,9 @@ var _ = ginkgo.SynchronizedAfterSuite(func() {
            framework.Logf("Error gathering metrics: %v", err)
        }
    }
    if framework.TestContext.NodeKiller.Enabled {
        close(nodeKillerStopCh)
    }
})

func gatherTestSuiteMetrics() error {
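
Note (not part of the diff): the suite creates nodeKillerStopCh and starts NodeKiller in a goroutine during SynchronizedBeforeSuite, then stops it in SynchronizedAfterSuite with a single close(). Closing a channel acts as a broadcast, so every receiver blocked on it is released. A minimal, self-contained sketch of the same start/stop wiring (the run function here is hypothetical; only the pattern matches):

package main

import (
    "fmt"
    "time"
)

// run mimics a long-running background task that exits once stopCh is closed.
func run(stopCh <-chan struct{}) {
    for {
        select {
        case <-stopCh:
            fmt.Println("stopped")
            return
        case <-time.After(100 * time.Millisecond):
            fmt.Println("working...")
        }
    }
}

func main() {
    stopCh := make(chan struct{}) // BeforeSuite: create the stop channel
    go run(stopCh)                // BeforeSuite: start the background task
    time.Sleep(350 * time.Millisecond)
    close(stopCh) // AfterSuite: signal shutdown to all receivers
    time.Sleep(50 * time.Millisecond)
}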

@@ -22,9 +22,12 @@ import (
    "path"
    "path/filepath"
    "strings"
    "sync"
    "time"

    "k8s.io/api/core/v1"
    "k8s.io/apimachinery/pkg/util/wait"
    clientset "k8s.io/client-go/kubernetes"
)

func EtcdUpgrade(target_storage, target_version string) error {

@@ -331,3 +334,61 @@ func waitForSSHTunnels() {
        return err == nil, nil
    })
}

// NodeKiller is a utility to simulate node failures.
type NodeKiller struct {
    config   NodeKillerConfig
    client   clientset.Interface
    provider string
}

// NewNodeKiller creates a new NodeKiller.
func NewNodeKiller(config NodeKillerConfig, client clientset.Interface, provider string) *NodeKiller {
    return &NodeKiller{config, client, provider}
}

// Run starts NodeKiller until stopCh is closed.
func (k *NodeKiller) Run(stopCh <-chan struct{}) {
    wait.JitterUntil(func() {
        nodes := k.pickNodes()
        k.kill(nodes)
    }, k.config.Interval, k.config.JitterFactor, true, stopCh)
}
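
Note (not part of the diff): Run delegates its loop to wait.JitterUntil from k8s.io/apimachinery/pkg/util/wait. With sliding set to true the next delay is measured from when the callback returns, and each delay is drawn from [period, period * (1 + jitterFactor)]. A standalone illustration of those semantics (the 200ms period and 0.5 factor are arbitrary example values):

package main

import (
    "fmt"
    "time"

    "k8s.io/apimachinery/pkg/util/wait"
)

func main() {
    stopCh := make(chan struct{})
    go func() {
        // Stop the loop after roughly one second.
        time.Sleep(1 * time.Second)
        close(stopCh)
    }()
    // Runs the callback repeatedly; each gap between runs lasts between 200ms and 300ms.
    wait.JitterUntil(func() {
        fmt.Println("simulated failure round at", time.Now().Format(time.StampMilli))
    }, 200*time.Millisecond, 0.5, true, stopCh)
}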

func (k *NodeKiller) pickNodes() []v1.Node {
    nodes := GetReadySchedulableNodesOrDie(k.client)
    numNodes := int(k.config.FailureRatio * float64(len(nodes.Items)))
    shuffledNodes := shuffleNodes(nodes.Items)
    if len(shuffledNodes) > numNodes {
        return shuffledNodes[:numNodes]
    }
    return shuffledNodes
}
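
Note (not part of the diff): pickNodes calls a shuffleNodes helper that these hunks do not show. A plausible implementation, offered only as a sketch and assuming the file's existing "k8s.io/api/core/v1" import plus "math/rand", shuffles a copy of the slice so the caller's order is untouched:

func shuffleNodes(nodes []v1.Node) []v1.Node {
    shuffled := make([]v1.Node, len(nodes))
    copy(shuffled, nodes)
    rand.Shuffle(len(shuffled), func(i, j int) {
        shuffled[i], shuffled[j] = shuffled[j], shuffled[i]
    })
    return shuffled
}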

func (k *NodeKiller) kill(nodes []v1.Node) {
    wg := sync.WaitGroup{}
    wg.Add(len(nodes))
    for _, node := range nodes {
        node := node
        go func() {
            defer wg.Done()

            Logf("Stopping docker and kubelet on %q to simulate failure", node.Name)
            err := IssueSSHCommand("sudo systemctl stop docker kubelet", k.provider, &node)
            if err != nil {
                Logf("ERROR while stopping node %q: %v", node.Name, err)
                return
            }

            time.Sleep(k.config.SimulatedDowntime)

            Logf("Rebooting %q to repair the node", node.Name)
            err = IssueSSHCommand("sudo reboot", k.provider, &node)
            if err != nil {
                Logf("ERROR while rebooting node %q: %v", node.Name, err)
                return
            }
        }()
    }
    wg.Wait()
}
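
Note (not part of the diff): kill stops docker and kubelet on every picked node in parallel, waits out SimulatedDowntime, then reboots the node, and the WaitGroup keeps the round open until every goroutine finishes. The "node := node" line gives each goroutine its own copy of the loop variable, which mattered before Go 1.22 changed loop-variable scoping. A self-contained illustration of that fan-out pattern:

package main

import (
    "fmt"
    "sync"
)

func main() {
    items := []string{"a", "b", "c"}
    var wg sync.WaitGroup
    wg.Add(len(items))
    for _, item := range items {
        item := item // per-iteration copy, as in NodeKiller.kill
        go func() {
            defer wg.Done()
            fmt.Println("processing", item)
        }()
    }
    wg.Wait() // block until every goroutine is done, like the end of kill
}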

@@ -148,6 +148,26 @@ type TestContextType struct {
    // The DNS Domain of the cluster.
    ClusterDNSDomain string

    // The configuration of NodeKiller.
    NodeKiller NodeKillerConfig
}

// NodeKillerConfig describes configuration of NodeKiller -- a utility to
// simulate node failures.
type NodeKillerConfig struct {
    // Enabled determines whether NodeKiller should do anything at all.
    // All other options below are ignored if Enabled = false.
    Enabled bool
    // FailureRatio is the ratio of all nodes that could fail simultaneously.
    FailureRatio float64
    // Interval is the time between node failures.
    Interval time.Duration
    // JitterFactor is the factor used to jitter node failures.
    // A node will be killed between [Interval, Interval * (1.0 + JitterFactor)].
    JitterFactor float64
    // SimulatedDowntime is the duration between a node being killed and being recreated.
    SimulatedDowntime time.Duration
}

// NodeTestContextType is part of TestContextType, it is shared by all node e2e tests.
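
Note (not part of the diff): plugging in the defaults registered below (Interval of 1 minute, JitterFactor of 60, FailureRatio of 0.01) means each failure round fires somewhere in [1m, 1m * (1 + 60)] = [1m, 61m] and targets int(0.01 * N) nodes. A quick standalone check of that arithmetic, using an example cluster size of 500:

package main

import "fmt"

func main() {
    intervalMinutes := 1.0 // --node-killer-interval default
    jitterFactor := 60.0   // --node-killer-jitter-factor default
    failureRatio := 0.01   // --node-killer-failure-ratio default
    nodes := 500           // example cluster size

    fmt.Printf("delay range: [%.0fm, %.0fm]\n", intervalMinutes, intervalMinutes*(1+jitterFactor))
    fmt.Printf("nodes killed per round: %d\n", int(failureRatio*float64(nodes)))
}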

@@ -281,6 +301,13 @@ func RegisterClusterFlags() {
    flag.StringVar(&TestContext.IngressUpgradeImage, "ingress-upgrade-image", "", "Image to upgrade to if doing an upgrade test for ingress.")
    flag.StringVar(&TestContext.GCEUpgradeScript, "gce-upgrade-script", "", "Script to use to upgrade a GCE cluster.")
    flag.BoolVar(&TestContext.CleanStart, "clean-start", false, "If true, purge all namespaces except default and system before running tests. This serves to clean up test namespaces from failed/interrupted e2e runs in a long-lived cluster.")

    nodeKiller := &TestContext.NodeKiller
    flag.BoolVar(&nodeKiller.Enabled, "node-killer", false, "Whether NodeKiller should kill any nodes.")
    flag.Float64Var(&nodeKiller.FailureRatio, "node-killer-failure-ratio", 0.01, "Ratio of nodes to be killed.")
    flag.DurationVar(&nodeKiller.Interval, "node-killer-interval", 1*time.Minute, "Time between node failures.")
    flag.Float64Var(&nodeKiller.JitterFactor, "node-killer-jitter-factor", 60, "Factor used to jitter node failures.")
    flag.DurationVar(&nodeKiller.SimulatedDowntime, "node-killer-simulated-downtime", 10*time.Minute, "A delay between node death and recreation.")
}

// Register flags specific to the node e2e test suite.
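
Note (not part of the diff): these flags are wired straight into TestContext and picked up by the e2e binary's normal flag parsing. A stripped-down, standalone illustration of the same binding pattern, reusing the flag names registered above (run it with something like -node-killer -node-killer-failure-ratio=0.05 -node-killer-interval=2m):

package main

import (
    "flag"
    "fmt"
    "time"
)

// nodeKillerConfig mirrors the fields of NodeKillerConfig for this sketch.
type nodeKillerConfig struct {
    Enabled           bool
    FailureRatio      float64
    Interval          time.Duration
    JitterFactor      float64
    SimulatedDowntime time.Duration
}

func main() {
    var cfg nodeKillerConfig
    flag.BoolVar(&cfg.Enabled, "node-killer", false, "Whether NodeKiller should kill any nodes.")
    flag.Float64Var(&cfg.FailureRatio, "node-killer-failure-ratio", 0.01, "Ratio of nodes to be killed.")
    flag.DurationVar(&cfg.Interval, "node-killer-interval", 1*time.Minute, "Time between node failures.")
    flag.Float64Var(&cfg.JitterFactor, "node-killer-jitter-factor", 60, "Factor used to jitter node failures.")
    flag.DurationVar(&cfg.SimulatedDowntime, "node-killer-simulated-downtime", 10*time.Minute, "A delay between node death and recreation.")
    flag.Parse()
    fmt.Printf("%+v\n", cfg)
}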