From 9e493e169d0c4bf1c0bdc15c695965bb7971a407 Mon Sep 17 00:00:00 2001 From: Maciej Borsz Date: Wed, 21 Nov 2018 15:56:07 +0100 Subject: [PATCH] Implement NodeKiller -- a util to simulate node failures. --- test/e2e/e2e.go | 11 +++++- test/e2e/framework/nodes_util.go | 61 ++++++++++++++++++++++++++++++ test/e2e/framework/test_context.go | 27 +++++++++++++ 3 files changed, 98 insertions(+), 1 deletion(-) diff --git a/test/e2e/e2e.go b/test/e2e/e2e.go index d05483d20c..9b960fe0c8 100644 --- a/test/e2e/e2e.go +++ b/test/e2e/e2e.go @@ -53,7 +53,8 @@ import ( ) var ( - cloudConfig = &framework.TestContext.CloudConfig + cloudConfig = &framework.TestContext.CloudConfig + nodeKillerStopCh = make(chan struct{}) ) // There are certain operations we only want to run once per overall test invocation @@ -136,6 +137,11 @@ var _ = ginkgo.SynchronizedBeforeSuite(func() []byte { // Reference common test to make the import valid. commontest.CurrentSuite = commontest.E2E + if framework.TestContext.NodeKiller.Enabled { + nodeKiller := framework.NewNodeKiller(framework.TestContext.NodeKiller, c, framework.TestContext.Provider) + nodeKillerStopCh = make(chan struct{}) + go nodeKiller.Run(nodeKillerStopCh) + } return nil }, func(data []byte) { @@ -160,6 +166,9 @@ var _ = ginkgo.SynchronizedAfterSuite(func() { framework.Logf("Error gathering metrics: %v", err) } } + if framework.TestContext.NodeKiller.Enabled { + close(nodeKillerStopCh) + } }) func gatherTestSuiteMetrics() error { diff --git a/test/e2e/framework/nodes_util.go b/test/e2e/framework/nodes_util.go index d0731cbff5..1151b02630 100644 --- a/test/e2e/framework/nodes_util.go +++ b/test/e2e/framework/nodes_util.go @@ -22,9 +22,12 @@ import ( "path" "path/filepath" "strings" + "sync" "time" + "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/util/wait" + clientset "k8s.io/client-go/kubernetes" ) func EtcdUpgrade(target_storage, target_version string) error { @@ -331,3 +334,61 @@ func waitForSSHTunnels() { return err == nil, nil }) } + 
+// NodeKiller is a utility to simulate node failures. +type NodeKiller struct { + config NodeKillerConfig + client clientset.Interface + provider string +} + +// NewNodeKiller creates new NodeKiller. +func NewNodeKiller(config NodeKillerConfig, client clientset.Interface, provider string) *NodeKiller { + return &NodeKiller{config, client, provider} +} + +// Run starts NodeKiller until stopCh is closed. +func (k *NodeKiller) Run(stopCh <-chan struct{}) { + wait.JitterUntil(func() { + nodes := k.pickNodes() + k.kill(nodes) + }, k.config.Interval, k.config.JitterFactor, true, stopCh) +} + +func (k *NodeKiller) pickNodes() []v1.Node { + nodes := GetReadySchedulableNodesOrDie(k.client) + numNodes := int(k.config.FailureRatio * float64(len(nodes.Items))) + shuffledNodes := shuffleNodes(nodes.Items) + if len(shuffledNodes) > numNodes { + return shuffledNodes[:numNodes] + } + return shuffledNodes +} + +func (k *NodeKiller) kill(nodes []v1.Node) { + wg := sync.WaitGroup{} + wg.Add(len(nodes)) + for _, node := range nodes { + node := node + go func() { + defer wg.Done() + + Logf("Stopping docker and kubelet on %q to simulate failure", node.Name) + err := IssueSSHCommand("sudo systemctl stop docker kubelet", k.provider, &node) + if err != nil { + Logf("ERROR while stopping node %q: %v", node.Name, err) + return + } + + time.Sleep(k.config.SimulatedDowntime) + + Logf("Rebooting %q to repair the node", node.Name) + err = IssueSSHCommand("sudo reboot", k.provider, &node) + if err != nil { + Logf("ERROR while rebooting node %q: %v", node.Name, err) + return + } + }() + } + wg.Wait() +} diff --git a/test/e2e/framework/test_context.go b/test/e2e/framework/test_context.go index 7ac659465a..4a2f13f0d9 100644 --- a/test/e2e/framework/test_context.go +++ b/test/e2e/framework/test_context.go @@ -148,6 +148,26 @@ type TestContextType struct { // The DNS Domain of the cluster. ClusterDNSDomain string + + // The configuration of NodeKiller. 
+ NodeKiller NodeKillerConfig +} + +// NodeKillerConfig describes configuration of NodeKiller -- a utility to +// simulate node failures. +type NodeKillerConfig struct { + // Enabled determines whether NodeKiller should do anything at all. + // All other options below are ignored if Enabled = false. + Enabled bool + // FailureRatio is a percentage of all nodes that could fail simultaneously. + FailureRatio float64 + // Interval is the time between node failures. + Interval time.Duration + // JitterFactor is a factor used to jitter node failures. + // Node will be killed between [Interval, Interval * (1.0 + JitterFactor)]. + JitterFactor float64 + // SimulatedDowntime is the duration between when a node is killed and when it is recreated. + SimulatedDowntime time.Duration } // NodeTestContextType is part of TestContextType, it is shared by all node e2e test. @@ -281,6 +301,13 @@ func RegisterClusterFlags() { flag.StringVar(&TestContext.IngressUpgradeImage, "ingress-upgrade-image", "", "Image to upgrade to if doing an upgrade test for ingress.") flag.StringVar(&TestContext.GCEUpgradeScript, "gce-upgrade-script", "", "Script to use to upgrade a GCE cluster.") flag.BoolVar(&TestContext.CleanStart, "clean-start", false, "If true, purge all namespaces except default and system before running tests. 
This serves to Cleanup test namespaces from failed/interrupted e2e runs in a long-lived cluster.") + + nodeKiller := &TestContext.NodeKiller + flag.BoolVar(&nodeKiller.Enabled, "node-killer", false, "Whether NodeKiller should kill any nodes.") + flag.Float64Var(&nodeKiller.FailureRatio, "node-killer-failure-ratio", 0.01, "Percentage of nodes to be killed") + flag.DurationVar(&nodeKiller.Interval, "node-killer-interval", 1*time.Minute, "Time between node failures.") + flag.Float64Var(&nodeKiller.JitterFactor, "node-killer-jitter-factor", 60, "Factor used to jitter node failures.") + flag.DurationVar(&nodeKiller.SimulatedDowntime, "node-killer-simulated-downtime", 10*time.Minute, "A delay between node death and recreation") } // Register flags specific to the node e2e test suite.