Merge pull request #71320 from mborsz/nodekiller

Introduce NodeKiller -- a utility to simulate node failures in e2e tests
pull/564/head
k8s-ci-robot 2018-11-29 05:18:52 -08:00 committed by GitHub
commit ba5f1cbfba
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 98 additions and 1 deletions

View File

@ -53,7 +53,8 @@ import (
) )
var ( var (
cloudConfig = &framework.TestContext.CloudConfig cloudConfig = &framework.TestContext.CloudConfig
nodeKillerStopCh = make(chan struct{})
) )
// There are certain operations we only want to run once per overall test invocation // There are certain operations we only want to run once per overall test invocation
@ -136,6 +137,11 @@ var _ = ginkgo.SynchronizedBeforeSuite(func() []byte {
// Reference common test to make the import valid. // Reference common test to make the import valid.
commontest.CurrentSuite = commontest.E2E commontest.CurrentSuite = commontest.E2E
if framework.TestContext.NodeKiller.Enabled {
nodeKiller := framework.NewNodeKiller(framework.TestContext.NodeKiller, c, framework.TestContext.Provider)
nodeKillerStopCh = make(chan struct{})
go nodeKiller.Run(nodeKillerStopCh)
}
return nil return nil
}, func(data []byte) { }, func(data []byte) {
@ -160,6 +166,9 @@ var _ = ginkgo.SynchronizedAfterSuite(func() {
framework.Logf("Error gathering metrics: %v", err) framework.Logf("Error gathering metrics: %v", err)
} }
} }
if framework.TestContext.NodeKiller.Enabled {
close(nodeKillerStopCh)
}
}) })
func gatherTestSuiteMetrics() error { func gatherTestSuiteMetrics() error {

View File

@ -22,9 +22,12 @@ import (
"path" "path"
"path/filepath" "path/filepath"
"strings" "strings"
"sync"
"time" "time"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/wait" "k8s.io/apimachinery/pkg/util/wait"
clientset "k8s.io/client-go/kubernetes"
) )
func EtcdUpgrade(target_storage, target_version string) error { func EtcdUpgrade(target_storage, target_version string) error {
@ -331,3 +334,61 @@ func waitForSSHTunnels() {
return err == nil, nil return err == nil, nil
}) })
} }
// NodeKiller is a utility to simulate node failures.
type NodeKiller struct {
	// config holds the NodeKiller settings (failure ratio, interval, jitter
	// factor and simulated downtime).
	config NodeKillerConfig
	// client is the clientset used to look up the candidate nodes.
	client clientset.Interface
	// provider is the provider identifier handed to IssueSSHCommand when
	// running commands on a node.
	provider string
}
// NewNodeKiller builds a NodeKiller from the given configuration, client and
// provider identifier. It only stores its arguments; nothing is started until
// Run is called.
func NewNodeKiller(config NodeKillerConfig, client clientset.Interface, provider string) *NodeKiller {
	killer := &NodeKiller{
		config:   config,
		client:   client,
		provider: provider,
	}
	return killer
}
// Run keeps executing kill rounds until stopCh is closed. Each round picks a
// set of nodes and kills them; pacing between rounds is delegated to
// wait.JitterUntil using the configured Interval and JitterFactor.
func (k *NodeKiller) Run(stopCh <-chan struct{}) {
	// One round: choose the victims, then take them down.
	killRound := func() {
		k.kill(k.pickNodes())
	}
	// The boolean argument is JitterUntil's "sliding" flag.
	wait.JitterUntil(killRound, k.config.Interval, k.config.JitterFactor, true, stopCh)
}
// pickNodes chooses the nodes to kill in a single round.
//
// It takes the node list returned by GetReadySchedulableNodesOrDie, shuffles
// it with shuffleNodes and returns the first FailureRatio * len(nodes) entries.
// The count is truncated toward zero by the float-to-int conversion, so a
// small ratio on a small cluster can select no nodes at all.
//
// The count is clamped to [0, len(nodes)]: a ratio above 1 returns every node,
// and a negative ratio returns an empty slice instead of panicking on a
// negative slice bound.
func (k *NodeKiller) pickNodes() []v1.Node {
	nodes := GetReadySchedulableNodesOrDie(k.client)
	numNodes := int(k.config.FailureRatio * float64(len(nodes.Items)))
	shuffledNodes := shuffleNodes(nodes.Items)

	// Guard against a negative FailureRatio, which would otherwise produce an
	// out-of-range slice expression below.
	if numNodes < 0 {
		numNodes = 0
	}
	if numNodes >= len(shuffledNodes) {
		return shuffledNodes
	}
	return shuffledNodes[:numNodes]
}
// kill simulates a failure on each of the given nodes and blocks until every
// node has been handled. Nodes are processed concurrently, one goroutine per
// node. For each node it stops docker and kubelet over SSH, waits for the
// configured SimulatedDowntime, and then reboots the node. Errors are only
// logged; a failure to stop a node skips the downtime and the reboot for it.
func (k *NodeKiller) kill(nodes []v1.Node) {
	var pending sync.WaitGroup
	for i := range nodes {
		pending.Add(1)
		// The node is passed by value so each goroutine owns its own copy.
		go func(target v1.Node) {
			defer pending.Done()

			Logf("Stopping docker and kubelet on %q to simulate failure", target.Name)
			if stopErr := IssueSSHCommand("sudo systemctl stop docker kubelet", k.provider, &target); stopErr != nil {
				Logf("ERROR while stopping node %q: %v", target.Name, stopErr)
				return
			}

			// Keep the node "down" for the configured period before repairing it.
			time.Sleep(k.config.SimulatedDowntime)

			Logf("Rebooting %q to repair the node", target.Name)
			if rebootErr := IssueSSHCommand("sudo reboot", k.provider, &target); rebootErr != nil {
				Logf("ERROR while rebooting node %q: %v", target.Name, rebootErr)
			}
		}(nodes[i])
	}
	pending.Wait()
}

View File

@ -148,6 +148,26 @@ type TestContextType struct {
// The DNS Domain of the cluster. // The DNS Domain of the cluster.
ClusterDNSDomain string ClusterDNSDomain string
// The configuration of NodeKiller.
NodeKiller NodeKillerConfig
}
// NodeKillerConfig describes configuration of NodeKiller -- a utility to
// simulate node failures.
type NodeKillerConfig struct {
	// Enabled determines whether NodeKiller should do anything at all.
	// All other options below are ignored if Enabled = false.
	Enabled bool
	// FailureRatio is the fraction of all nodes (e.g. 0.01 for 1%) that could
	// fail simultaneously. It is multiplied by the number of nodes to get the
	// number of nodes killed in one round.
	FailureRatio float64
	// Interval is time between node failures.
	Interval time.Duration
	// JitterFactor is factor used to jitter node failures. It is passed to
	// wait.JitterUntil together with Interval, so the wait between rounds is
	// expected to fall in [Interval, Interval * (1.0 + JitterFactor)].
	JitterFactor float64
	// SimulatedDowntime is how long a node is left with docker and kubelet
	// stopped before it is rebooted.
	SimulatedDowntime time.Duration
} }
// NodeTestContextType is part of TestContextType, it is shared by all node e2e test. // NodeTestContextType is part of TestContextType, it is shared by all node e2e test.
@ -281,6 +301,13 @@ func RegisterClusterFlags() {
flag.StringVar(&TestContext.IngressUpgradeImage, "ingress-upgrade-image", "", "Image to upgrade to if doing an upgrade test for ingress.") flag.StringVar(&TestContext.IngressUpgradeImage, "ingress-upgrade-image", "", "Image to upgrade to if doing an upgrade test for ingress.")
flag.StringVar(&TestContext.GCEUpgradeScript, "gce-upgrade-script", "", "Script to use to upgrade a GCE cluster.") flag.StringVar(&TestContext.GCEUpgradeScript, "gce-upgrade-script", "", "Script to use to upgrade a GCE cluster.")
flag.BoolVar(&TestContext.CleanStart, "clean-start", false, "If true, purge all namespaces except default and system before running tests. This serves to Cleanup test namespaces from failed/interrupted e2e runs in a long-lived cluster.") flag.BoolVar(&TestContext.CleanStart, "clean-start", false, "If true, purge all namespaces except default and system before running tests. This serves to Cleanup test namespaces from failed/interrupted e2e runs in a long-lived cluster.")
nodeKiller := &TestContext.NodeKiller
flag.BoolVar(&nodeKiller.Enabled, "node-killer", false, "Whether NodeKiller should kill any nodes.")
flag.Float64Var(&nodeKiller.FailureRatio, "node-killer-failure-ratio", 0.01, "Percentage of nodes to be killed")
flag.DurationVar(&nodeKiller.Interval, "node-killer-interval", 1*time.Minute, "Time between node failures.")
flag.Float64Var(&nodeKiller.JitterFactor, "node-killer-jitter-factor", 60, "Factor used to jitter node failures.")
flag.DurationVar(&nodeKiller.SimulatedDowntime, "node-killer-simulated-downtime", 10*time.Minute, "A delay between node death and recreation")
} }
// Register flags specific to the node e2e test suite. // Register flags specific to the node e2e test suite.