mirror of https://github.com/k3s-io/k3s
Merge pull request #71320 from mborsz/nodekiller
Introduce NodeKiller -- a utility to simulate node failures in e2e tests
commit ba5f1cbfba

@@ -54,6 +54,7 @@ import (
var (
    cloudConfig = &framework.TestContext.CloudConfig
    nodeKillerStopCh = make(chan struct{})
)

// There are certain operations we only want to run once per overall test invocation

@@ -136,6 +137,11 @@ var _ = ginkgo.SynchronizedBeforeSuite(func() []byte {
    // Reference common test to make the import valid.
    commontest.CurrentSuite = commontest.E2E

    if framework.TestContext.NodeKiller.Enabled {
        nodeKiller := framework.NewNodeKiller(framework.TestContext.NodeKiller, c, framework.TestContext.Provider)
        nodeKillerStopCh = make(chan struct{})
        go nodeKiller.Run(nodeKillerStopCh)
    }
    return nil
}, func(data []byte) {

@@ -160,6 +166,9 @@ var _ = ginkgo.SynchronizedAfterSuite(func() {
            framework.Logf("Error gathering metrics: %v", err)
        }
    }
    if framework.TestContext.NodeKiller.Enabled {
        close(nodeKillerStopCh)
    }
})

func gatherTestSuiteMetrics() error {
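
Note (not part of the diff): the suite creates nodeKillerStopCh and starts NodeKiller in a goroutine during SynchronizedBeforeSuite, then stops it in SynchronizedAfterSuite with a single close(). Closing a channel acts as a broadcast, so every receiver blocked on it is released. A minimal, self-contained sketch of the same start/stop wiring (the run function here is hypothetical; only the pattern matches):

package main

import (
    "fmt"
    "time"
)

// run mimics a long-running background task that exits once stopCh is closed.
func run(stopCh <-chan struct{}) {
    for {
        select {
        case <-stopCh:
            fmt.Println("stopped")
            return
        case <-time.After(100 * time.Millisecond):
            fmt.Println("working...")
        }
    }
}

func main() {
    stopCh := make(chan struct{}) // BeforeSuite: create the stop channel
    go run(stopCh)                // BeforeSuite: start the background task
    time.Sleep(350 * time.Millisecond)
    close(stopCh) // AfterSuite: signal shutdown to all receivers
    time.Sleep(50 * time.Millisecond)
}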

@@ -22,9 +22,12 @@ import (
    "path"
    "path/filepath"
    "strings"
    "sync"
    "time"

    "k8s.io/api/core/v1"
    "k8s.io/apimachinery/pkg/util/wait"
    clientset "k8s.io/client-go/kubernetes"
)

func EtcdUpgrade(target_storage, target_version string) error {

@@ -331,3 +334,61 @@ func waitForSSHTunnels() {
        return err == nil, nil
    })
}

// NodeKiller is a utility to simulate node failures.
type NodeKiller struct {
    config   NodeKillerConfig
    client   clientset.Interface
    provider string
}

// NewNodeKiller creates a new NodeKiller.
func NewNodeKiller(config NodeKillerConfig, client clientset.Interface, provider string) *NodeKiller {
    return &NodeKiller{config, client, provider}
}

// Run starts NodeKiller until stopCh is closed.
func (k *NodeKiller) Run(stopCh <-chan struct{}) {
    wait.JitterUntil(func() {
        nodes := k.pickNodes()
        k.kill(nodes)
    }, k.config.Interval, k.config.JitterFactor, true, stopCh)
}
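
Note (not part of the diff): Run delegates its loop to wait.JitterUntil from k8s.io/apimachinery/pkg/util/wait. With sliding set to true the next delay is measured from when the callback returns, and each delay is drawn from [period, period * (1 + jitterFactor)]. A standalone illustration of those semantics (the 200ms period and 0.5 factor are arbitrary example values):

package main

import (
    "fmt"
    "time"

    "k8s.io/apimachinery/pkg/util/wait"
)

func main() {
    stopCh := make(chan struct{})
    go func() {
        // Stop the loop after roughly one second.
        time.Sleep(1 * time.Second)
        close(stopCh)
    }()
    // Runs the callback repeatedly; each gap between runs lasts between 200ms and 300ms.
    wait.JitterUntil(func() {
        fmt.Println("simulated failure round at", time.Now().Format(time.StampMilli))
    }, 200*time.Millisecond, 0.5, true, stopCh)
}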

func (k *NodeKiller) pickNodes() []v1.Node {
    nodes := GetReadySchedulableNodesOrDie(k.client)
    numNodes := int(k.config.FailureRatio * float64(len(nodes.Items)))
    shuffledNodes := shuffleNodes(nodes.Items)
    if len(shuffledNodes) > numNodes {
        return shuffledNodes[:numNodes]
    }
    return shuffledNodes
}
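
Note (not part of the diff): pickNodes calls a shuffleNodes helper that these hunks do not show. A plausible implementation, offered only as a sketch and assuming the file's existing "k8s.io/api/core/v1" import plus "math/rand", shuffles a copy of the slice so the caller's order is untouched:

func shuffleNodes(nodes []v1.Node) []v1.Node {
    shuffled := make([]v1.Node, len(nodes))
    copy(shuffled, nodes)
    rand.Shuffle(len(shuffled), func(i, j int) {
        shuffled[i], shuffled[j] = shuffled[j], shuffled[i]
    })
    return shuffled
}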

func (k *NodeKiller) kill(nodes []v1.Node) {
    wg := sync.WaitGroup{}
    wg.Add(len(nodes))
    for _, node := range nodes {
        node := node
        go func() {
            defer wg.Done()

            Logf("Stopping docker and kubelet on %q to simulate failure", node.Name)
            err := IssueSSHCommand("sudo systemctl stop docker kubelet", k.provider, &node)
            if err != nil {
                Logf("ERROR while stopping node %q: %v", node.Name, err)
                return
            }

            time.Sleep(k.config.SimulatedDowntime)

            Logf("Rebooting %q to repair the node", node.Name)
            err = IssueSSHCommand("sudo reboot", k.provider, &node)
            if err != nil {
                Logf("ERROR while rebooting node %q: %v", node.Name, err)
                return
            }
        }()
    }
    wg.Wait()
}
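
Note (not part of the diff): kill stops docker and kubelet on every picked node in parallel, waits out SimulatedDowntime, then reboots the node, and the WaitGroup keeps the round open until every goroutine finishes. The "node := node" line gives each goroutine its own copy of the loop variable, which mattered before Go 1.22 changed loop-variable scoping. A self-contained illustration of that fan-out pattern:

package main

import (
    "fmt"
    "sync"
)

func main() {
    items := []string{"a", "b", "c"}
    var wg sync.WaitGroup
    wg.Add(len(items))
    for _, item := range items {
        item := item // per-iteration copy, as in NodeKiller.kill
        go func() {
            defer wg.Done()
            fmt.Println("processing", item)
        }()
    }
    wg.Wait() // block until every goroutine is done, like the end of kill
}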

@@ -148,6 +148,26 @@ type TestContextType struct {
    // The DNS Domain of the cluster.
    ClusterDNSDomain string

    // The configuration of NodeKiller.
    NodeKiller NodeKillerConfig
}

// NodeKillerConfig describes configuration of NodeKiller -- a utility to
// simulate node failures.
type NodeKillerConfig struct {
    // Enabled determines whether NodeKiller should do anything at all.
    // All other options below are ignored if Enabled = false.
    Enabled bool
    // FailureRatio is the ratio of all nodes that could fail simultaneously.
    FailureRatio float64
    // Interval is the time between node failures.
    Interval time.Duration
    // JitterFactor is the factor used to jitter node failures.
    // A node will be killed between [Interval, Interval * (1.0 + JitterFactor)].
    JitterFactor float64
    // SimulatedDowntime is the duration between a node being killed and being recreated.
    SimulatedDowntime time.Duration
}

// NodeTestContextType is part of TestContextType, it is shared by all node e2e tests.
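
Note (not part of the diff): plugging in the defaults registered below (Interval of 1 minute, JitterFactor of 60, FailureRatio of 0.01) means each failure round fires somewhere in [1m, 1m * (1 + 60)] = [1m, 61m] and targets int(0.01 * N) nodes. A quick standalone check of that arithmetic, using an example cluster size of 500:

package main

import "fmt"

func main() {
    intervalMinutes := 1.0 // --node-killer-interval default
    jitterFactor := 60.0   // --node-killer-jitter-factor default
    failureRatio := 0.01   // --node-killer-failure-ratio default
    nodes := 500           // example cluster size

    fmt.Printf("delay range: [%.0fm, %.0fm]\n", intervalMinutes, intervalMinutes*(1+jitterFactor))
    fmt.Printf("nodes killed per round: %d\n", int(failureRatio*float64(nodes)))
}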

@@ -281,6 +301,13 @@ func RegisterClusterFlags() {
    flag.StringVar(&TestContext.IngressUpgradeImage, "ingress-upgrade-image", "", "Image to upgrade to if doing an upgrade test for ingress.")
    flag.StringVar(&TestContext.GCEUpgradeScript, "gce-upgrade-script", "", "Script to use to upgrade a GCE cluster.")
    flag.BoolVar(&TestContext.CleanStart, "clean-start", false, "If true, purge all namespaces except default and system before running tests. This serves to clean up test namespaces from failed/interrupted e2e runs in a long-lived cluster.")

    nodeKiller := &TestContext.NodeKiller
    flag.BoolVar(&nodeKiller.Enabled, "node-killer", false, "Whether NodeKiller should kill any nodes.")
    flag.Float64Var(&nodeKiller.FailureRatio, "node-killer-failure-ratio", 0.01, "Ratio of nodes to be killed.")
    flag.DurationVar(&nodeKiller.Interval, "node-killer-interval", 1*time.Minute, "Time between node failures.")
    flag.Float64Var(&nodeKiller.JitterFactor, "node-killer-jitter-factor", 60, "Factor used to jitter node failures.")
    flag.DurationVar(&nodeKiller.SimulatedDowntime, "node-killer-simulated-downtime", 10*time.Minute, "A delay between node death and recreation.")
}

// Register flags specific to the node e2e test suite.
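
Note (not part of the diff): these flags are wired straight into TestContext and picked up by the e2e binary's normal flag parsing. A stripped-down, standalone illustration of the same binding pattern, reusing the flag names registered above (run it with something like -node-killer -node-killer-failure-ratio=0.05 -node-killer-interval=2m):

package main

import (
    "flag"
    "fmt"
    "time"
)

// nodeKillerConfig mirrors the fields of NodeKillerConfig for this sketch.
type nodeKillerConfig struct {
    Enabled           bool
    FailureRatio      float64
    Interval          time.Duration
    JitterFactor      float64
    SimulatedDowntime time.Duration
}

func main() {
    var cfg nodeKillerConfig
    flag.BoolVar(&cfg.Enabled, "node-killer", false, "Whether NodeKiller should kill any nodes.")
    flag.Float64Var(&cfg.FailureRatio, "node-killer-failure-ratio", 0.01, "Ratio of nodes to be killed.")
    flag.DurationVar(&cfg.Interval, "node-killer-interval", 1*time.Minute, "Time between node failures.")
    flag.Float64Var(&cfg.JitterFactor, "node-killer-jitter-factor", 60, "Factor used to jitter node failures.")
    flag.DurationVar(&cfg.SimulatedDowntime, "node-killer-simulated-downtime", 10*time.Minute, "A delay between node death and recreation.")
    flag.Parse()
    fmt.Printf("%+v\n", cfg)
}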