mirror of https://github.com/k3s-io/k3s
Merge pull request #71320 from mborsz/nodekiller
Introduce NodeKiller -- a utility to simulate node failures in e2e tests
commit ba5f1cbfba
@@ -53,7 +53,8 @@ import (
 )
 
 var (
 	cloudConfig = &framework.TestContext.CloudConfig
+	nodeKillerStopCh = make(chan struct{})
 )
 
 // There are certain operations we only want to run once per overall test invocation
@@ -136,6 +137,11 @@ var _ = ginkgo.SynchronizedBeforeSuite(func() []byte {
 	// Reference common test to make the import valid.
 	commontest.CurrentSuite = commontest.E2E
 
+	if framework.TestContext.NodeKiller.Enabled {
+		nodeKiller := framework.NewNodeKiller(framework.TestContext.NodeKiller, c, framework.TestContext.Provider)
+		nodeKillerStopCh = make(chan struct{})
+		go nodeKiller.Run(nodeKillerStopCh)
+	}
 	return nil
 
 }, func(data []byte) {
@@ -160,6 +166,9 @@ var _ = ginkgo.SynchronizedAfterSuite(func() {
 			framework.Logf("Error gathering metrics: %v", err)
 		}
 	}
+	if framework.TestContext.NodeKiller.Enabled {
+		close(nodeKillerStopCh)
+	}
 })
 
 func gatherTestSuiteMetrics() error {
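The two hunks above wire NodeKiller into the suite lifecycle: SynchronizedBeforeSuite starts the failure loop in a goroutine when the feature is enabled, and SynchronizedAfterSuite closes nodeKillerStopCh to end it. A minimal, self-contained sketch of that start/stop pattern (the doWork name is hypothetical, standing in for NodeKiller.Run; nothing here is from the commit itself):

package main

import (
	"fmt"
	"time"
)

// doWork loops until stopCh is closed, mirroring how NodeKiller.Run
// consumes the stop channel.
func doWork(stopCh <-chan struct{}) {
	for {
		select {
		case <-stopCh:
			fmt.Println("worker stopped")
			return
		case <-time.After(100 * time.Millisecond):
			fmt.Println("worker tick")
		}
	}
}

func main() {
	stopCh := make(chan struct{})
	go doWork(stopCh)                  // as SynchronizedBeforeSuite does
	time.Sleep(350 * time.Millisecond) // tests run here
	close(stopCh)                      // as SynchronizedAfterSuite does
}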
@@ -22,9 +22,12 @@ import (
 	"path"
 	"path/filepath"
 	"strings"
+	"sync"
 	"time"
 
+	"k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/util/wait"
+	clientset "k8s.io/client-go/kubernetes"
 )
 
 func EtcdUpgrade(target_storage, target_version string) error {
@@ -331,3 +334,61 @@ func waitForSSHTunnels() {
 		return err == nil, nil
 	})
 }
+
+// NodeKiller is a utility to simulate node failures.
+type NodeKiller struct {
+	config   NodeKillerConfig
+	client   clientset.Interface
+	provider string
+}
+
+// NewNodeKiller creates a new NodeKiller.
+func NewNodeKiller(config NodeKillerConfig, client clientset.Interface, provider string) *NodeKiller {
+	return &NodeKiller{config, client, provider}
+}
+
+// Run starts NodeKiller until stopCh is closed.
+func (k *NodeKiller) Run(stopCh <-chan struct{}) {
+	wait.JitterUntil(func() {
+		nodes := k.pickNodes()
+		k.kill(nodes)
+	}, k.config.Interval, k.config.JitterFactor, true, stopCh)
+}
+
+func (k *NodeKiller) pickNodes() []v1.Node {
+	nodes := GetReadySchedulableNodesOrDie(k.client)
+	numNodes := int(k.config.FailureRatio * float64(len(nodes.Items)))
+	shuffledNodes := shuffleNodes(nodes.Items)
+	if len(shuffledNodes) > numNodes {
+		return shuffledNodes[:numNodes]
+	}
+	return shuffledNodes
+}
+
+func (k *NodeKiller) kill(nodes []v1.Node) {
+	wg := sync.WaitGroup{}
+	wg.Add(len(nodes))
+	for _, node := range nodes {
+		node := node
+		go func() {
+			defer wg.Done()
+
+			Logf("Stopping docker and kubelet on %q to simulate failure", node.Name)
+			err := IssueSSHCommand("sudo systemctl stop docker kubelet", k.provider, &node)
+			if err != nil {
+				Logf("ERROR while stopping node %q: %v", node.Name, err)
+				return
+			}
+
+			time.Sleep(k.config.SimulatedDowntime)
+
+			Logf("Rebooting %q to repair the node", node.Name)
+			err = IssueSSHCommand("sudo reboot", k.provider, &node)
+			if err != nil {
+				Logf("ERROR while rebooting node %q: %v", node.Name, err)
+				return
+			}
+		}()
+	}
+	wg.Wait()
+}
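Run drives the failure loop with wait.JitterUntil from k8s.io/apimachinery (imported in the hunk above): each round waits a random duration in [Interval, Interval * (1.0 + JitterFactor)], and the true argument (sliding) measures that period from the end of the previous round. Note also the node := node re-binding in kill, which gives each goroutine its own copy of the loop variable (required in Go versions before 1.22). A small runnable sketch of JitterUntil itself, with illustrative values rather than the e2e defaults:

package main

import (
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

func main() {
	stopCh := make(chan struct{})
	done := make(chan struct{})
	go func() {
		defer close(done)
		// Runs f, then sleeps a random duration in [1s, 1.5s)
		// before the next round (sliding=true).
		wait.JitterUntil(func() {
			fmt.Println("failure round at", time.Now().Format(time.StampMilli))
		}, 1*time.Second, 0.5, true, stopCh)
	}()
	time.Sleep(4 * time.Second)
	close(stopCh) // JitterUntil returns once stopCh is closed
	<-done
}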
@@ -148,6 +148,26 @@ type TestContextType struct {
 
 	// The DNS Domain of the cluster.
 	ClusterDNSDomain string
+
+	// The configuration of NodeKiller.
+	NodeKiller NodeKillerConfig
+}
+
+// NodeKillerConfig describes configuration of NodeKiller -- a utility to
+// simulate node failures.
+type NodeKillerConfig struct {
+	// Enabled determines whether NodeKiller should do anything at all.
+	// All other options below are ignored if Enabled = false.
+	Enabled bool
+	// FailureRatio is the fraction of all nodes that may fail simultaneously.
+	FailureRatio float64
+	// Interval is the time between node failures.
+	Interval time.Duration
+	// JitterFactor is the factor used to jitter node failures.
+	// The interval between failures is jittered to [Interval, Interval * (1.0 + JitterFactor)].
+	JitterFactor float64
+	// SimulatedDowntime is the duration between a node being killed and recreated.
+	SimulatedDowntime time.Duration
 }
 
 // NodeTestContextType is part of TestContextType, it is shared by all node e2e test.
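pickNodes (in the framework hunk above) sizes each failure batch as int(FailureRatio * float64(totalNodes)), truncating toward zero, so the default ratio of 0.01 kills nothing on clusters smaller than 100 nodes. A tiny illustration of that arithmetic (the helper name is mine, not from the commit):

package main

import "fmt"

// numNodesToKill mirrors the batch sizing in pickNodes;
// int() truncates toward zero.
func numNodesToKill(failureRatio float64, totalNodes int) int {
	return int(failureRatio * float64(totalNodes))
}

func main() {
	fmt.Println(numNodesToKill(0.01, 500)) // 5
	fmt.Println(numNodesToKill(0.01, 99))  // 0: below 100 nodes, nothing is killed
}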
@@ -281,6 +301,13 @@ func RegisterClusterFlags() {
 	flag.StringVar(&TestContext.IngressUpgradeImage, "ingress-upgrade-image", "", "Image to upgrade to if doing an upgrade test for ingress.")
 	flag.StringVar(&TestContext.GCEUpgradeScript, "gce-upgrade-script", "", "Script to use to upgrade a GCE cluster.")
 	flag.BoolVar(&TestContext.CleanStart, "clean-start", false, "If true, purge all namespaces except default and system before running tests. This serves to Cleanup test namespaces from failed/interrupted e2e runs in a long-lived cluster.")
+
+	nodeKiller := &TestContext.NodeKiller
+	flag.BoolVar(&nodeKiller.Enabled, "node-killer", false, "Whether NodeKiller should kill any nodes.")
+	flag.Float64Var(&nodeKiller.FailureRatio, "node-killer-failure-ratio", 0.01, "Fraction of nodes to be killed.")
+	flag.DurationVar(&nodeKiller.Interval, "node-killer-interval", 1*time.Minute, "Time between node failures.")
+	flag.Float64Var(&nodeKiller.JitterFactor, "node-killer-jitter-factor", 60, "Factor used to jitter node failures.")
+	flag.DurationVar(&nodeKiller.SimulatedDowntime, "node-killer-simulated-downtime", 10*time.Minute, "A delay between node death and recreation.")
 }
 
 // Register flags specific to the node e2e test suite.
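With these flags registered, NodeKiller is opt-in per run. For example (illustrative values, not the defaults above), passing --node-killer=true --node-killer-failure-ratio=0.02 --node-killer-interval=5m --node-killer-simulated-downtime=3m to the e2e test binary would kill roughly 2% of the ready schedulable nodes each round, with rounds spaced at least five minutes apart, keeping each victim down for three minutes before rebooting it.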