Merge pull request #76156 from chardch/recreate-master-retry

Recreate nodes e2e test
k3s-v1.15.3
Kubernetes Prow Robot 2019-04-12 23:26:02 -07:00 committed by GitHub
commit df6ca6ab9f
9 changed files with 289 additions and 60 deletions

View File

@@ -640,6 +640,7 @@ test/e2e/autoscaling
test/e2e/chaosmonkey
test/e2e/common
test/e2e/framework
test/e2e/framework/providers/gce
test/e2e/lifecycle
test/e2e/lifecycle/bootstrap
test/e2e/network

View File

@@ -22,7 +22,7 @@ import (
"text/template"
"time"
"k8s.io/api/core/v1"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/apimachinery/pkg/util/sets"
@@ -172,7 +172,7 @@ func RestartNodes(c clientset.Interface, nodes []v1.Node) error {
// Wait for their boot IDs to change.
for i := range nodes {
node := &nodes[i]
if err := wait.Poll(30*time.Second, 5*time.Minute, func() (bool, error) {
if err := wait.Poll(30*time.Second, framework.RestartNodeReadyAgainTimeout, func() (bool, error) {
newNode, err := c.CoreV1().Nodes().Get(node.Name, metav1.GetOptions{})
if err != nil {
return false, fmt.Errorf("error getting node info after reboot: %s", err)

View File

@@ -54,6 +54,7 @@ go_library(
"//pkg/kubelet/dockershim/metrics:go_default_library",
"//pkg/kubelet/events:go_default_library",
"//pkg/kubelet/metrics:go_default_library",
"//pkg/kubelet/pod:go_default_library",
"//pkg/kubelet/sysctl:go_default_library",
"//pkg/kubelet/util/format:go_default_library",
"//pkg/master/ports:go_default_library",

View File

@@ -6,6 +6,8 @@ go_library(
"firewall.go",
"gce.go",
"ingress.go",
"recreate_node.go",
"util.go",
],
importpath = "k8s.io/kubernetes/test/e2e/framework/providers/gce",
visibility = ["//visibility:public"],
@@ -13,13 +15,17 @@ go_library(
"//pkg/cloudprovider/providers/gce:go_default_library",
"//staging/src/k8s.io/api/core/v1:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/fields:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/labels:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/util/sets:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/util/uuid:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/util/wait:go_default_library",
"//staging/src/k8s.io/client-go/kubernetes:go_default_library",
"//staging/src/k8s.io/cloud-provider:go_default_library",
"//test/e2e/framework:go_default_library",
"//test/utils:go_default_library",
"//vendor/github.com/onsi/ginkgo:go_default_library",
"//vendor/github.com/onsi/gomega:go_default_library",
"//vendor/google.golang.org/api/compute/v1:go_default_library",
"//vendor/google.golang.org/api/googleapi:go_default_library",
"//vendor/k8s.io/utils/exec:go_default_library",

View File

@@ -0,0 +1,131 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package gce
import (
"fmt"
"time"
. "github.com/onsi/ginkgo"
. "github.com/onsi/gomega"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/labels"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/kubernetes/test/e2e/framework"
testutils "k8s.io/kubernetes/test/utils"
)
func nodeNames(nodes []v1.Node) []string {
result := make([]string, 0, len(nodes))
for i := range nodes {
result = append(result, nodes[i].Name)
}
return result
}
func podNames(pods []v1.Pod) []string {
result := make([]string, 0, len(pods))
for i := range pods {
result = append(result, pods[i].Name)
}
return result
}
var _ = Describe("Recreate [Feature:Recreate]", func() {
f := framework.NewDefaultFramework("recreate")
var originalNodes []v1.Node
var originalPodNames []string
var ps *testutils.PodStore
systemNamespace := metav1.NamespaceSystem
BeforeEach(func() {
framework.SkipUnlessProviderIs("gce", "gke")
var err error
numNodes, err := framework.NumberOfRegisteredNodes(f.ClientSet)
Expect(err).NotTo(HaveOccurred())
originalNodes, err = framework.CheckNodesReady(f.ClientSet, numNodes, framework.NodeReadyInitialTimeout)
Expect(err).NotTo(HaveOccurred())
framework.Logf("Got the following nodes before recreate %v", nodeNames(originalNodes))
ps, err = testutils.NewPodStore(f.ClientSet, systemNamespace, labels.Everything(), fields.Everything())
Expect(err).NotTo(HaveOccurred())
allPods := ps.List()
originalPods := framework.FilterNonRestartablePods(allPods)
originalPodNames = make([]string, len(originalPods))
for i, p := range originalPods {
originalPodNames[i] = p.ObjectMeta.Name
}
if !framework.CheckPodsRunningReadyOrSucceeded(f.ClientSet, systemNamespace, originalPodNames, framework.PodReadyBeforeTimeout) {
framework.Failf("At least one pod wasn't running and ready or succeeded at test start.")
}
})
AfterEach(func() {
if CurrentGinkgoTestDescription().Failed {
// If the test failed, dump events from the kube-system namespace to help
// explain which addon/system pods were not running or ready.
By(fmt.Sprintf("Collecting events from namespace %q.", systemNamespace))
events, err := f.ClientSet.CoreV1().Events(systemNamespace).List(metav1.ListOptions{})
Expect(err).NotTo(HaveOccurred())
for _, e := range events.Items {
framework.Logf("event for %v: %v %v: %v", e.InvolvedObject.Name, e.Source, e.Reason, e.Message)
}
}
if ps != nil {
ps.Stop()
}
})
It("recreate nodes and ensure they function upon restart", func() {
testRecreate(f.ClientSet, ps, systemNamespace, originalNodes, originalPodNames)
})
})
// Recreate all the nodes in the test instance group
func testRecreate(c clientset.Interface, ps *testutils.PodStore, systemNamespace string, nodes []v1.Node, podNames []string) {
err := recreateNodes(c, nodes)
if err != nil {
framework.Failf("Test failed; failed to start the recreate instance group command.")
}
err = waitForNodeBootIdsToChange(c, nodes, framework.RecreateNodeReadyAgainTimeout)
if err != nil {
framework.Failf("Test failed; failed to recreate at least one node in %v.", framework.RecreateNodeReadyAgainTimeout)
}
nodesAfter, err := framework.CheckNodesReady(c, len(nodes), framework.RestartNodeReadyAgainTimeout)
Expect(err).NotTo(HaveOccurred())
framework.Logf("Got the following nodes after recreate: %v", nodeNames(nodesAfter))
if len(nodes) != len(nodesAfter) {
framework.Failf("Had %d nodes before nodes were recreated, but now only have %d",
len(nodes), len(nodesAfter))
}
// Make sure the pods from before node recreation are running/completed
podCheckStart := time.Now()
podNamesAfter, err := framework.WaitForNRestartablePods(ps, len(podNames), framework.RestartPodReadyAgainTimeout)
Expect(err).NotTo(HaveOccurred())
remaining := framework.RestartPodReadyAgainTimeout - time.Since(podCheckStart)
if !framework.CheckPodsRunningReadyOrSucceeded(c, systemNamespace, podNamesAfter, remaining) {
framework.Failf("At least one pod wasn't running and ready after the restart.")
}
}

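For context, tests tagged with a [Feature:...] label are excluded from default e2e runs and have to be selected explicitly. At the time of this commit, one typical way to run just this new test against an existing GCE cluster looked roughly like the line below; the hack/e2e.go entry point and exact flags follow the conventional e2e docs of that era and may differ in a given setup.

go run hack/e2e.go -- --provider=gce --test --test_args="--ginkgo.focus=\[Feature:Recreate\]"
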
View File

@@ -0,0 +1,91 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package gce
import (
"fmt"
"strings"
"time"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/kubernetes/test/e2e/framework"
)
func recreateNodes(c clientset.Interface, nodes []v1.Node) error {
// Build mapping from zone to nodes in that zone.
nodeNamesByZone := make(map[string][]string)
for i := range nodes {
node := &nodes[i]
zone := framework.TestContext.CloudConfig.Zone
if z, ok := node.Labels[v1.LabelZoneFailureDomain]; ok {
zone = z
}
nodeNamesByZone[zone] = append(nodeNamesByZone[zone], node.Name)
}
// Find the sole managed instance group name
var instanceGroup string
if strings.Contains(framework.TestContext.CloudConfig.NodeInstanceGroup, ",") {
return fmt.Errorf("test does not support cluster setup with more than one managed instance group: %s", framework.TestContext.CloudConfig.NodeInstanceGroup)
}
instanceGroup = framework.TestContext.CloudConfig.NodeInstanceGroup
// Recreate the nodes.
for zone, nodeNames := range nodeNamesByZone {
args := []string{
"compute",
fmt.Sprintf("--project=%s", framework.TestContext.CloudConfig.ProjectID),
"instance-groups",
"managed",
"recreate-instances",
instanceGroup,
}
args = append(args, fmt.Sprintf("--instances=%s", strings.Join(nodeNames, ",")))
args = append(args, fmt.Sprintf("--zone=%s", zone))
framework.Logf("Recreating instance group %s.", instanceGroup)
stdout, stderr, err := framework.RunCmd("gcloud", args...)
if err != nil {
return fmt.Errorf("error recreating nodes: %s\nstdout: %s\nstderr: %s", err, stdout, stderr)
}
}
return nil
}
func waitForNodeBootIdsToChange(c clientset.Interface, nodes []v1.Node, timeout time.Duration) error {
errMsg := []string{}
for i := range nodes {
node := &nodes[i]
if err := wait.Poll(30*time.Second, timeout, func() (bool, error) {
newNode, err := c.CoreV1().Nodes().Get(node.Name, metav1.GetOptions{})
if err != nil {
framework.Logf("Could not get node info: %s. Retrying in %v.", err, 30*time.Second)
return false, nil
}
return node.Status.NodeInfo.BootID != newNode.Status.NodeInfo.BootID, nil
}); err != nil {
errMsg = append(errMsg, fmt.Sprintf("error waiting for node %s boot ID to change: %s", node.Name, err.Error()))
}
}
if len(errMsg) > 0 {
return fmt.Errorf("%s", strings.Join(errMsg, ","))
}
return nil
}

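Spelled out, the argument slice built in recreateNodes corresponds to a gcloud invocation of roughly the following shape (the project, instance group, instance names, and zone below are placeholders):

gcloud compute --project=my-project instance-groups managed recreate-instances my-instance-group --instances=node-a,node-b --zone=us-central1-b

recreate-instances deletes and rebuilds each listed VM in the managed instance group, which is why the caller then waits for every node's boot ID to change before checking readiness again.
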
View File

@@ -85,6 +85,7 @@ import (
nodectlr "k8s.io/kubernetes/pkg/controller/nodelifecycle"
"k8s.io/kubernetes/pkg/controller/service"
"k8s.io/kubernetes/pkg/features"
kubepod "k8s.io/kubernetes/pkg/kubelet/pod"
"k8s.io/kubernetes/pkg/kubelet/util/format"
"k8s.io/kubernetes/pkg/master/ports"
"k8s.io/kubernetes/pkg/scheduler/algorithm/predicates"
@@ -177,6 +178,10 @@ const (
// How long PVs have to become deleted
PVDeletingTimeout = 3 * time.Minute
// How long a node is allowed to become "Ready" after it is recreated before
// the test is considered failed.
RecreateNodeReadyAgainTimeout = 10 * time.Minute
// How long a node is allowed to become "Ready" after it is restarted before
// the test is considered failed.
RestartNodeReadyAgainTimeout = 5 * time.Minute
@@ -3304,6 +3309,55 @@ func WaitForPodsReady(c clientset.Interface, ns, name string, minReadySeconds in
})
}
// WaitForNRestartablePods tries to list restartable pods using ps until it finds expect of them,
// returning their names if it can do so before timeout.
func WaitForNRestartablePods(ps *testutils.PodStore, expect int, timeout time.Duration) ([]string, error) {
var pods []*v1.Pod
var errLast error
found := wait.Poll(Poll, timeout, func() (bool, error) {
allPods := ps.List()
pods = FilterNonRestartablePods(allPods)
if len(pods) != expect {
errLast = fmt.Errorf("expected to find %d pods but found only %d", expect, len(pods))
Logf("Error getting pods: %v", errLast)
return false, nil
}
return true, nil
}) == nil
podNames := make([]string, len(pods))
for i, p := range pods {
podNames[i] = p.ObjectMeta.Name
}
if !found {
return podNames, fmt.Errorf("couldn't find %d pods within %v; last error: %v",
expect, timeout, errLast)
}
return podNames, nil
}
// FilterNonRestartablePods filters out pods that will never get recreated if deleted after termination.
func FilterNonRestartablePods(pods []*v1.Pod) []*v1.Pod {
var results []*v1.Pod
for _, p := range pods {
if isNotRestartAlwaysMirrorPod(p) {
// Mirror pods with restart policy == Never will not get
// recreated if they are deleted after the pods have
// terminated. For now, we discount such pods.
// https://github.com/kubernetes/kubernetes/issues/34003
continue
}
results = append(results, p)
}
return results
}
func isNotRestartAlwaysMirrorPod(p *v1.Pod) bool {
if !kubepod.IsMirrorPod(p) {
return false
}
return p.Spec.RestartPolicy != v1.RestartPolicyAlways
}
// Waits for the number of events on the given object to reach a desired count.
func WaitForEvents(c clientset.Interface, ns string, objOrRef runtime.Object, desiredEventsCount int) error {
return wait.Poll(Poll, 5*time.Minute, func() (bool, error) {

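As a minimal sketch of what the newly exported filter excludes, assuming kubepod.IsMirrorPod keys off the kubernetes.io/config.mirror annotation (as pkg/kubelet/pod does), and with made-up pod names:

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/kubernetes/test/e2e/framework"
)

func main() {
	// A static/mirror pod with RestartPolicy != Always: the kubelet will not
	// recreate it if it disappears after the node is rebuilt, so the test
	// should not wait for it to come back.
	mirror := &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:        "etcd-server-test-master",
			Namespace:   metav1.NamespaceSystem,
			Annotations: map[string]string{"kubernetes.io/config.mirror": "some-hash"},
		},
		Spec: v1.PodSpec{RestartPolicy: v1.RestartPolicyNever},
	}
	// A controller-managed pod with RestartPolicy Always is expected back.
	managed := &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: "kube-dns-5c446b66bd-abcde", Namespace: metav1.NamespaceSystem},
		Spec:       v1.PodSpec{RestartPolicy: v1.RestartPolicyAlways},
	}
	kept := framework.FilterNonRestartablePods([]*v1.Pod{mirror, managed})
	fmt.Println(len(kept)) // 1: only the controller-managed pod survives the filter
}
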
View File

@@ -21,7 +21,6 @@ go_library(
importpath = "k8s.io/kubernetes/test/e2e/lifecycle",
deps = [
"//pkg/apis/core:go_default_library",
"//pkg/kubelet/pod:go_default_library",
"//pkg/master/ports:go_default_library",
"//staging/src/k8s.io/api/core/v1:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
@@ -29,7 +28,6 @@ go_library(
"//staging/src/k8s.io/apimachinery/pkg/labels:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/util/sets:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/util/version:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/util/wait:go_default_library",
"//staging/src/k8s.io/client-go/discovery:go_default_library",
"//staging/src/k8s.io/client-go/kubernetes:go_default_library",
"//test/e2e/chaosmonkey:go_default_library",

View File

@@ -17,15 +17,12 @@ limitations under the License.
package lifecycle
import (
"fmt"
"time"
"k8s.io/api/core/v1"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/util/wait"
kubepod "k8s.io/kubernetes/pkg/kubelet/pod"
"k8s.io/kubernetes/test/e2e/common"
"k8s.io/kubernetes/test/e2e/framework"
testutils "k8s.io/kubernetes/test/utils"
@@ -34,28 +31,6 @@ import (
. "github.com/onsi/gomega"
)
func isNotRestartAlwaysMirrorPod(p *v1.Pod) bool {
if !kubepod.IsMirrorPod(p) {
return false
}
return p.Spec.RestartPolicy != v1.RestartPolicyAlways
}
func filterIrrelevantPods(pods []*v1.Pod) []*v1.Pod {
var results []*v1.Pod
for _, p := range pods {
if isNotRestartAlwaysMirrorPod(p) {
// Mirror pods with restart policy == Never will not get
// recreated if they are deleted after the pods have
// terminated. For now, we discount such pods.
// https://github.com/kubernetes/kubernetes/issues/34003
continue
}
results = append(results, p)
}
return results
}
func nodeNames(nodes []v1.Node) []string {
result := make([]string, 0, len(nodes))
for i := range nodes {
@@ -90,7 +65,7 @@ var _ = SIGDescribe("Restart [Disruptive]", func() {
By("ensuring all pods are running and ready")
allPods := ps.List()
pods := filterIrrelevantPods(allPods)
pods := framework.FilterNonRestartablePods(allPods)
originalPodNames = make([]string, len(pods))
for i, p := range pods {
@@ -131,7 +106,7 @@ var _ = SIGDescribe("Restart [Disruptive]", func() {
// across node restarts.
By("ensuring the same number of pods are running and ready after restart")
podCheckStart := time.Now()
podNamesAfter, err := waitForNPods(ps, len(originalPodNames), framework.RestartPodReadyAgainTimeout)
podNamesAfter, err := framework.WaitForNRestartablePods(ps, len(originalPodNames), framework.RestartPodReadyAgainTimeout)
Expect(err).NotTo(HaveOccurred())
remaining := framework.RestartPodReadyAgainTimeout - time.Since(podCheckStart)
if !framework.CheckPodsRunningReadyOrSucceeded(f.ClientSet, systemNamespace, podNamesAfter, remaining) {
@@ -141,31 +116,3 @@ var _ = SIGDescribe("Restart [Disruptive]", func() {
}
})
})
// waitForNPods tries to list pods using c until it finds expect of them,
// returning their names if it can do so before timeout.
func waitForNPods(ps *testutils.PodStore, expect int, timeout time.Duration) ([]string, error) {
// Loop until we find expect pods or timeout is passed.
var pods []*v1.Pod
var errLast error
found := wait.Poll(framework.Poll, timeout, func() (bool, error) {
allPods := ps.List()
pods = filterIrrelevantPods(allPods)
if len(pods) != expect {
errLast = fmt.Errorf("expected to find %d pods but found only %d", expect, len(pods))
framework.Logf("Error getting pods: %v", errLast)
return false, nil
}
return true, nil
}) == nil
// Extract the names of all found pods.
podNames := make([]string, len(pods))
for i, p := range pods {
podNames[i] = p.ObjectMeta.Name
}
if !found {
return podNames, fmt.Errorf("couldn't find %d pods within %v; last error: %v",
expect, timeout, errLast)
}
return podNames, nil
}