Merge pull request #75063 from wangzhen127/npd-test-fix

Fix NPD e2e test on Ubuntu node and update NPD container version
pull/564/head
Kubernetes Prow Robot 2019-03-08 14:19:09 -08:00 committed by GitHub
commit d778b9308a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 87 additions and 31 deletions

View File

@ -1,6 +1,7 @@
# Maintainers
Random-Liu <lantaol@google.com>
wangzhen127 <zhenw@google.com>
[![Analytics](https://kubernetes-site.appspot.com/UA-36037335-10/GitHub/cluster/addons/node-problem-detector/MAINTAINERS.md?pixel)]()

View File

@ -0,0 +1,8 @@
# See the OWNERS docs at https://go.k8s.io/owners
approvers:
- Random-Liu
- wangzhen127
reviewers:
- Random-Liu
- wangzhen127

View File

@ -26,28 +26,28 @@ subjects:
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: npd-v0.4.1
name: npd-v0.6.2
namespace: kube-system
labels:
k8s-app: node-problem-detector
version: v0.4.1
version: v0.6.2
kubernetes.io/cluster-service: "true"
addonmanager.kubernetes.io/mode: Reconcile
spec:
selector:
matchLabels:
k8s-app: node-problem-detector
version: v0.4.1
version: v0.6.2
template:
metadata:
labels:
k8s-app: node-problem-detector
version: v0.4.1
version: v0.6.2
kubernetes.io/cluster-service: "true"
spec:
containers:
- name: node-problem-detector
image: k8s.gcr.io/node-problem-detector:v0.4.1
image: k8s.gcr.io/node-problem-detector:v0.6.2
command:
- "/bin/sh"
- "-c"

View File

@ -281,8 +281,8 @@ func HighLatencyKubeletOperations(c clientset.Interface, threshold time.Duration
return badMetrics, nil
}
// getStatsSummary contacts kubelet for the container information.
func getStatsSummary(c clientset.Interface, nodeName string) (*stats.Summary, error) {
// GetStatsSummary contacts kubelet for the container information.
func GetStatsSummary(c clientset.Interface, nodeName string) (*stats.Summary, error) {
ctx, cancel := context.WithTimeout(context.Background(), SingleCallTimeout)
defer cancel()
@ -348,7 +348,7 @@ func getOneTimeResourceUsageOnNode(
return nil, fmt.Errorf("numStats needs to be > 1 and < %d", maxNumStatsToRequest)
}
// Get information of all containers on the node.
summary, err := getStatsSummary(c, nodeName)
summary, err := GetStatsSummary(c, nodeName)
if err != nil {
return nil, err
}

View File

@ -18,6 +18,7 @@ package node
import (
"fmt"
"net"
"sort"
"strconv"
"strings"
@ -51,13 +52,21 @@ var _ = SIGDescribe("NodeProblemDetector", func() {
})
It("should run without error", func() {
By("Getting all nodes' SSH-able IP addresses")
hosts, err := framework.NodeSSHHosts(f.ClientSet)
if err != nil {
framework.Failf("Error getting node hostnames: %v", err)
By("Getting all nodes and their SSH-able IP addresses")
nodes := framework.GetReadySchedulableNodesOrDie(f.ClientSet)
Expect(len(nodes.Items)).NotTo(BeZero())
hosts := []string{}
for _, node := range nodes.Items {
for _, addr := range node.Status.Addresses {
if addr.Type == v1.NodeExternalIP {
hosts = append(hosts, net.JoinHostPort(addr.Address, "22"))
break
}
}
}
Expect(len(hosts)).NotTo(BeZero())
Expect(len(hosts)).To(Equal(len(nodes.Items)))
isStandaloneMode := make(map[string]bool)
cpuUsageStats := make(map[string][]float64)
uptimeStats := make(map[string][]float64)
rssStats := make(map[string][]float64)
@ -69,12 +78,16 @@ var _ = SIGDescribe("NodeProblemDetector", func() {
rssStats[host] = []float64{}
workingSetStats[host] = []float64{}
cmd := "systemctl status node-problem-detector.service"
result, err := framework.SSH(cmd, host, framework.TestContext.Provider)
isStandaloneMode[host] = (err == nil && result.Code == 0)
By(fmt.Sprintf("Check node %q has node-problem-detector process", host))
// Using brackets "[n]" is a trick to prevent grep command itself from
// showing up, because string text "[n]ode-problem-detector" does not
// match regular expression "[n]ode-problem-detector".
psCmd := "ps aux | grep [n]ode-problem-detector"
result, err := framework.SSH(psCmd, host, framework.TestContext.Provider)
result, err = framework.SSH(psCmd, host, framework.TestContext.Provider)
framework.ExpectNoError(err)
Expect(result.Code).To(BeZero())
Expect(result.Stdout).To(ContainSubstring("node-problem-detector"))
@ -86,9 +99,11 @@ var _ = SIGDescribe("NodeProblemDetector", func() {
Expect(result.Code).To(BeZero())
Expect(result.Stdout).NotTo(ContainSubstring("node-problem-detector.service: Failed"))
cpuUsage, uptime := getCpuStat(f, host)
cpuUsageStats[host] = append(cpuUsageStats[host], cpuUsage)
uptimeStats[host] = append(uptimeStats[host], uptime)
if isStandaloneMode[host] {
cpuUsage, uptime := getCpuStat(f, host)
cpuUsageStats[host] = append(cpuUsageStats[host], cpuUsage)
uptimeStats[host] = append(uptimeStats[host], uptime)
}
By(fmt.Sprintf("Inject log to trigger AUFSUmountHung on node %q", host))
log := "INFO: task umount.aufs:21568 blocked for more than 120 seconds."
@ -99,8 +114,6 @@ var _ = SIGDescribe("NodeProblemDetector", func() {
}
By("Check node-problem-detector can post conditions and events to API server")
nodes := framework.GetReadySchedulableNodesOrDie(f.ClientSet)
Expect(len(nodes.Items)).To(Equal(len(hosts)))
for _, node := range nodes.Items {
By(fmt.Sprintf("Check node-problem-detector posted KernelDeadlock condition on node %q", node.Name))
Eventually(func() error {
@ -117,14 +130,21 @@ var _ = SIGDescribe("NodeProblemDetector", func() {
By("Gather node-problem-detector cpu and memory stats")
numIterations := 60
for i := 1; i <= numIterations; i++ {
for _, host := range hosts {
rss, workingSet := getMemoryStat(f, host)
rssStats[host] = append(rssStats[host], rss)
workingSetStats[host] = append(workingSetStats[host], workingSet)
if i == numIterations {
cpuUsage, uptime := getCpuStat(f, host)
for j, host := range hosts {
if isStandaloneMode[host] {
rss, workingSet := getMemoryStat(f, host)
rssStats[host] = append(rssStats[host], rss)
workingSetStats[host] = append(workingSetStats[host], workingSet)
if i == numIterations {
cpuUsage, uptime := getCpuStat(f, host)
cpuUsageStats[host] = append(cpuUsageStats[host], cpuUsage)
uptimeStats[host] = append(uptimeStats[host], uptime)
}
} else {
cpuUsage, rss, workingSet := getNpdPodStat(f, nodes.Items[j].Name)
cpuUsageStats[host] = append(cpuUsageStats[host], cpuUsage)
uptimeStats[host] = append(uptimeStats[host], uptime)
rssStats[host] = append(rssStats[host], rss)
workingSetStats[host] = append(workingSetStats[host], workingSet)
}
}
time.Sleep(time.Second)
@ -134,16 +154,24 @@ var _ = SIGDescribe("NodeProblemDetector", func() {
rssStatsMsg := "RSS (MB):"
workingSetStatsMsg := "WorkingSet (MB):"
for i, host := range hosts {
cpuUsage := cpuUsageStats[host][1] - cpuUsageStats[host][0]
totaltime := uptimeStats[host][1] - uptimeStats[host][0]
cpuStatsMsg += fmt.Sprintf(" Node%d[%.3f];", i, cpuUsage/totaltime)
if isStandaloneMode[host] {
// When in standalone mode, NPD runs as a systemd service. We calculate
// its CPU usage from the difference between two cgroup cpuacct readings.
cpuUsage := cpuUsageStats[host][1] - cpuUsageStats[host][0]
totaltime := uptimeStats[host][1] - uptimeStats[host][0]
cpuStatsMsg += fmt.Sprintf(" %s[%.3f];", nodes.Items[i].Name, cpuUsage/totaltime)
} else {
sort.Float64s(cpuUsageStats[host])
cpuStatsMsg += fmt.Sprintf(" %s[%.3f|%.3f|%.3f];", nodes.Items[i].Name,
cpuUsageStats[host][0], cpuUsageStats[host][len(cpuUsageStats[host])/2], cpuUsageStats[host][len(cpuUsageStats[host])-1])
}
sort.Float64s(rssStats[host])
rssStatsMsg += fmt.Sprintf(" Node%d[%.1f|%.1f|%.1f];", i,
rssStatsMsg += fmt.Sprintf(" %s[%.1f|%.1f|%.1f];", nodes.Items[i].Name,
rssStats[host][0], rssStats[host][len(rssStats[host])/2], rssStats[host][len(rssStats[host])-1])
sort.Float64s(workingSetStats[host])
workingSetStatsMsg += fmt.Sprintf(" Node%d[%.1f|%.1f|%.1f];", i,
workingSetStatsMsg += fmt.Sprintf(" %s[%.1f|%.1f|%.1f];", nodes.Items[i].Name,
workingSetStats[host][0], workingSetStats[host][len(workingSetStats[host])/2], workingSetStats[host][len(workingSetStats[host])-1])
}
framework.Logf("Node-Problem-Detector CPU and Memory Stats:\n\t%s\n\t%s\n\t%s", cpuStatsMsg, rssStatsMsg, workingSetStatsMsg)
@ -233,3 +261,22 @@ func getCpuStat(f *framework.Framework, host string) (usage, uptime float64) {
usage *= 1e-9
return
}
// getNpdPodStat reads the kubelet stats summary for nodeName and returns the
// resource usage of the node-problem-detector pod on that node: cpuUsage in
// cores (nanocores scaled by 1e-9) and rss / workingSet in MB (bytes divided
// by 1024*1024).
//
// The pod is taken to be the first entry in the summary whose name starts
// with "npd"; the test fails if the summary request errors or no such pod is
// present.
//
// Individual stats in the summary are optional pointers. A stat the kubelet
// has not populated is logged and left at zero instead of being dereferenced,
// which would otherwise panic with a nil pointer dereference.
func getNpdPodStat(f *framework.Framework, nodeName string) (cpuUsage, rss, workingSet float64) {
	summary, err := framework.GetStatsSummary(f.ClientSet, nodeName)
	framework.ExpectNoError(err)

	hasNpdPod := false
	for _, pod := range summary.Pods {
		if !strings.HasPrefix(pod.PodRef.Name, "npd") {
			continue
		}
		hasNpdPod = true

		if pod.CPU != nil && pod.CPU.UsageNanoCores != nil {
			cpuUsage = float64(*pod.CPU.UsageNanoCores) * 1e-9
		} else {
			framework.Logf("CPU usage is not available for pod %q on node %q", pod.PodRef.Name, nodeName)
		}

		if pod.Memory != nil && pod.Memory.RSSBytes != nil {
			rss = float64(*pod.Memory.RSSBytes) / 1024 / 1024
		} else {
			framework.Logf("RSS is not available for pod %q on node %q", pod.PodRef.Name, nodeName)
		}

		if pod.Memory != nil && pod.Memory.WorkingSetBytes != nil {
			workingSet = float64(*pod.Memory.WorkingSetBytes) / 1024 / 1024
		} else {
			framework.Logf("Working set is not available for pod %q on node %q", pod.PodRef.Name, nodeName)
		}

		// Only the first matching pod is reported.
		break
	}
	Expect(hasNpdPod).To(BeTrue())
	return
}