mirror of https://github.com/k3s-io/k3s
Merge pull request #48739 from yguo0905/fix-dup-metrics
Automatic merge from submit-queue Remove the status of the terminated containers in the summary endpoint Ref: https://github.com/kubernetes/kubernetes/issues/47853 - When building summary, a container is considered to be terminated if it has an older creation time and no CPU instantaneous or memory RSS usage. - We remove the terminated containers in the summary by grouping the containers with the same name in the same pod, sorting them in each group by creation time, and skipping the oldest ones with no usage in each group. Let me know if there's simpler way. **Release note**: ``` None ``` /assign @yujuhongpull/6/head
commit
f59b04b82d
|
@ -18,6 +18,7 @@ package stats
|
|||
|
||||
import (
|
||||
"fmt"
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
|
@ -128,7 +129,7 @@ func (sb *summaryBuilder) build() (*stats.Summary, error) {
|
|||
}
|
||||
|
||||
rootStats := sb.containerInfoV2ToStats("", &rootInfo)
|
||||
cStats, _ := sb.latestContainerStats(&rootInfo)
|
||||
cStats, _ := latestContainerStats(&rootInfo)
|
||||
nodeStats := stats.NodeStats{
|
||||
NodeName: sb.node.Name,
|
||||
CPU: rootStats.CPU,
|
||||
|
@ -184,7 +185,7 @@ func (sb *summaryBuilder) containerInfoV2FsStats(
|
|||
info *cadvisorapiv2.ContainerInfo,
|
||||
cs *stats.ContainerStats) {
|
||||
|
||||
lcs, found := sb.latestContainerStats(info)
|
||||
lcs, found := latestContainerStats(info)
|
||||
if !found {
|
||||
return
|
||||
}
|
||||
|
@ -230,7 +231,7 @@ func (sb *summaryBuilder) containerInfoV2FsStats(
|
|||
}
|
||||
|
||||
// latestContainerStats returns the latest container stats from cadvisor, or nil if none exist
|
||||
func (sb *summaryBuilder) latestContainerStats(info *cadvisorapiv2.ContainerInfo) (*cadvisorapiv2.ContainerStats, bool) {
|
||||
func latestContainerStats(info *cadvisorapiv2.ContainerInfo) (*cadvisorapiv2.ContainerStats, bool) {
|
||||
stats := info.Stats
|
||||
if len(stats) < 1 {
|
||||
return nil, false
|
||||
|
@ -242,12 +243,101 @@ func (sb *summaryBuilder) latestContainerStats(info *cadvisorapiv2.ContainerInfo
|
|||
return latest, true
|
||||
}
|
||||
|
||||
// hasMemoryAndCPUInstUsage returns true if the specified container info has
|
||||
// both non-zero CPU instantaneous usage and non-zero memory RSS usage, and
|
||||
// false otherwise.
|
||||
func hasMemoryAndCPUInstUsage(info *cadvisorapiv2.ContainerInfo) bool {
|
||||
if !info.Spec.HasCpu || !info.Spec.HasMemory {
|
||||
return false
|
||||
}
|
||||
cstat, found := latestContainerStats(info)
|
||||
if !found {
|
||||
return false
|
||||
}
|
||||
if cstat.CpuInst == nil {
|
||||
return false
|
||||
}
|
||||
return cstat.CpuInst.Usage.Total != 0 && cstat.Memory.RSS != 0
|
||||
}
|
||||
|
||||
// ByCreationTime implements sort.Interface for []containerInfoWithCgroup based
|
||||
// on the cinfo.Spec.CreationTime field.
|
||||
type ByCreationTime []containerInfoWithCgroup
|
||||
|
||||
func (a ByCreationTime) Len() int { return len(a) }
|
||||
func (a ByCreationTime) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
|
||||
func (a ByCreationTime) Less(i, j int) bool {
|
||||
if a[i].cinfo.Spec.CreationTime.Equal(a[j].cinfo.Spec.CreationTime) {
|
||||
// There shouldn't be two containers with the same name and/or the same
|
||||
// creation time. However, to make the logic here robust, we break the
|
||||
// tie by moving the one without CPU instantaneous or memory RSS usage
|
||||
// to the beginning.
|
||||
return hasMemoryAndCPUInstUsage(&a[j].cinfo)
|
||||
}
|
||||
return a[i].cinfo.Spec.CreationTime.Before(a[j].cinfo.Spec.CreationTime)
|
||||
}
|
||||
|
||||
// containerID is the identity of a container in a pod.
|
||||
type containerID struct {
|
||||
podRef stats.PodReference
|
||||
containerName string
|
||||
}
|
||||
|
||||
// containerInfoWithCgroup contains the ContainerInfo and its cgroup name.
|
||||
type containerInfoWithCgroup struct {
|
||||
cinfo cadvisorapiv2.ContainerInfo
|
||||
cgroup string
|
||||
}
|
||||
|
||||
// removeTerminatedContainerInfo returns the specified containerInfo but with
|
||||
// the stats of the terminated containers removed.
|
||||
//
|
||||
// A ContainerInfo is considered to be of a terminated container if it has an
|
||||
// older CreationTime and zero CPU instantaneous and memory RSS usage.
|
||||
func removeTerminatedContainerInfo(containerInfo map[string]cadvisorapiv2.ContainerInfo) map[string]cadvisorapiv2.ContainerInfo {
|
||||
cinfoMap := make(map[containerID][]containerInfoWithCgroup)
|
||||
for key, cinfo := range containerInfo {
|
||||
if !isPodManagedContainer(&cinfo) {
|
||||
continue
|
||||
}
|
||||
cinfoID := containerID{
|
||||
podRef: buildPodRef(&cinfo),
|
||||
containerName: types.GetContainerName(cinfo.Spec.Labels),
|
||||
}
|
||||
cinfoMap[cinfoID] = append(cinfoMap[cinfoID], containerInfoWithCgroup{
|
||||
cinfo: cinfo,
|
||||
cgroup: key,
|
||||
})
|
||||
}
|
||||
result := make(map[string]cadvisorapiv2.ContainerInfo)
|
||||
for _, refs := range cinfoMap {
|
||||
if len(refs) == 1 {
|
||||
result[refs[0].cgroup] = refs[0].cinfo
|
||||
continue
|
||||
}
|
||||
sort.Sort(ByCreationTime(refs))
|
||||
i := 0
|
||||
for ; i < len(refs); i++ {
|
||||
if hasMemoryAndCPUInstUsage(&refs[i].cinfo) {
|
||||
// Stops removing when we first see an info with non-zero
|
||||
// CPU/Memory usage.
|
||||
break
|
||||
}
|
||||
}
|
||||
for ; i < len(refs); i++ {
|
||||
result[refs[i].cgroup] = refs[i].cinfo
|
||||
}
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// buildSummaryPods aggregates and returns the container stats in cinfos by the Pod managing the container.
|
||||
// Containers not managed by a Pod are omitted.
|
||||
func (sb *summaryBuilder) buildSummaryPods() []stats.PodStats {
|
||||
// Map each container to a pod and update the PodStats with container data
|
||||
podToStats := map[stats.PodReference]*stats.PodStats{}
|
||||
for key, cinfo := range sb.infos {
|
||||
infos := removeTerminatedContainerInfo(sb.infos)
|
||||
for key, cinfo := range infos {
|
||||
// on systemd using devicemapper each mount into the container has an associated cgroup.
|
||||
// we ignore them to ensure we do not get duplicate entries in our summary.
|
||||
// for details on .mount units: http://man7.org/linux/man-pages/man5/systemd.mount.5.html
|
||||
|
@ -255,10 +345,10 @@ func (sb *summaryBuilder) buildSummaryPods() []stats.PodStats {
|
|||
continue
|
||||
}
|
||||
// Build the Pod key if this container is managed by a Pod
|
||||
if !sb.isPodManagedContainer(&cinfo) {
|
||||
if !isPodManagedContainer(&cinfo) {
|
||||
continue
|
||||
}
|
||||
ref := sb.buildPodRef(&cinfo)
|
||||
ref := buildPodRef(&cinfo)
|
||||
|
||||
// Lookup the PodStats for the pod using the PodRef. If none exists, initialize a new entry.
|
||||
podStats, found := podToStats[ref]
|
||||
|
@ -292,7 +382,7 @@ func (sb *summaryBuilder) buildSummaryPods() []stats.PodStats {
|
|||
}
|
||||
|
||||
// buildPodRef returns a PodReference that identifies the Pod managing cinfo
|
||||
func (sb *summaryBuilder) buildPodRef(cinfo *cadvisorapiv2.ContainerInfo) stats.PodReference {
|
||||
func buildPodRef(cinfo *cadvisorapiv2.ContainerInfo) stats.PodReference {
|
||||
podName := types.GetPodName(cinfo.Spec.Labels)
|
||||
podNamespace := types.GetPodNamespace(cinfo.Spec.Labels)
|
||||
podUID := types.GetPodUID(cinfo.Spec.Labels)
|
||||
|
@ -300,7 +390,7 @@ func (sb *summaryBuilder) buildPodRef(cinfo *cadvisorapiv2.ContainerInfo) stats.
|
|||
}
|
||||
|
||||
// isPodManagedContainer returns true if the cinfo container is managed by a Pod
|
||||
func (sb *summaryBuilder) isPodManagedContainer(cinfo *cadvisorapiv2.ContainerInfo) bool {
|
||||
func isPodManagedContainer(cinfo *cadvisorapiv2.ContainerInfo) bool {
|
||||
podName := types.GetPodName(cinfo.Spec.Labels)
|
||||
podNamespace := types.GetPodNamespace(cinfo.Spec.Labels)
|
||||
managed := podName != "" && podNamespace != ""
|
||||
|
@ -319,7 +409,7 @@ func (sb *summaryBuilder) containerInfoV2ToStats(
|
|||
StartTime: metav1.NewTime(info.Spec.CreationTime),
|
||||
Name: name,
|
||||
}
|
||||
cstat, found := sb.latestContainerStats(info)
|
||||
cstat, found := latestContainerStats(info)
|
||||
if !found {
|
||||
return cStats
|
||||
}
|
||||
|
@ -371,7 +461,7 @@ func (sb *summaryBuilder) containerInfoV2ToNetworkStats(name string, info *cadvi
|
|||
if !info.Spec.HasNetwork {
|
||||
return nil
|
||||
}
|
||||
cstat, found := sb.latestContainerStats(info)
|
||||
cstat, found := latestContainerStats(info)
|
||||
if !found {
|
||||
return nil
|
||||
}
|
||||
|
|
|
@ -53,6 +53,50 @@ var (
|
|||
creationTime = timestamp.Add(-5 * time.Minute)
|
||||
)
|
||||
|
||||
func TestRemoveTerminatedContainerInfo(t *testing.T) {
|
||||
const (
|
||||
seedPastPod0Infra = 1000
|
||||
seedPastPod0Container0 = 2000
|
||||
seedPod0Infra = 3000
|
||||
seedPod0Container0 = 4000
|
||||
)
|
||||
const (
|
||||
namespace = "test"
|
||||
pName0 = "pod0"
|
||||
cName00 = "c0"
|
||||
)
|
||||
infos := map[string]v2.ContainerInfo{
|
||||
// ContainerInfo with past creation time and no CPU/memory usage for
|
||||
// simulating uncleaned cgroups of already terminated containers, which
|
||||
// should not be shown in the results.
|
||||
"/pod0-i-terminated-1": summaryTerminatedContainerInfo(seedPastPod0Infra, pName0, namespace, leaky.PodInfraContainerName),
|
||||
"/pod0-c0-terminated-1": summaryTerminatedContainerInfo(seedPastPod0Container0, pName0, namespace, cName00),
|
||||
|
||||
// Same as above but uses the same creation time as the latest
|
||||
// containers. They are terminated containers, so they should not be in
|
||||
// the results.
|
||||
"/pod0-i-terminated-2": summaryTerminatedContainerInfo(seedPod0Infra, pName0, namespace, leaky.PodInfraContainerName),
|
||||
"/pod0-c0-terminated-2": summaryTerminatedContainerInfo(seedPod0Container0, pName0, namespace, cName00),
|
||||
|
||||
// The latest containers, which should be in the results.
|
||||
"/pod0-i": summaryTestContainerInfo(seedPod0Infra, pName0, namespace, leaky.PodInfraContainerName),
|
||||
"/pod0-c0": summaryTestContainerInfo(seedPod0Container0, pName0, namespace, cName00),
|
||||
|
||||
// Duplicated containers with non-zero CPU and memory usage. This case
|
||||
// shouldn't happen unless something goes wrong, but we want to test
|
||||
// that the metrics reporting logic works in this scenario.
|
||||
"/pod0-i-duplicated": summaryTestContainerInfo(seedPod0Infra, pName0, namespace, leaky.PodInfraContainerName),
|
||||
"/pod0-c0-duplicated": summaryTestContainerInfo(seedPod0Container0, pName0, namespace, cName00),
|
||||
}
|
||||
output := removeTerminatedContainerInfo(infos)
|
||||
assert.Len(t, output, 4)
|
||||
for _, c := range []string{"/pod0-i", "/pod0-c0", "/pod0-i-duplicated", "/pod0-c0-duplicated"} {
|
||||
if _, found := output[c]; !found {
|
||||
t.Errorf("%q is expected to be in the output\n", c)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildSummary(t *testing.T) {
|
||||
node := k8sv1.Node{}
|
||||
node.Name = "FooNode"
|
||||
|
@ -280,6 +324,13 @@ func generateCustomMetrics(spec []v1.MetricSpec) map[string][]v1.MetricVal {
|
|||
return ret
|
||||
}
|
||||
|
||||
func summaryTerminatedContainerInfo(seed int, podName string, podNamespace string, containerName string) v2.ContainerInfo {
|
||||
cinfo := summaryTestContainerInfo(seed, podName, podNamespace, containerName)
|
||||
cinfo.Stats[0].Memory.RSS = 0
|
||||
cinfo.Stats[0].CpuInst.Usage.Total = 0
|
||||
return cinfo
|
||||
}
|
||||
|
||||
func summaryTestContainerInfo(seed int, podName string, podNamespace string, containerName string) v2.ContainerInfo {
|
||||
labels := map[string]string{}
|
||||
if podName != "" {
|
||||
|
|
Loading…
Reference in New Issue