From 84b0e82db4d485ac474e59a34e4eba878c755944 Mon Sep 17 00:00:00 2001 From: abhi Date: Wed, 1 Nov 2017 07:06:01 -0700 Subject: [PATCH 1/2] Integrating cadvisor stats to CRI Pod stats collection This commit addresses the issue described here https://github.com/kubernetes-incubator/cri-containerd/issues/341 The changes include using cadvisor stats in addition to CRI stats for CRI runtimes. As described in the issue above , the CRI stats currently doesnt provide all the necessary stats for the kubelet. This commit addreses the need to extract stats from cadvisor which is not available as CRI stats. Signed-off-by: abhi --- pkg/kubelet/stats/cadvisor_stats_provider.go | 33 +++++---- pkg/kubelet/stats/cri_stats_provider.go | 77 +++++++++++++++++++- pkg/kubelet/stats/cri_stats_provider_test.go | 3 +- pkg/kubelet/stats/helper.go | 47 +++++++----- 4 files changed, 122 insertions(+), 38 deletions(-) diff --git a/pkg/kubelet/stats/cadvisor_stats_provider.go b/pkg/kubelet/stats/cadvisor_stats_provider.go index b498c5158e..17a10df2f6 100644 --- a/pkg/kubelet/stats/cadvisor_stats_provider.go +++ b/pkg/kubelet/stats/cadvisor_stats_provider.go @@ -76,20 +76,9 @@ func (p *cadvisorStatsProvider) ListPodStats() ([]statsapi.PodStats, error) { if err != nil { return nil, fmt.Errorf("failed to get imageFs info: %v", err) } - - infos, err := p.cadvisor.ContainerInfoV2("/", cadvisorapiv2.RequestOptions{ - IdType: cadvisorapiv2.TypeName, - Count: 2, // 2 samples are needed to compute "instantaneous" CPU - Recursive: true, - }) + infos, err := getCadvisorContainerInfo(p.cadvisor) if err != nil { - if _, ok := infos["/"]; ok { - // If the failure is partial, log it and return a best-effort - // response. - glog.Errorf("Partial failure issuing cadvisor.ContainerInfoV2: %v", err) - } else { - return nil, fmt.Errorf("failed to get root cgroup stats: %v", err) - } + return nil, fmt.Errorf("failed to get container info from cadvisor: %v", err) } // removeTerminatedContainerInfo will also remove pod level cgroups, so save the infos into allInfos first allInfos := infos @@ -352,3 +341,21 @@ func hasMemoryAndCPUInstUsage(info *cadvisorapiv2.ContainerInfo) bool { } return cstat.CpuInst.Usage.Total != 0 && cstat.Memory.RSS != 0 } + +func getCadvisorContainerInfo(ca cadvisor.Interface) (map[string]cadvisorapiv2.ContainerInfo, error) { + infos, err := ca.ContainerInfoV2("/", cadvisorapiv2.RequestOptions{ + IdType: cadvisorapiv2.TypeName, + Count: 2, // 2 samples are needed to compute "instantaneous" CPU + Recursive: true, + }) + if err != nil { + if _, ok := infos["/"]; ok { + // If the failure is partial, log it and return a best-effort + // response. + glog.Errorf("Partial failure issuing cadvisor.ContainerInfoV2: %v", err) + } else { + return nil, fmt.Errorf("failed to get root cgroup stats: %v", err) + } + } + return infos, nil +} diff --git a/pkg/kubelet/stats/cri_stats_provider.go b/pkg/kubelet/stats/cri_stats_provider.go index afc364068b..5934c2962b 100644 --- a/pkg/kubelet/stats/cri_stats_provider.go +++ b/pkg/kubelet/stats/cri_stats_provider.go @@ -18,7 +18,9 @@ package stats import ( "fmt" + "path" "sort" + "strings" "time" "github.com/golang/glog" @@ -112,6 +114,11 @@ func (p *criStatsProvider) ListPodStats() ([]statsapi.PodStats, error) { containerMap[c.Id] = c } + caInfos, err := getCRICadvisorStats(p.cadvisor) + if err != nil { + return nil, fmt.Errorf("failed to get container info from cadvisor: %v", err) + } + for _, stats := range resp { containerID := stats.Attributes.Id container, found := containerMap[containerID] @@ -132,10 +139,25 @@ func (p *criStatsProvider) ListPodStats() ([]statsapi.PodStats, error) { ps, found := sandboxIDToPodStats[podSandboxID] if !found { ps = buildPodStats(podSandbox) + // Fill stats from cadvisor is available for full set of required pod stats + caPodSandbox, found := caInfos[podSandboxID] + if !found { + glog.V(4).Info("Unable to find cadvisor stats for sandbox %q", podSandboxID) + } else { + p.addCadvisorPodStats(ps, &caPodSandbox) + } sandboxIDToPodStats[podSandboxID] = ps } - containerStats := p.makeContainerStats(stats, container, &rootFsInfo, uuidToFsInfo) - ps.Containers = append(ps.Containers, *containerStats) + cs := p.makeContainerStats(stats, container, &rootFsInfo, uuidToFsInfo) + // If cadvisor stats is available for the container, use it to populate + // container stats + caStats, caFound := caInfos[containerID] + if !caFound { + glog.V(4).Info("Unable to find cadvisor stats for %q", containerID) + } else { + p.addCadvisorContainerStats(cs, &caStats) + } + ps.Containers = append(ps.Containers, *cs) } result := make([]statsapi.PodStats, 0, len(sandboxIDToPodStats)) @@ -201,7 +223,7 @@ func (p *criStatsProvider) getFsInfo(storageID *runtimeapi.StorageIdentifier) *c return &fsInfo } -// buildPodRef returns a PodStats that identifies the Pod managing cinfo +// buildPodStats returns a PodStats that identifies the Pod managing cinfo func buildPodStats(podSandbox *runtimeapi.PodSandbox) *statsapi.PodStats { return &statsapi.PodStats{ PodRef: statsapi.PodReference{ @@ -211,7 +233,6 @@ func buildPodStats(podSandbox *runtimeapi.PodSandbox) *statsapi.PodStats { }, // The StartTime in the summary API is the pod creation time. StartTime: metav1.NewTime(time.Unix(0, podSandbox.CreatedAt)), - // Network stats are not supported by CRI. } } @@ -226,6 +247,13 @@ func (p *criStatsProvider) makePodStorageStats(s *statsapi.PodStats, rootFsInfo return s } +func (p *criStatsProvider) addCadvisorPodStats( + ps *statsapi.PodStats, + caPodSandbox *cadvisorapiv2.ContainerInfo, +) { + ps.Network = cadvisorInfoToNetworkStats(ps.PodRef.Name, caPodSandbox) +} + func (p *criStatsProvider) makeContainerStats( stats *runtimeapi.ContainerStats, container *runtimeapi.Container, @@ -336,3 +364,44 @@ func removeTerminatedContainer(containers []*runtimeapi.Container) []*runtimeapi } return result } + +func (p *criStatsProvider) addCadvisorContainerStats( + cs *statsapi.ContainerStats, + caPodStats *cadvisorapiv2.ContainerInfo, +) { + if caPodStats.Spec.HasCustomMetrics { + cs.UserDefinedMetrics = cadvisorInfoToUserDefinedMetrics(caPodStats) + } + + cpu, memory := cadvisorInfoToCPUandMemoryStats(caPodStats) + if cpu != nil { + cs.CPU = cpu + } + if memory != nil { + cs.Memory = memory + } +} + +func getCRICadvisorStats(ca cadvisor.Interface) (map[string]cadvisorapiv2.ContainerInfo, error) { + stats := make(map[string]cadvisorapiv2.ContainerInfo) + infos, err := getCadvisorContainerInfo(ca) + if err != nil { + return nil, fmt.Errorf("failed to fetch cadvisor stats: %v", err) + } + infos = removeTerminatedContainerInfo(infos) + for key, info := range infos { + // On systemd using devicemapper each mount into the container has an + // associated cgroup. We ignore them to ensure we do not get duplicate + // entries in our summary. For details on .mount units: + // http://man7.org/linux/man-pages/man5/systemd.mount.5.html + if strings.HasSuffix(key, ".mount") { + continue + } + // Build the Pod key if this container is managed by a Pod + if !isPodManagedContainer(&info) { + continue + } + stats[path.Base(key)] = info + } + return stats, nil +} diff --git a/pkg/kubelet/stats/cri_stats_provider_test.go b/pkg/kubelet/stats/cri_stats_provider_test.go index b966d0ba80..c17793e87b 100644 --- a/pkg/kubelet/stats/cri_stats_provider_test.go +++ b/pkg/kubelet/stats/cri_stats_provider_test.go @@ -22,9 +22,8 @@ import ( "time" cadvisorfs "github.com/google/cadvisor/fs" - "github.com/stretchr/testify/assert" - cadvisorapiv2 "github.com/google/cadvisor/info/v2" + "github.com/stretchr/testify/assert" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" critest "k8s.io/kubernetes/pkg/kubelet/apis/cri/testing" runtimeapi "k8s.io/kubernetes/pkg/kubelet/apis/cri/v1alpha1/runtime" diff --git a/pkg/kubelet/stats/helper.go b/pkg/kubelet/stats/helper.go index 6315c315f2..e4fde7d163 100644 --- a/pkg/kubelet/stats/helper.go +++ b/pkg/kubelet/stats/helper.go @@ -86,29 +86,12 @@ func cadvisorInfoToContainerStats(name string, info *cadvisorapiv2.ContainerInfo if rootFs != nil { // The container logs live on the node rootfs device - result.Logs = &statsapi.FsStats{ - Time: metav1.NewTime(cstat.Timestamp), - AvailableBytes: &rootFs.Available, - CapacityBytes: &rootFs.Capacity, - InodesFree: rootFs.InodesFree, - Inodes: rootFs.Inodes, - } - - if rootFs.Inodes != nil && rootFs.InodesFree != nil { - logsInodesUsed := *rootFs.Inodes - *rootFs.InodesFree - result.Logs.InodesUsed = &logsInodesUsed - } + result.Logs = buildLogsStats(cstat, rootFs) } if imageFs != nil { // The container rootFs lives on the imageFs devices (which may not be the node root fs) - result.Rootfs = &statsapi.FsStats{ - Time: metav1.NewTime(cstat.Timestamp), - AvailableBytes: &imageFs.Available, - CapacityBytes: &imageFs.Capacity, - InodesFree: imageFs.InodesFree, - Inodes: imageFs.Inodes, - } + result.Rootfs = buildRootfsStats(cstat, imageFs) } cfs := cstat.Filesystem @@ -274,3 +257,29 @@ func getCgroupStats(cadvisor cadvisor.Interface, containerName string) (*cadviso } return stats, nil } + +func buildLogsStats(cstat *cadvisorapiv2.ContainerStats, rootFs *cadvisorapiv2.FsInfo) *statsapi.FsStats { + fsStats := &statsapi.FsStats{ + Time: metav1.NewTime(cstat.Timestamp), + AvailableBytes: &rootFs.Available, + CapacityBytes: &rootFs.Capacity, + InodesFree: rootFs.InodesFree, + Inodes: rootFs.Inodes, + } + + if rootFs.Inodes != nil && rootFs.InodesFree != nil { + logsInodesUsed := *rootFs.Inodes - *rootFs.InodesFree + fsStats.InodesUsed = &logsInodesUsed + } + return fsStats +} + +func buildRootfsStats(cstat *cadvisorapiv2.ContainerStats, imageFs *cadvisorapiv2.FsInfo) *statsapi.FsStats { + return &statsapi.FsStats{ + Time: metav1.NewTime(cstat.Timestamp), + AvailableBytes: &imageFs.Available, + CapacityBytes: &imageFs.Capacity, + InodesFree: imageFs.InodesFree, + Inodes: imageFs.Inodes, + } +} From e19f213027a4184fa87e525aaf58eac07b4c63e9 Mon Sep 17 00:00:00 2001 From: abhi Date: Wed, 15 Nov 2017 09:43:44 -0800 Subject: [PATCH 2/2] Modifying cri stats test cases This commit container modification to cri stats test to verify CPU, Memory, Network stats collected by cadvisor. Signed-off-by: abhi --- pkg/kubelet/stats/cri_stats_provider_test.go | 105 ++++++++++++++----- 1 file changed, 81 insertions(+), 24 deletions(-) diff --git a/pkg/kubelet/stats/cri_stats_provider_test.go b/pkg/kubelet/stats/cri_stats_provider_test.go index c17793e87b..5cff0ac7e9 100644 --- a/pkg/kubelet/stats/cri_stats_provider_test.go +++ b/pkg/kubelet/stats/cri_stats_provider_test.go @@ -30,11 +30,39 @@ import ( statsapi "k8s.io/kubernetes/pkg/kubelet/apis/stats/v1alpha1" cadvisortest "k8s.io/kubernetes/pkg/kubelet/cadvisor/testing" kubecontainertest "k8s.io/kubernetes/pkg/kubelet/container/testing" + "k8s.io/kubernetes/pkg/kubelet/leaky" kubepodtest "k8s.io/kubernetes/pkg/kubelet/pod/testing" serverstats "k8s.io/kubernetes/pkg/kubelet/server/stats" ) func TestCRIListPodStats(t *testing.T) { + const ( + seedRoot = 0 + seedRuntime = 100 + seedKubelet = 200 + seedMisc = 300 + seedSandbox0 = 1000 + seedContainer0 = 2000 + seedSandbox1 = 3000 + seedContainer1 = 4000 + seedContainer2 = 5000 + seedSandbox2 = 6000 + seedContainer3 = 7000 + ) + + const ( + pName0 = "pod0" + pName1 = "pod1" + pName2 = "pod2" + ) + + const ( + cName0 = "container0-name" + cName1 = "container1-name" + cName2 = "container2-name" + cName3 = "container3-name" + ) + var ( imageFsStorageUUID = "imagefs-storage-uuid" unknownStorageUUID = "unknown-storage-uuid" @@ -42,19 +70,19 @@ func TestCRIListPodStats(t *testing.T) { rootFsInfo = getTestFsInfo(1000) sandbox0 = makeFakePodSandbox("sandbox0-name", "sandbox0-uid", "sandbox0-ns") - container0 = makeFakeContainer(sandbox0, "container0-name", 0, false) + container0 = makeFakeContainer(sandbox0, cName0, 0, false) containerStats0 = makeFakeContainerStats(container0, imageFsStorageUUID) - container1 = makeFakeContainer(sandbox0, "container1-name", 0, false) + container1 = makeFakeContainer(sandbox0, cName1, 0, false) containerStats1 = makeFakeContainerStats(container1, unknownStorageUUID) sandbox1 = makeFakePodSandbox("sandbox1-name", "sandbox1-uid", "sandbox1-ns") - container2 = makeFakeContainer(sandbox1, "container2-name", 0, false) + container2 = makeFakeContainer(sandbox1, cName2, 0, false) containerStats2 = makeFakeContainerStats(container2, imageFsStorageUUID) sandbox2 = makeFakePodSandbox("sandbox2-name", "sandbox2-uid", "sandbox2-ns") - container3 = makeFakeContainer(sandbox2, "container3-name", 0, true) + container3 = makeFakeContainer(sandbox2, cName3, 0, true) containerStats3 = makeFakeContainerStats(container3, imageFsStorageUUID) - container4 = makeFakeContainer(sandbox2, "container3-name", 1, false) + container4 = makeFakeContainer(sandbox2, cName3, 1, false) containerStats4 = makeFakeContainerStats(container4, imageFsStorageUUID) ) @@ -67,7 +95,27 @@ func TestCRIListPodStats(t *testing.T) { fakeImageService = critest.NewFakeImageService() ) + infos := map[string]cadvisorapiv2.ContainerInfo{ + "/": getTestContainerInfo(seedRoot, "", "", ""), + "/kubelet": getTestContainerInfo(seedKubelet, "", "", ""), + "/system": getTestContainerInfo(seedMisc, "", "", ""), + sandbox0.PodSandboxStatus.Id: getTestContainerInfo(seedSandbox0, pName0, sandbox0.PodSandboxStatus.Metadata.Namespace, leaky.PodInfraContainerName), + container0.ContainerStatus.Id: getTestContainerInfo(seedContainer0, pName0, sandbox0.PodSandboxStatus.Metadata.Namespace, cName0), + container1.ContainerStatus.Id: getTestContainerInfo(seedContainer1, pName0, sandbox0.PodSandboxStatus.Metadata.Namespace, cName1), + sandbox1.PodSandboxStatus.Id: getTestContainerInfo(seedSandbox1, pName1, sandbox1.PodSandboxStatus.Metadata.Namespace, leaky.PodInfraContainerName), + container2.ContainerStatus.Id: getTestContainerInfo(seedContainer2, pName1, sandbox1.PodSandboxStatus.Metadata.Namespace, cName2), + sandbox2.PodSandboxStatus.Id: getTestContainerInfo(seedSandbox2, pName2, sandbox2.PodSandboxStatus.Metadata.Namespace, leaky.PodInfraContainerName), + container4.ContainerStatus.Id: getTestContainerInfo(seedContainer3, pName2, sandbox2.PodSandboxStatus.Metadata.Namespace, cName3), + } + + options := cadvisorapiv2.RequestOptions{ + IdType: cadvisorapiv2.TypeName, + Count: 2, + Recursive: true, + } + mockCadvisor. + On("ContainerInfoV2", "/", options).Return(infos, nil). On("RootFsInfo").Return(rootFsInfo, nil). On("GetFsInfoByFsUUID", imageFsStorageUUID).Return(imageFsInfo, nil). On("GetFsInfoByFsUUID", unknownStorageUUID).Return(cadvisorapiv2.FsInfo{}, cadvisorfs.ErrNoSuchDevice) @@ -116,16 +164,18 @@ func TestCRIListPodStats(t *testing.T) { for _, s := range p0.Containers { containerStatsMap[s.Name] = s } - c0 := containerStatsMap["container0-name"] + + c0 := containerStatsMap[cName0] assert.Equal(container0.CreatedAt, c0.StartTime.UnixNano()) - checkCRICPUAndMemoryStats(assert, c0, containerStats0) + checkCRICPUAndMemoryStats(assert, c0, infos[container0.ContainerStatus.Id].Stats[0]) checkCRIRootfsStats(assert, c0, containerStats0, &imageFsInfo) checkCRILogsStats(assert, c0, &rootFsInfo) - c1 := containerStatsMap["container1-name"] + c1 := containerStatsMap[cName1] assert.Equal(container1.CreatedAt, c1.StartTime.UnixNano()) - checkCRICPUAndMemoryStats(assert, c1, containerStats1) + checkCRICPUAndMemoryStats(assert, c1, infos[container1.ContainerStatus.Id].Stats[0]) checkCRIRootfsStats(assert, c1, containerStats1, nil) checkCRILogsStats(assert, c1, &rootFsInfo) + checkCRINetworkStats(assert, p0.Network, infos[sandbox0.PodSandboxStatus.Id].Stats[0].Network) p1 := podStatsMap[statsapi.PodReference{Name: "sandbox1-name", UID: "sandbox1-uid", Namespace: "sandbox1-ns"}] assert.Equal(sandbox1.CreatedAt, p1.StartTime.UnixNano()) @@ -133,11 +183,12 @@ func TestCRIListPodStats(t *testing.T) { checkEphemeralStorageStats(assert, p1, ephemeralVolumes, []*runtimeapi.ContainerStats{containerStats2}) c2 := p1.Containers[0] - assert.Equal("container2-name", c2.Name) + assert.Equal(cName2, c2.Name) assert.Equal(container2.CreatedAt, c2.StartTime.UnixNano()) - checkCRICPUAndMemoryStats(assert, c2, containerStats2) + checkCRICPUAndMemoryStats(assert, c2, infos[container2.ContainerStatus.Id].Stats[0]) checkCRIRootfsStats(assert, c2, containerStats2, &imageFsInfo) checkCRILogsStats(assert, c2, &rootFsInfo) + checkCRINetworkStats(assert, p1.Network, infos[sandbox1.PodSandboxStatus.Id].Stats[0].Network) p2 := podStatsMap[statsapi.PodReference{Name: "sandbox2-name", UID: "sandbox2-uid", Namespace: "sandbox2-ns"}] assert.Equal(sandbox2.CreatedAt, p2.StartTime.UnixNano()) @@ -146,12 +197,13 @@ func TestCRIListPodStats(t *testing.T) { checkEphemeralStorageStats(assert, p2, ephemeralVolumes, []*runtimeapi.ContainerStats{containerStats4}) c3 := p2.Containers[0] - assert.Equal("container3-name", c3.Name) + assert.Equal(cName3, c3.Name) assert.Equal(container4.CreatedAt, c3.StartTime.UnixNano()) - checkCRICPUAndMemoryStats(assert, c3, containerStats4) + checkCRICPUAndMemoryStats(assert, c3, infos[container4.ContainerStatus.Id].Stats[0]) checkCRIRootfsStats(assert, c3, containerStats4, &imageFsInfo) checkCRILogsStats(assert, c3, &rootFsInfo) + checkCRINetworkStats(assert, p2.Network, infos[sandbox2.PodSandboxStatus.Id].Stats[0].Network) mockCadvisor.AssertExpectations(t) } @@ -305,18 +357,16 @@ func makeFakeVolumeStats(volumeNames []string) []statsapi.VolumeStats { return volumes } -func checkCRICPUAndMemoryStats(assert *assert.Assertions, actual statsapi.ContainerStats, cs *runtimeapi.ContainerStats) { - assert.Equal(cs.Cpu.Timestamp, actual.CPU.Time.UnixNano()) - assert.Equal(cs.Cpu.UsageCoreNanoSeconds.Value, *actual.CPU.UsageCoreNanoSeconds) - assert.Zero(*actual.CPU.UsageNanoCores) +func checkCRICPUAndMemoryStats(assert *assert.Assertions, actual statsapi.ContainerStats, cs *cadvisorapiv2.ContainerStats) { + assert.Equal(cs.Timestamp.UnixNano(), actual.CPU.Time.UnixNano()) + assert.Equal(cs.Cpu.Usage.Total, *actual.CPU.UsageCoreNanoSeconds) + assert.Equal(cs.CpuInst.Usage.Total, *actual.CPU.UsageNanoCores) - assert.Equal(cs.Memory.Timestamp, actual.Memory.Time.UnixNano()) - assert.Nil(actual.Memory.AvailableBytes) - assert.Nil(actual.Memory.UsageBytes) - assert.Equal(cs.Memory.WorkingSetBytes.Value, *actual.Memory.WorkingSetBytes) - assert.Zero(*actual.Memory.RSSBytes) - assert.Nil(actual.Memory.PageFaults) - assert.Nil(actual.Memory.MajorPageFaults) + assert.Equal(cs.Memory.Usage, *actual.Memory.UsageBytes) + assert.Equal(cs.Memory.WorkingSet, *actual.Memory.WorkingSetBytes) + assert.Equal(cs.Memory.RSS, *actual.Memory.RSSBytes) + assert.Equal(cs.Memory.ContainerData.Pgfault, *actual.Memory.PageFaults) + assert.Equal(cs.Memory.ContainerData.Pgmajfault, *actual.Memory.MajorPageFaults) } func checkCRIRootfsStats(assert *assert.Assertions, actual statsapi.ContainerStats, cs *runtimeapi.ContainerStats, imageFsInfo *cadvisorapiv2.FsInfo) { @@ -360,3 +410,10 @@ func checkEphemeralStorageStats(assert *assert.Assertions, actual statsapi.PodSt assert.Equal(int(*actual.EphemeralStorage.UsedBytes), int(totalUsed)) assert.Equal(int(*actual.EphemeralStorage.InodesUsed), int(inodesUsed)) } + +func checkCRINetworkStats(assert *assert.Assertions, actual *statsapi.NetworkStats, expected *cadvisorapiv2.NetworkStats) { + assert.Equal(expected.Interfaces[0].RxBytes, *actual.RxBytes) + assert.Equal(expected.Interfaces[0].RxErrors, *actual.RxErrors) + assert.Equal(expected.Interfaces[0].TxBytes, *actual.TxBytes) + assert.Equal(expected.Interfaces[0].TxErrors, *actual.TxErrors) +}