2019-01-12 04:58:27 +00:00
|
|
|
// Copyright 2014 Google Inc. All Rights Reserved.
|
|
|
|
//
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
// you may not use this file except in compliance with the License.
|
|
|
|
// You may obtain a copy of the License at
|
|
|
|
//
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
//
|
|
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
// See the License for the specific language governing permissions and
|
|
|
|
// limitations under the License.
|
|
|
|
|
|
|
|
package manager
|
|
|
|
|
|
|
|
import (
|
|
|
|
"flag"
|
|
|
|
"fmt"
|
|
|
|
"io/ioutil"
|
|
|
|
"math"
|
|
|
|
"math/rand"
|
|
|
|
"os/exec"
|
|
|
|
"path"
|
|
|
|
"regexp"
|
|
|
|
"sort"
|
|
|
|
"strconv"
|
|
|
|
"strings"
|
|
|
|
"sync"
|
|
|
|
"time"
|
|
|
|
|
|
|
|
"github.com/google/cadvisor/accelerators"
|
|
|
|
"github.com/google/cadvisor/cache/memory"
|
|
|
|
"github.com/google/cadvisor/collector"
|
|
|
|
"github.com/google/cadvisor/container"
|
|
|
|
info "github.com/google/cadvisor/info/v1"
|
|
|
|
"github.com/google/cadvisor/info/v2"
|
|
|
|
"github.com/google/cadvisor/summary"
|
|
|
|
"github.com/google/cadvisor/utils/cpuload"
|
|
|
|
|
|
|
|
units "github.com/docker/go-units"
|
|
|
|
"k8s.io/klog"
|
|
|
|
"k8s.io/utils/clock"
|
|
|
|
)
|
|
|
|
|
|
|
|
// Housekeeping interval.
|
|
|
|
var enableLoadReader = flag.Bool("enable_load_reader", false, "Whether to enable cpu load reader")
|
|
|
|
var HousekeepingInterval = flag.Duration("housekeeping_interval", 1*time.Second, "Interval between container housekeepings")
|
|
|
|
|
|
|
|
// cgroup type chosen to fetch the cgroup path of a process.
|
|
|
|
// Memory has been chosen, as it is one of the default cgroups that is enabled for most containers.
|
|
|
|
var cgroupPathRegExp = regexp.MustCompile(`memory[^:]*:(.*?)[,;$]`)
|
|
|
|
|
|
|
|
type containerInfo struct {
|
|
|
|
info.ContainerReference
|
|
|
|
Subcontainers []info.ContainerReference
|
|
|
|
Spec info.ContainerSpec
|
|
|
|
}
|
|
|
|
|
|
|
|
type containerData struct {
|
|
|
|
handler container.ContainerHandler
|
|
|
|
info containerInfo
|
|
|
|
memoryCache *memory.InMemoryCache
|
|
|
|
lock sync.Mutex
|
|
|
|
loadReader cpuload.CpuLoadReader
|
|
|
|
summaryReader *summary.StatsSummary
|
|
|
|
loadAvg float64 // smoothed load average seen so far.
|
|
|
|
housekeepingInterval time.Duration
|
|
|
|
maxHousekeepingInterval time.Duration
|
|
|
|
allowDynamicHousekeeping bool
|
|
|
|
infoLastUpdatedTime time.Time
|
|
|
|
statsLastUpdatedTime time.Time
|
|
|
|
lastErrorTime time.Time
|
|
|
|
// used to track time
|
|
|
|
clock clock.Clock
|
|
|
|
|
|
|
|
// Decay value used for load average smoothing. Interval length of 10 seconds is used.
|
|
|
|
loadDecay float64
|
|
|
|
|
|
|
|
// Whether to log the usage of this container when it is updated.
|
|
|
|
logUsage bool
|
|
|
|
|
|
|
|
// Tells the container to stop.
|
|
|
|
stop chan bool
|
|
|
|
|
|
|
|
// Tells the container to immediately collect stats
|
|
|
|
onDemandChan chan chan struct{}
|
|
|
|
|
|
|
|
// Runs custom metric collectors.
|
|
|
|
collectorManager collector.CollectorManager
|
|
|
|
|
|
|
|
// nvidiaCollector updates stats for Nvidia GPUs attached to the container.
|
|
|
|
nvidiaCollector accelerators.AcceleratorCollector
|
|
|
|
}
|
|
|
|
|
|
|
|
// jitter returns a time.Duration between duration and duration + maxFactor * duration,
|
|
|
|
// to allow clients to avoid converging on periodic behavior. If maxFactor is 0.0, a
|
|
|
|
// suggested default value will be chosen.
|
|
|
|
func jitter(duration time.Duration, maxFactor float64) time.Duration {
|
|
|
|
if maxFactor <= 0.0 {
|
|
|
|
maxFactor = 1.0
|
|
|
|
}
|
|
|
|
wait := duration + time.Duration(rand.Float64()*maxFactor*float64(duration))
|
|
|
|
return wait
|
|
|
|
}
|
|
|
|
|
|
|
|
func (c *containerData) Start() error {
|
|
|
|
go c.housekeeping()
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (c *containerData) Stop() error {
|
|
|
|
err := c.memoryCache.RemoveContainer(c.info.Name)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
c.stop <- true
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (c *containerData) allowErrorLogging() bool {
|
|
|
|
if c.clock.Since(c.lastErrorTime) > time.Minute {
|
|
|
|
c.lastErrorTime = c.clock.Now()
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
|
|
|
// OnDemandHousekeeping performs housekeeping on the container and blocks until it has completed.
|
|
|
|
// It is designed to be used in conjunction with periodic housekeeping, and will cause the timer for
|
|
|
|
// periodic housekeeping to reset. This should be used sparingly, as calling OnDemandHousekeeping frequently
|
|
|
|
// can have serious performance costs.
|
|
|
|
func (c *containerData) OnDemandHousekeeping(maxAge time.Duration) {
|
|
|
|
if c.clock.Since(c.statsLastUpdatedTime) > maxAge {
|
|
|
|
housekeepingFinishedChan := make(chan struct{})
|
|
|
|
c.onDemandChan <- housekeepingFinishedChan
|
|
|
|
select {
|
|
|
|
case <-c.stop:
|
|
|
|
case <-housekeepingFinishedChan:
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// notifyOnDemand notifies all calls to OnDemandHousekeeping that housekeeping is finished
|
|
|
|
func (c *containerData) notifyOnDemand() {
|
|
|
|
for {
|
|
|
|
select {
|
|
|
|
case finishedChan := <-c.onDemandChan:
|
|
|
|
close(finishedChan)
|
|
|
|
default:
|
|
|
|
return
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (c *containerData) GetInfo(shouldUpdateSubcontainers bool) (*containerInfo, error) {
|
|
|
|
// Get spec and subcontainers.
|
|
|
|
if c.clock.Since(c.infoLastUpdatedTime) > 5*time.Second {
|
|
|
|
err := c.updateSpec()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
if shouldUpdateSubcontainers {
|
|
|
|
err = c.updateSubcontainers()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
c.infoLastUpdatedTime = c.clock.Now()
|
|
|
|
}
|
|
|
|
// Make a copy of the info for the user.
|
|
|
|
c.lock.Lock()
|
|
|
|
defer c.lock.Unlock()
|
|
|
|
return &c.info, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (c *containerData) DerivedStats() (v2.DerivedStats, error) {
|
|
|
|
if c.summaryReader == nil {
|
|
|
|
return v2.DerivedStats{}, fmt.Errorf("derived stats not enabled for container %q", c.info.Name)
|
|
|
|
}
|
|
|
|
return c.summaryReader.DerivedStats()
|
|
|
|
}
|
|
|
|
|
|
|
|
func (c *containerData) getCgroupPath(cgroups string) (string, error) {
|
|
|
|
if cgroups == "-" {
|
|
|
|
return "/", nil
|
|
|
|
}
|
2019-12-12 01:27:03 +00:00
|
|
|
if strings.HasPrefix(cgroups, "0::") {
|
|
|
|
return cgroups[3:], nil
|
|
|
|
}
|
2019-01-12 04:58:27 +00:00
|
|
|
matches := cgroupPathRegExp.FindSubmatch([]byte(cgroups))
|
|
|
|
if len(matches) != 2 {
|
|
|
|
klog.V(3).Infof("failed to get memory cgroup path from %q", cgroups)
|
|
|
|
// return root in case of failures - memory hierarchy might not be enabled.
|
|
|
|
return "/", nil
|
|
|
|
}
|
|
|
|
return string(matches[1]), nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// Returns contents of a file inside the container root.
|
|
|
|
// Takes in a path relative to container root.
|
|
|
|
func (c *containerData) ReadFile(filepath string, inHostNamespace bool) ([]byte, error) {
|
|
|
|
pids, err := c.getContainerPids(inHostNamespace)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
// TODO(rjnagal): Optimize by just reading container's cgroup.proc file when in host namespace.
|
|
|
|
rootfs := "/"
|
|
|
|
if !inHostNamespace {
|
|
|
|
rootfs = "/rootfs"
|
|
|
|
}
|
|
|
|
for _, pid := range pids {
|
|
|
|
filePath := path.Join(rootfs, "/proc", pid, "/root", filepath)
|
|
|
|
klog.V(3).Infof("Trying path %q", filePath)
|
|
|
|
data, err := ioutil.ReadFile(filePath)
|
|
|
|
if err == nil {
|
|
|
|
return data, err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// No process paths could be found. Declare config non-existent.
|
|
|
|
return nil, fmt.Errorf("file %q does not exist.", filepath)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Return output for ps command in host /proc with specified format
|
|
|
|
func (c *containerData) getPsOutput(inHostNamespace bool, format string) ([]byte, error) {
|
|
|
|
args := []string{}
|
|
|
|
command := "ps"
|
|
|
|
if !inHostNamespace {
|
|
|
|
command = "/usr/sbin/chroot"
|
|
|
|
args = append(args, "/rootfs", "ps")
|
|
|
|
}
|
|
|
|
args = append(args, "-e", "-o", format)
|
|
|
|
out, err := exec.Command(command, args...).Output()
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("failed to execute %q command: %v", command, err)
|
|
|
|
}
|
|
|
|
return out, err
|
|
|
|
}
|
|
|
|
|
|
|
|
// Get pids of processes in this container.
|
|
|
|
// A slightly lighterweight call than GetProcessList if other details are not required.
|
|
|
|
func (c *containerData) getContainerPids(inHostNamespace bool) ([]string, error) {
|
|
|
|
format := "pid,cgroup"
|
|
|
|
out, err := c.getPsOutput(inHostNamespace, format)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
expectedFields := 2
|
|
|
|
lines := strings.Split(string(out), "\n")
|
|
|
|
pids := []string{}
|
|
|
|
for _, line := range lines[1:] {
|
|
|
|
if len(line) == 0 {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
fields := strings.Fields(line)
|
|
|
|
if len(fields) < expectedFields {
|
|
|
|
return nil, fmt.Errorf("expected at least %d fields, found %d: output: %q", expectedFields, len(fields), line)
|
|
|
|
}
|
|
|
|
pid := fields[0]
|
|
|
|
cgroup, err := c.getCgroupPath(fields[1])
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("could not parse cgroup path from %q: %v", fields[1], err)
|
|
|
|
}
|
|
|
|
if c.info.Name == cgroup {
|
|
|
|
pids = append(pids, pid)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return pids, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (c *containerData) GetProcessList(cadvisorContainer string, inHostNamespace bool) ([]v2.ProcessInfo, error) {
|
|
|
|
// report all processes for root.
|
|
|
|
isRoot := c.info.Name == "/"
|
|
|
|
rootfs := "/"
|
|
|
|
if !inHostNamespace {
|
|
|
|
rootfs = "/rootfs"
|
|
|
|
}
|
|
|
|
format := "user,pid,ppid,stime,pcpu,pmem,rss,vsz,stat,time,comm,cgroup"
|
|
|
|
out, err := c.getPsOutput(inHostNamespace, format)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
expectedFields := 12
|
|
|
|
processes := []v2.ProcessInfo{}
|
|
|
|
lines := strings.Split(string(out), "\n")
|
|
|
|
for _, line := range lines[1:] {
|
|
|
|
if len(line) == 0 {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
fields := strings.Fields(line)
|
|
|
|
if len(fields) < expectedFields {
|
|
|
|
return nil, fmt.Errorf("expected at least %d fields, found %d: output: %q", expectedFields, len(fields), line)
|
|
|
|
}
|
|
|
|
pid, err := strconv.Atoi(fields[1])
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("invalid pid %q: %v", fields[1], err)
|
|
|
|
}
|
|
|
|
ppid, err := strconv.Atoi(fields[2])
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("invalid ppid %q: %v", fields[2], err)
|
|
|
|
}
|
|
|
|
percentCpu, err := strconv.ParseFloat(fields[4], 32)
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("invalid cpu percent %q: %v", fields[4], err)
|
|
|
|
}
|
|
|
|
percentMem, err := strconv.ParseFloat(fields[5], 32)
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("invalid memory percent %q: %v", fields[5], err)
|
|
|
|
}
|
|
|
|
rss, err := strconv.ParseUint(fields[6], 0, 64)
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("invalid rss %q: %v", fields[6], err)
|
|
|
|
}
|
|
|
|
// convert to bytes
|
|
|
|
rss *= 1024
|
|
|
|
vs, err := strconv.ParseUint(fields[7], 0, 64)
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("invalid virtual size %q: %v", fields[7], err)
|
|
|
|
}
|
|
|
|
// convert to bytes
|
|
|
|
vs *= 1024
|
|
|
|
cgroup, err := c.getCgroupPath(fields[11])
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("could not parse cgroup path from %q: %v", fields[11], err)
|
|
|
|
}
|
|
|
|
// Remove the ps command we just ran from cadvisor container.
|
|
|
|
// Not necessary, but makes the cadvisor page look cleaner.
|
|
|
|
if !inHostNamespace && cadvisorContainer == cgroup && fields[10] == "ps" {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
var cgroupPath string
|
|
|
|
if isRoot {
|
|
|
|
cgroupPath = cgroup
|
|
|
|
}
|
|
|
|
|
|
|
|
var fdCount int
|
|
|
|
dirPath := path.Join(rootfs, "/proc", strconv.Itoa(pid), "fd")
|
|
|
|
fds, err := ioutil.ReadDir(dirPath)
|
|
|
|
if err != nil {
|
|
|
|
klog.V(4).Infof("error while listing directory %q to measure fd count: %v", dirPath, err)
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
fdCount = len(fds)
|
|
|
|
|
|
|
|
if isRoot || c.info.Name == cgroup {
|
|
|
|
processes = append(processes, v2.ProcessInfo{
|
|
|
|
User: fields[0],
|
|
|
|
Pid: pid,
|
|
|
|
Ppid: ppid,
|
|
|
|
StartTime: fields[3],
|
|
|
|
PercentCpu: float32(percentCpu),
|
|
|
|
PercentMemory: float32(percentMem),
|
|
|
|
RSS: rss,
|
|
|
|
VirtualSize: vs,
|
|
|
|
Status: fields[8],
|
|
|
|
RunningTime: fields[9],
|
|
|
|
Cmd: fields[10],
|
|
|
|
CgroupPath: cgroupPath,
|
|
|
|
FdCount: fdCount,
|
|
|
|
})
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return processes, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func newContainerData(containerName string, memoryCache *memory.InMemoryCache, handler container.ContainerHandler, logUsage bool, collectorManager collector.CollectorManager, maxHousekeepingInterval time.Duration, allowDynamicHousekeeping bool, clock clock.Clock) (*containerData, error) {
|
|
|
|
if memoryCache == nil {
|
|
|
|
return nil, fmt.Errorf("nil memory storage")
|
|
|
|
}
|
|
|
|
if handler == nil {
|
|
|
|
return nil, fmt.Errorf("nil container handler")
|
|
|
|
}
|
|
|
|
ref, err := handler.ContainerReference()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
cont := &containerData{
|
|
|
|
handler: handler,
|
|
|
|
memoryCache: memoryCache,
|
|
|
|
housekeepingInterval: *HousekeepingInterval,
|
|
|
|
maxHousekeepingInterval: maxHousekeepingInterval,
|
|
|
|
allowDynamicHousekeeping: allowDynamicHousekeeping,
|
|
|
|
logUsage: logUsage,
|
|
|
|
loadAvg: -1.0, // negative value indicates uninitialized.
|
|
|
|
stop: make(chan bool, 1),
|
|
|
|
collectorManager: collectorManager,
|
|
|
|
onDemandChan: make(chan chan struct{}, 100),
|
|
|
|
clock: clock,
|
|
|
|
}
|
|
|
|
cont.info.ContainerReference = ref
|
|
|
|
|
|
|
|
cont.loadDecay = math.Exp(float64(-cont.housekeepingInterval.Seconds() / 10))
|
|
|
|
|
|
|
|
if *enableLoadReader {
|
|
|
|
// Create cpu load reader.
|
|
|
|
loadReader, err := cpuload.New()
|
|
|
|
if err != nil {
|
|
|
|
klog.Warningf("Could not initialize cpu load reader for %q: %s", ref.Name, err)
|
|
|
|
} else {
|
|
|
|
cont.loadReader = loadReader
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
err = cont.updateSpec()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
cont.summaryReader, err = summary.New(cont.info.Spec)
|
|
|
|
if err != nil {
|
|
|
|
cont.summaryReader = nil
|
|
|
|
klog.Warningf("Failed to create summary reader for %q: %v", ref.Name, err)
|
|
|
|
}
|
|
|
|
|
|
|
|
return cont, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// Determine when the next housekeeping should occur.
|
|
|
|
func (self *containerData) nextHousekeepingInterval() time.Duration {
|
|
|
|
if self.allowDynamicHousekeeping {
|
|
|
|
var empty time.Time
|
|
|
|
stats, err := self.memoryCache.RecentStats(self.info.Name, empty, empty, 2)
|
|
|
|
if err != nil {
|
|
|
|
if self.allowErrorLogging() {
|
|
|
|
klog.Warningf("Failed to get RecentStats(%q) while determining the next housekeeping: %v", self.info.Name, err)
|
|
|
|
}
|
|
|
|
} else if len(stats) == 2 {
|
|
|
|
// TODO(vishnuk): Use no processes as a signal.
|
|
|
|
// Raise the interval if usage hasn't changed in the last housekeeping.
|
|
|
|
if stats[0].StatsEq(stats[1]) && (self.housekeepingInterval < self.maxHousekeepingInterval) {
|
|
|
|
self.housekeepingInterval *= 2
|
|
|
|
if self.housekeepingInterval > self.maxHousekeepingInterval {
|
|
|
|
self.housekeepingInterval = self.maxHousekeepingInterval
|
|
|
|
}
|
|
|
|
} else if self.housekeepingInterval != *HousekeepingInterval {
|
|
|
|
// Lower interval back to the baseline.
|
|
|
|
self.housekeepingInterval = *HousekeepingInterval
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return jitter(self.housekeepingInterval, 1.0)
|
|
|
|
}
|
|
|
|
|
|
|
|
// TODO(vmarmol): Implement stats collecting as a custom collector.
|
|
|
|
func (c *containerData) housekeeping() {
|
|
|
|
// Start any background goroutines - must be cleaned up in c.handler.Cleanup().
|
|
|
|
c.handler.Start()
|
|
|
|
defer c.handler.Cleanup()
|
|
|
|
|
|
|
|
// Initialize cpuload reader - must be cleaned up in c.loadReader.Stop()
|
|
|
|
if c.loadReader != nil {
|
|
|
|
err := c.loadReader.Start()
|
|
|
|
if err != nil {
|
|
|
|
klog.Warningf("Could not start cpu load stat collector for %q: %s", c.info.Name, err)
|
|
|
|
}
|
|
|
|
defer c.loadReader.Stop()
|
|
|
|
}
|
|
|
|
|
|
|
|
// Long housekeeping is either 100ms or half of the housekeeping interval.
|
|
|
|
longHousekeeping := 100 * time.Millisecond
|
|
|
|
if *HousekeepingInterval/2 < longHousekeeping {
|
|
|
|
longHousekeeping = *HousekeepingInterval / 2
|
|
|
|
}
|
|
|
|
|
|
|
|
// Housekeep every second.
|
|
|
|
klog.V(3).Infof("Start housekeeping for container %q\n", c.info.Name)
|
|
|
|
houseKeepingTimer := c.clock.NewTimer(0 * time.Second)
|
|
|
|
defer houseKeepingTimer.Stop()
|
|
|
|
for {
|
|
|
|
if !c.housekeepingTick(houseKeepingTimer.C(), longHousekeeping) {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
// Stop and drain the timer so that it is safe to reset it
|
|
|
|
if !houseKeepingTimer.Stop() {
|
|
|
|
select {
|
|
|
|
case <-houseKeepingTimer.C():
|
|
|
|
default:
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// Log usage if asked to do so.
|
|
|
|
if c.logUsage {
|
|
|
|
const numSamples = 60
|
|
|
|
var empty time.Time
|
|
|
|
stats, err := c.memoryCache.RecentStats(c.info.Name, empty, empty, numSamples)
|
|
|
|
if err != nil {
|
|
|
|
if c.allowErrorLogging() {
|
|
|
|
klog.Warningf("[%s] Failed to get recent stats for logging usage: %v", c.info.Name, err)
|
|
|
|
}
|
|
|
|
} else if len(stats) < numSamples {
|
|
|
|
// Ignore, not enough stats yet.
|
|
|
|
} else {
|
|
|
|
usageCpuNs := uint64(0)
|
|
|
|
for i := range stats {
|
|
|
|
if i > 0 {
|
|
|
|
usageCpuNs += (stats[i].Cpu.Usage.Total - stats[i-1].Cpu.Usage.Total)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
usageMemory := stats[numSamples-1].Memory.Usage
|
|
|
|
|
|
|
|
instantUsageInCores := float64(stats[numSamples-1].Cpu.Usage.Total-stats[numSamples-2].Cpu.Usage.Total) / float64(stats[numSamples-1].Timestamp.Sub(stats[numSamples-2].Timestamp).Nanoseconds())
|
|
|
|
usageInCores := float64(usageCpuNs) / float64(stats[numSamples-1].Timestamp.Sub(stats[0].Timestamp).Nanoseconds())
|
|
|
|
usageInHuman := units.HumanSize(float64(usageMemory))
|
|
|
|
// Don't set verbosity since this is already protected by the logUsage flag.
|
|
|
|
klog.Infof("[%s] %.3f cores (average: %.3f cores), %s of memory", c.info.Name, instantUsageInCores, usageInCores, usageInHuman)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
houseKeepingTimer.Reset(c.nextHousekeepingInterval())
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (c *containerData) housekeepingTick(timer <-chan time.Time, longHousekeeping time.Duration) bool {
|
|
|
|
select {
|
|
|
|
case <-c.stop:
|
|
|
|
// Stop housekeeping when signaled.
|
|
|
|
return false
|
|
|
|
case finishedChan := <-c.onDemandChan:
|
|
|
|
// notify the calling function once housekeeping has completed
|
|
|
|
defer close(finishedChan)
|
|
|
|
case <-timer:
|
|
|
|
}
|
|
|
|
start := c.clock.Now()
|
|
|
|
err := c.updateStats()
|
|
|
|
if err != nil {
|
|
|
|
if c.allowErrorLogging() {
|
|
|
|
klog.Warningf("Failed to update stats for container \"%s\": %s", c.info.Name, err)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// Log if housekeeping took too long.
|
|
|
|
duration := c.clock.Since(start)
|
|
|
|
if duration >= longHousekeeping {
|
|
|
|
klog.V(3).Infof("[%s] Housekeeping took %s", c.info.Name, duration)
|
|
|
|
}
|
|
|
|
c.notifyOnDemand()
|
|
|
|
c.statsLastUpdatedTime = c.clock.Now()
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
|
|
|
func (c *containerData) updateSpec() error {
|
|
|
|
spec, err := c.handler.GetSpec()
|
|
|
|
if err != nil {
|
|
|
|
// Ignore errors if the container is dead.
|
|
|
|
if !c.handler.Exists() {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
customMetrics, err := c.collectorManager.GetSpec()
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
if len(customMetrics) > 0 {
|
|
|
|
spec.HasCustomMetrics = true
|
|
|
|
spec.CustomMetrics = customMetrics
|
|
|
|
}
|
|
|
|
c.lock.Lock()
|
|
|
|
defer c.lock.Unlock()
|
|
|
|
c.info.Spec = spec
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// Calculate new smoothed load average using the new sample of runnable threads.
|
|
|
|
// The decay used ensures that the load will stabilize on a new constant value within
|
|
|
|
// 10 seconds.
|
|
|
|
func (c *containerData) updateLoad(newLoad uint64) {
|
|
|
|
if c.loadAvg < 0 {
|
|
|
|
c.loadAvg = float64(newLoad) // initialize to the first seen sample for faster stabilization.
|
|
|
|
} else {
|
|
|
|
c.loadAvg = c.loadAvg*c.loadDecay + float64(newLoad)*(1.0-c.loadDecay)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (c *containerData) updateStats() error {
|
|
|
|
stats, statsErr := c.handler.GetStats()
|
|
|
|
if statsErr != nil {
|
|
|
|
// Ignore errors if the container is dead.
|
|
|
|
if !c.handler.Exists() {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// Stats may be partially populated, push those before we return an error.
|
|
|
|
statsErr = fmt.Errorf("%v, continuing to push stats", statsErr)
|
|
|
|
}
|
|
|
|
if stats == nil {
|
|
|
|
return statsErr
|
|
|
|
}
|
|
|
|
if c.loadReader != nil {
|
|
|
|
// TODO(vmarmol): Cache this path.
|
|
|
|
path, err := c.handler.GetCgroupPath("cpu")
|
|
|
|
if err == nil {
|
|
|
|
loadStats, err := c.loadReader.GetCpuLoad(c.info.Name, path)
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("failed to get load stat for %q - path %q, error %s", c.info.Name, path, err)
|
|
|
|
}
|
|
|
|
stats.TaskStats = loadStats
|
|
|
|
c.updateLoad(loadStats.NrRunning)
|
|
|
|
// convert to 'milliLoad' to avoid floats and preserve precision.
|
|
|
|
stats.Cpu.LoadAverage = int32(c.loadAvg * 1000)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if c.summaryReader != nil {
|
|
|
|
err := c.summaryReader.AddSample(*stats)
|
|
|
|
if err != nil {
|
|
|
|
// Ignore summary errors for now.
|
|
|
|
klog.V(2).Infof("Failed to add summary stats for %q: %v", c.info.Name, err)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
var customStatsErr error
|
|
|
|
cm := c.collectorManager.(*collector.GenericCollectorManager)
|
|
|
|
if len(cm.Collectors) > 0 {
|
|
|
|
if cm.NextCollectionTime.Before(c.clock.Now()) {
|
|
|
|
customStats, err := c.updateCustomStats()
|
|
|
|
if customStats != nil {
|
|
|
|
stats.CustomMetrics = customStats
|
|
|
|
}
|
|
|
|
if err != nil {
|
|
|
|
customStatsErr = err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
var nvidiaStatsErr error
|
|
|
|
if c.nvidiaCollector != nil {
|
|
|
|
// This updates the Accelerators field of the stats struct
|
|
|
|
nvidiaStatsErr = c.nvidiaCollector.UpdateStats(stats)
|
|
|
|
}
|
|
|
|
|
|
|
|
ref, err := c.handler.ContainerReference()
|
|
|
|
if err != nil {
|
|
|
|
// Ignore errors if the container is dead.
|
|
|
|
if !c.handler.Exists() {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
cInfo := info.ContainerInfo{
|
|
|
|
ContainerReference: ref,
|
|
|
|
}
|
|
|
|
|
|
|
|
err = c.memoryCache.AddStats(&cInfo, stats)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
if statsErr != nil {
|
|
|
|
return statsErr
|
|
|
|
}
|
|
|
|
if nvidiaStatsErr != nil {
|
|
|
|
return nvidiaStatsErr
|
|
|
|
}
|
|
|
|
return customStatsErr
|
|
|
|
}
|
|
|
|
|
|
|
|
func (c *containerData) updateCustomStats() (map[string][]info.MetricVal, error) {
|
|
|
|
_, customStats, customStatsErr := c.collectorManager.Collect()
|
|
|
|
if customStatsErr != nil {
|
|
|
|
if !c.handler.Exists() {
|
|
|
|
return customStats, nil
|
|
|
|
}
|
|
|
|
customStatsErr = fmt.Errorf("%v, continuing to push custom stats", customStatsErr)
|
|
|
|
}
|
|
|
|
return customStats, customStatsErr
|
|
|
|
}
|
|
|
|
|
|
|
|
func (c *containerData) updateSubcontainers() error {
|
|
|
|
var subcontainers info.ContainerReferenceSlice
|
|
|
|
subcontainers, err := c.handler.ListContainers(container.ListSelf)
|
|
|
|
if err != nil {
|
|
|
|
// Ignore errors if the container is dead.
|
|
|
|
if !c.handler.Exists() {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
sort.Sort(subcontainers)
|
|
|
|
c.lock.Lock()
|
|
|
|
defer c.lock.Unlock()
|
|
|
|
c.info.Subcontainers = subcontainers
|
|
|
|
return nil
|
|
|
|
}
|