From 2a3dd5066a368b1538744ad8e20c4953f9f57f23 Mon Sep 17 00:00:00 2001 From: Nick Ethier Date: Mon, 22 May 2023 17:02:56 -0400 Subject: [PATCH] pr review fixes and documentation --- .changelog/17038.txt | 2 +- agent/setup.go | 4 +-- lib/hoststats/collector.go | 46 ++++++++++++------------ lib/hoststats/cpu.go | 28 ++++++++------- lib/hoststats/cpu_test.go | 6 ++-- lib/hoststats/host.go | 11 ++---- website/content/docs/agent/telemetry.mdx | 5 ++- 7 files changed, 50 insertions(+), 52 deletions(-) diff --git a/.changelog/17038.txt b/.changelog/17038.txt index 0e206836d9..b3a47f98a7 100644 --- a/.changelog/17038.txt +++ b/.changelog/17038.txt @@ -1,3 +1,3 @@ ```release-note:improvement -agent: add new metrics to track cpu disk and memory usage for server hosts +agent: add new metrics to track cpu, disk, and memory usage for server hosts (defaults to: enabled) ``` diff --git a/agent/setup.go b/agent/setup.go index 881d094da0..1e6dfc9f4a 100644 --- a/agent/setup.go +++ b/agent/setup.go @@ -13,7 +13,6 @@ import ( "github.com/armon/go-metrics" "github.com/armon/go-metrics/prometheus" - "github.com/hashicorp/consul/lib/hoststats" "github.com/hashicorp/go-hclog" wal "github.com/hashicorp/raft-wal" "github.com/hashicorp/raft-wal/verifier" @@ -43,6 +42,7 @@ import ( "github.com/hashicorp/consul/agent/xds" "github.com/hashicorp/consul/ipaddr" "github.com/hashicorp/consul/lib" + "github.com/hashicorp/consul/lib/hoststats" "github.com/hashicorp/consul/logging" "github.com/hashicorp/consul/tlsutil" ) @@ -61,7 +61,7 @@ type BaseDeps struct { WatchedFiles []string deregisterBalancer, deregisterResolver func() - stopHostCollector func() + stopHostCollector context.CancelFunc } type ConfigLoader func(source config.Source) (config.LoadResult, error) diff --git a/lib/hoststats/collector.go b/lib/hoststats/collector.go index a2c7bade67..c4c57b35c5 100644 --- a/lib/hoststats/collector.go +++ b/lib/hoststats/collector.go @@ -59,7 +59,7 @@ func initCollector(logger hclog.Logger, dataDir 
string, opts ...CollectorOption) return collector } -func (h *Collector) loop(ctx context.Context) { +func (c *Collector) loop(ctx context.Context) { // Start collecting host stats right away and then keep collecting every // collection interval next := time.NewTimer(0) @@ -67,9 +67,9 @@ func (h *Collector) loop(ctx context.Context) { for { select { case <-next.C: - h.collect() + c.collect() next.Reset(hostStatsCollectionInterval) - h.Stats().Emit(h.metrics, h.baseLabels) + c.Stats().Emit(c.metrics, c.baseLabels) case <-ctx.Done(): return @@ -78,55 +78,55 @@ func (h *Collector) loop(ctx context.Context) { } // collect will collect stats related to resource usage of the host -func (h *Collector) collect() { - h.hostStatsLock.Lock() - defer h.hostStatsLock.Unlock() +func (c *Collector) collect() { hs := &HostStats{Timestamp: time.Now().UTC().UnixNano()} // Determine up-time uptime, err := host.Uptime() if err != nil { - h.logger.Error("failed to collect uptime stats", "error", err) + c.logger.Error("failed to collect uptime stats", "error", err) uptime = 0 } hs.Uptime = uptime // Collect memory stats - mstats, err := h.collectMemoryStats() + mstats, err := c.collectMemoryStats() if err != nil { - h.logger.Error("failed to collect memory stats", "error", err) + c.logger.Error("failed to collect memory stats", "error", err) mstats = &MemoryStats{} } hs.Memory = mstats // Collect cpu stats - cpus, err := h.collectCPUStats() + cpus, err := c.collectCPUStats() if err != nil { - h.logger.Error("failed to collect cpu stats", "error", err) + c.logger.Error("failed to collect cpu stats", "error", err) cpus = []*CPUStats{} } hs.CPU = cpus // Collect disk stats - diskStats, err := h.collectDiskStats(h.dataDir) + diskStats, err := c.collectDiskStats(c.dataDir) if err != nil { - h.logger.Error("failed to collect dataDir disk stats", "error", err) + c.logger.Error("failed to collect dataDir disk stats", "error", err) } hs.DataDirStats = diskStats // Update the collected status 
object. - h.hostStats = hs + c.hostStatsLock.Lock() + c.hostStats = hs + c.hostStatsLock.Unlock() } -func (h *Collector) collectDiskStats(dir string) (*DiskStats, error) { +func (c *Collector) collectDiskStats(dir string) (*DiskStats, error) { usage, err := disk.Usage(dir) if err != nil { return nil, fmt.Errorf("failed to collect disk usage stats: %w", err) } - return h.toDiskStats(usage), nil + return c.toDiskStats(usage), nil } -func (h *Collector) collectMemoryStats() (*MemoryStats, error) { +func (c *Collector) collectMemoryStats() (*MemoryStats, error) { memStats, err := mem.VirtualMemory() if err != nil { return nil, err @@ -143,19 +143,19 @@ func (h *Collector) collectMemoryStats() (*MemoryStats, error) { } // Stats returns the host stats that has been collected -func (h *Collector) Stats() *HostStats { - h.hostStatsLock.RLock() - defer h.hostStatsLock.RUnlock() +func (c *Collector) Stats() *HostStats { + c.hostStatsLock.RLock() + defer c.hostStatsLock.RUnlock() - if h.hostStats == nil { + if c.hostStats == nil { return &HostStats{} } - return h.hostStats.Clone() + return c.hostStats.Clone() } // toDiskStats merges UsageStat and PartitionStat to create a DiskStat -func (h *Collector) toDiskStats(usage *disk.UsageStat) *DiskStats { +func (c *Collector) toDiskStats(usage *disk.UsageStat) *DiskStats { ds := DiskStats{ Size: usage.Total, Used: usage.Used, diff --git a/lib/hoststats/cpu.go b/lib/hoststats/cpu.go index 1ac0fc859d..45633b40df 100644 --- a/lib/hoststats/cpu.go +++ b/lib/hoststats/cpu.go @@ -2,7 +2,6 @@ package hoststats import ( "math" - "time" "github.com/shirou/gopsutil/v3/cpu" ) @@ -14,17 +13,28 @@ type cpuStatsCalculator struct { prevTotal float64 } -// calculate calculates the current cpu usage percentages +// calculate the current cpu usage percentages. 
+// Since the cpu.TimesStat captures the total time a cpu spent in various states +// this function tracks the last seen stat and derives each cpu state's utilization +// as a percentage of the total change in cpu time between calls. +// The first time calculate is called CPUStats will report 100% idle +// usage since there is not a previous value to calculate against func (h *cpuStatsCalculator) calculate(times cpu.TimesStat) *CPUStats { + // sum all non-idle counters to get the total busy cpu time currentBusy := times.User + times.System + times.Nice + times.Iowait + times.Irq + times.Softirq + times.Steal + times.Guest + times.GuestNice + // sum of the total cpu time currentTotal := currentBusy + times.Idle + // calculate how much cpu time has passed since last calculation deltaTotal := currentTotal - h.prevTotal + stats := &CPUStats{ CPU: times.CPU, + // calculate each percentage as the ratio of the change + // in each state's time to the total change in cpu time Idle: ((times.Idle - h.prev.Idle) / deltaTotal) * 100, User: ((times.User - h.prev.User) / deltaTotal) * 100, System: ((times.System - h.prev.System) / deltaTotal) * 100, @@ -55,15 +65,7 @@ func (h *cpuStatsCalculator) calculate(times cpu.TimesStat) *CPUStats { return stats } -// cpuStats calculates cpu usage percentage -type cpuStats struct { - prevCpuTime float64 - prevTime time.Time - - totalCpus int -} - -func (h *Collector) collectCPUStats() (cpus []*CPUStats, err error) { +func (c *Collector) collectCPUStats() (cpus []*CPUStats, err error) { cpuStats, err := cpu.Times(true) if err != nil { @@ -71,10 +73,10 @@ func (h *Collector) collectCPUStats() (cpus []*CPUStats, err error) { } cs := make([]*CPUStats, len(cpuStats)) for idx, cpuStat := range cpuStats { - percentCalculator, ok := h.cpuCalculator[cpuStat.CPU] + percentCalculator, ok := c.cpuCalculator[cpuStat.CPU] if !ok { percentCalculator = &cpuStatsCalculator{} - h.cpuCalculator[cpuStat.CPU] = percentCalculator + c.cpuCalculator[cpuStat.CPU] 
= percentCalculator } cs[idx] = percentCalculator.calculate(cpuStat) } diff --git a/lib/hoststats/cpu_test.go b/lib/hoststats/cpu_test.go index 3e5d6e81ed..5d5efbe976 100644 --- a/lib/hoststats/cpu_test.go +++ b/lib/hoststats/cpu_test.go @@ -4,6 +4,7 @@ import ( "math" "os" "testing" + "time" "github.com/hashicorp/consul/sdk/testutil" "github.com/shirou/gopsutil/v3/cpu" @@ -20,10 +21,7 @@ func TestHostStats_CPU(t *testing.T) { // Collect twice so we can calculate percents we need to generate some work // so that the cpu values change hs.collect() - total := 0 - for i := 1; i < 1000000000; i++ { - total *= i - total = total % i + for begin := time.Now(); time.Now().Sub(begin) < 100*time.Millisecond; { } hs.collect() stats := hs.Stats() diff --git a/lib/hoststats/host.go b/lib/hoststats/host.go index aa90b6373e..426cf43ea2 100644 --- a/lib/hoststats/host.go +++ b/lib/hoststats/host.go @@ -18,14 +18,9 @@ type HostStats struct { } func (hs *HostStats) Clone() *HostStats { - clone := *hs - - clone.CPU = make([]*CPUStats, len(hs.CPU)) - for i := range hs.CPU { - cpu := *hs.CPU[i] - clone.CPU[i] = &cpu - } - return &clone + clone := &HostStats{} + *clone = *hs + return clone } func (hs *HostStats) Emit(sink Metrics, baseLabels []metrics.Label) { diff --git a/website/content/docs/agent/telemetry.mdx b/website/content/docs/agent/telemetry.mdx index b0fa386692..8c62507e9d 100644 --- a/website/content/docs/agent/telemetry.mdx +++ b/website/content/docs/agent/telemetry.mdx @@ -757,7 +757,10 @@ Consul attaches the following labels to metric values. ## Server Host Metrics -Consul servers report the following metrics about the host's system resources +Consul servers report the following metrics about the host's system resources. +Note that if the Consul server is operating inside a container these metrics +still report host resource usage and do not report any resource limits placed +on the container. **Requirements:** - Consul 1.15.3+