pr review fixes and documentation

pull/17038/head
Nick Ethier 2023-05-22 17:02:56 -04:00
parent ac6071721c
commit 2a3dd5066a
7 changed files with 50 additions and 52 deletions

View File

@ -1,3 +1,3 @@
```release-note:improvement ```release-note:improvement
agent: add new metrics to track cpu disk and memory usage for server hosts agent: add new metrics to track cpu disk and memory usage for server hosts (defaults to: enabled)
``` ```

View File

@ -13,7 +13,6 @@ import (
"github.com/armon/go-metrics" "github.com/armon/go-metrics"
"github.com/armon/go-metrics/prometheus" "github.com/armon/go-metrics/prometheus"
"github.com/hashicorp/consul/lib/hoststats"
"github.com/hashicorp/go-hclog" "github.com/hashicorp/go-hclog"
wal "github.com/hashicorp/raft-wal" wal "github.com/hashicorp/raft-wal"
"github.com/hashicorp/raft-wal/verifier" "github.com/hashicorp/raft-wal/verifier"
@ -43,6 +42,7 @@ import (
"github.com/hashicorp/consul/agent/xds" "github.com/hashicorp/consul/agent/xds"
"github.com/hashicorp/consul/ipaddr" "github.com/hashicorp/consul/ipaddr"
"github.com/hashicorp/consul/lib" "github.com/hashicorp/consul/lib"
"github.com/hashicorp/consul/lib/hoststats"
"github.com/hashicorp/consul/logging" "github.com/hashicorp/consul/logging"
"github.com/hashicorp/consul/tlsutil" "github.com/hashicorp/consul/tlsutil"
) )
@ -61,7 +61,7 @@ type BaseDeps struct {
WatchedFiles []string WatchedFiles []string
deregisterBalancer, deregisterResolver func() deregisterBalancer, deregisterResolver func()
stopHostCollector func() stopHostCollector context.CancelFunc
} }
type ConfigLoader func(source config.Source) (config.LoadResult, error) type ConfigLoader func(source config.Source) (config.LoadResult, error)

View File

@ -59,7 +59,7 @@ func initCollector(logger hclog.Logger, dataDir string, opts ...CollectorOption)
return collector return collector
} }
func (h *Collector) loop(ctx context.Context) { func (c *Collector) loop(ctx context.Context) {
// Start collecting host stats right away and then keep collecting every // Start collecting host stats right away and then keep collecting every
// collection interval // collection interval
next := time.NewTimer(0) next := time.NewTimer(0)
@ -67,9 +67,9 @@ func (h *Collector) loop(ctx context.Context) {
for { for {
select { select {
case <-next.C: case <-next.C:
h.collect() c.collect()
next.Reset(hostStatsCollectionInterval) next.Reset(hostStatsCollectionInterval)
h.Stats().Emit(h.metrics, h.baseLabels) c.Stats().Emit(c.metrics, c.baseLabels)
case <-ctx.Done(): case <-ctx.Done():
return return
@ -78,55 +78,55 @@ func (h *Collector) loop(ctx context.Context) {
} }
// collect will collect stats related to resource usage of the host // collect will collect stats related to resource usage of the host
func (h *Collector) collect() { func (c *Collector) collect() {
h.hostStatsLock.Lock()
defer h.hostStatsLock.Unlock()
hs := &HostStats{Timestamp: time.Now().UTC().UnixNano()} hs := &HostStats{Timestamp: time.Now().UTC().UnixNano()}
// Determine up-time // Determine up-time
uptime, err := host.Uptime() uptime, err := host.Uptime()
if err != nil { if err != nil {
h.logger.Error("failed to collect uptime stats", "error", err) c.logger.Error("failed to collect uptime stats", "error", err)
uptime = 0 uptime = 0
} }
hs.Uptime = uptime hs.Uptime = uptime
// Collect memory stats // Collect memory stats
mstats, err := h.collectMemoryStats() mstats, err := c.collectMemoryStats()
if err != nil { if err != nil {
h.logger.Error("failed to collect memory stats", "error", err) c.logger.Error("failed to collect memory stats", "error", err)
mstats = &MemoryStats{} mstats = &MemoryStats{}
} }
hs.Memory = mstats hs.Memory = mstats
// Collect cpu stats // Collect cpu stats
cpus, err := h.collectCPUStats() cpus, err := c.collectCPUStats()
if err != nil { if err != nil {
h.logger.Error("failed to collect cpu stats", "error", err) c.logger.Error("failed to collect cpu stats", "error", err)
cpus = []*CPUStats{} cpus = []*CPUStats{}
} }
hs.CPU = cpus hs.CPU = cpus
// Collect disk stats // Collect disk stats
diskStats, err := h.collectDiskStats(h.dataDir) diskStats, err := c.collectDiskStats(c.dataDir)
if err != nil { if err != nil {
h.logger.Error("failed to collect dataDir disk stats", "error", err) c.logger.Error("failed to collect dataDir disk stats", "error", err)
} }
hs.DataDirStats = diskStats hs.DataDirStats = diskStats
// Update the collected status object. // Update the collected status object.
h.hostStats = hs c.hostStatsLock.Lock()
c.hostStats = hs
c.hostStatsLock.Unlock()
} }
func (h *Collector) collectDiskStats(dir string) (*DiskStats, error) { func (c *Collector) collectDiskStats(dir string) (*DiskStats, error) {
usage, err := disk.Usage(dir) usage, err := disk.Usage(dir)
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to collect disk usage stats: %w", err) return nil, fmt.Errorf("failed to collect disk usage stats: %w", err)
} }
return h.toDiskStats(usage), nil return c.toDiskStats(usage), nil
} }
func (h *Collector) collectMemoryStats() (*MemoryStats, error) { func (c *Collector) collectMemoryStats() (*MemoryStats, error) {
memStats, err := mem.VirtualMemory() memStats, err := mem.VirtualMemory()
if err != nil { if err != nil {
return nil, err return nil, err
@ -143,19 +143,19 @@ func (h *Collector) collectMemoryStats() (*MemoryStats, error) {
} }
// Stats returns the host stats that has been collected // Stats returns the host stats that has been collected
func (h *Collector) Stats() *HostStats { func (c *Collector) Stats() *HostStats {
h.hostStatsLock.RLock() c.hostStatsLock.RLock()
defer h.hostStatsLock.RUnlock() defer c.hostStatsLock.RUnlock()
if h.hostStats == nil { if c.hostStats == nil {
return &HostStats{} return &HostStats{}
} }
return h.hostStats.Clone() return c.hostStats.Clone()
} }
// toDiskStats merges UsageStat and PartitionStat to create a DiskStat // toDiskStats merges UsageStat and PartitionStat to create a DiskStat
func (h *Collector) toDiskStats(usage *disk.UsageStat) *DiskStats { func (c *Collector) toDiskStats(usage *disk.UsageStat) *DiskStats {
ds := DiskStats{ ds := DiskStats{
Size: usage.Total, Size: usage.Total,
Used: usage.Used, Used: usage.Used,

View File

@ -2,7 +2,6 @@ package hoststats
import ( import (
"math" "math"
"time"
"github.com/shirou/gopsutil/v3/cpu" "github.com/shirou/gopsutil/v3/cpu"
) )
@ -14,17 +13,28 @@ type cpuStatsCalculator struct {
prevTotal float64 prevTotal float64
} }
// calculate calculates the current cpu usage percentages // calculate the current cpu usage percentages.
// Since the cpu.TimesStat captures the total time a cpu spent in various states
// this function tracks the last seen stat and derives each cpu state's utilization
// as a percentage of the total change in cpu time between calls.
// The first time calculate is called, CPUStats will report 100% idle
// usage since there is no previous value to calculate against.
func (h *cpuStatsCalculator) calculate(times cpu.TimesStat) *CPUStats { func (h *cpuStatsCalculator) calculate(times cpu.TimesStat) *CPUStats {
// sum all non-idle counters to get the total busy cpu time
currentBusy := times.User + times.System + times.Nice + times.Iowait + times.Irq + currentBusy := times.User + times.System + times.Nice + times.Iowait + times.Irq +
times.Softirq + times.Steal + times.Guest + times.GuestNice times.Softirq + times.Steal + times.Guest + times.GuestNice
// sum of the total cpu time
currentTotal := currentBusy + times.Idle currentTotal := currentBusy + times.Idle
// calculate how much cpu time has passed since last calculation
deltaTotal := currentTotal - h.prevTotal deltaTotal := currentTotal - h.prevTotal
stats := &CPUStats{ stats := &CPUStats{
CPU: times.CPU, CPU: times.CPU,
// calculate each percentage as the ratio of the change
// in each state's time to the total change in cpu time
Idle: ((times.Idle - h.prev.Idle) / deltaTotal) * 100, Idle: ((times.Idle - h.prev.Idle) / deltaTotal) * 100,
User: ((times.User - h.prev.User) / deltaTotal) * 100, User: ((times.User - h.prev.User) / deltaTotal) * 100,
System: ((times.System - h.prev.System) / deltaTotal) * 100, System: ((times.System - h.prev.System) / deltaTotal) * 100,
@ -55,15 +65,7 @@ func (h *cpuStatsCalculator) calculate(times cpu.TimesStat) *CPUStats {
return stats return stats
} }
// cpuStats calculates cpu usage percentage func (c *Collector) collectCPUStats() (cpus []*CPUStats, err error) {
type cpuStats struct {
prevCpuTime float64
prevTime time.Time
totalCpus int
}
func (h *Collector) collectCPUStats() (cpus []*CPUStats, err error) {
cpuStats, err := cpu.Times(true) cpuStats, err := cpu.Times(true)
if err != nil { if err != nil {
@ -71,10 +73,10 @@ func (h *Collector) collectCPUStats() (cpus []*CPUStats, err error) {
} }
cs := make([]*CPUStats, len(cpuStats)) cs := make([]*CPUStats, len(cpuStats))
for idx, cpuStat := range cpuStats { for idx, cpuStat := range cpuStats {
percentCalculator, ok := h.cpuCalculator[cpuStat.CPU] percentCalculator, ok := c.cpuCalculator[cpuStat.CPU]
if !ok { if !ok {
percentCalculator = &cpuStatsCalculator{} percentCalculator = &cpuStatsCalculator{}
h.cpuCalculator[cpuStat.CPU] = percentCalculator c.cpuCalculator[cpuStat.CPU] = percentCalculator
} }
cs[idx] = percentCalculator.calculate(cpuStat) cs[idx] = percentCalculator.calculate(cpuStat)
} }

View File

@ -4,6 +4,7 @@ import (
"math" "math"
"os" "os"
"testing" "testing"
"time"
"github.com/hashicorp/consul/sdk/testutil" "github.com/hashicorp/consul/sdk/testutil"
"github.com/shirou/gopsutil/v3/cpu" "github.com/shirou/gopsutil/v3/cpu"
@ -20,10 +21,7 @@ func TestHostStats_CPU(t *testing.T) {
// Collect twice so we can calculate percents we need to generate some work // Collect twice so we can calculate percents we need to generate some work
// so that the cpu values change // so that the cpu values change
hs.collect() hs.collect()
total := 0 for begin := time.Now(); time.Now().Sub(begin) < 100*time.Millisecond; {
for i := 1; i < 1000000000; i++ {
total *= i
total = total % i
} }
hs.collect() hs.collect()
stats := hs.Stats() stats := hs.Stats()

View File

@ -18,14 +18,9 @@ type HostStats struct {
} }
func (hs *HostStats) Clone() *HostStats { func (hs *HostStats) Clone() *HostStats {
clone := *hs clone := &HostStats{}
*clone = *hs
clone.CPU = make([]*CPUStats, len(hs.CPU)) return clone
for i := range hs.CPU {
cpu := *hs.CPU[i]
clone.CPU[i] = &cpu
}
return &clone
} }
func (hs *HostStats) Emit(sink Metrics, baseLabels []metrics.Label) { func (hs *HostStats) Emit(sink Metrics, baseLabels []metrics.Label) {

View File

@ -757,7 +757,10 @@ Consul attaches the following labels to metric values.
## Server Host Metrics ## Server Host Metrics
Consul servers report the following metrics about the host's system resources Consul servers report the following metrics about the host's system resources.
Note that if the Consul server is operating inside a container, these metrics
still report host resource usage and do not report any resource limits placed
on the container.
**Requirements:** **Requirements:**
- Consul 1.15.3+ - Consul 1.15.3+