From fafb68b28d839892206e5e688055204c2cf54023 Mon Sep 17 00:00:00 2001 From: Nick Ethier Date: Tue, 18 Apr 2023 22:32:31 -0400 Subject: [PATCH] hoststats: add package for collecting host statistics including cpu, memory and disk usage --- agent/setup.go | 4 + lib/hoststats/collector.go | 189 +++++++++++++++++++++++++++++++++++++ lib/hoststats/cpu.go | 118 +++++++++++++++++++++++ lib/hoststats/cpu_test.go | 77 +++++++++++++++ lib/hoststats/host.go | 95 +++++++++++++++++++ lib/hoststats/metrics.go | 75 +++++++++++++++ 6 files changed, 558 insertions(+) create mode 100644 lib/hoststats/collector.go create mode 100644 lib/hoststats/cpu.go create mode 100644 lib/hoststats/cpu_test.go create mode 100644 lib/hoststats/host.go create mode 100644 lib/hoststats/metrics.go diff --git a/agent/setup.go b/agent/setup.go index 9ed993aaf4..a978744704 100644 --- a/agent/setup.go +++ b/agent/setup.go @@ -4,6 +4,7 @@ package agent import ( + "context" "fmt" "io" "net" @@ -12,6 +13,7 @@ import ( "github.com/armon/go-metrics" "github.com/armon/go-metrics/prometheus" + "github.com/hashicorp/consul/lib/hoststats" "github.com/hashicorp/go-hclog" wal "github.com/hashicorp/raft-wal" "github.com/hashicorp/raft-wal/verifier" @@ -117,6 +119,7 @@ func NewBaseDeps(configLoader ConfigLoader, logOut io.Writer, providedLogger hcl if err != nil { return d, fmt.Errorf("failed to initialize telemetry: %w", err) } + hoststats.NewCollector(context.Background(), d.Logger, cfg.DataDir) d.TLSConfigurator, err = tlsutil.NewConfigurator(cfg.TLS, d.Logger) if err != nil { @@ -295,6 +298,7 @@ func getPrometheusDefs(cfg *config.RuntimeConfig, isServer bool) ([]prometheus.G Gauges, raftGauges, serverGauges, + hoststats.Gauges, } // TODO(ffmmm): conditionally add only leader specific metrics to gauges, counters, summaries, etc diff --git a/lib/hoststats/collector.go b/lib/hoststats/collector.go new file mode 100644 index 0000000000..a2c7bade67 --- /dev/null +++ b/lib/hoststats/collector.go @@ -0,0 +1,189 @@ 
+package hoststats
+
+import (
+	"context"
+	"fmt"
+	"math"
+	"runtime"
+	"sync"
+	"time"
+
+	"github.com/armon/go-metrics"
+	"github.com/hashicorp/go-hclog"
+	"github.com/shirou/gopsutil/v3/disk"
+	"github.com/shirou/gopsutil/v3/host"
+	"github.com/shirou/gopsutil/v3/mem"
+)
+
+// Collector collects host resource usage stats (uptime, memory, per-cpu usage
+// and disk usage of the data directory) and emits the latest sample as gauges.
+type Collector struct {
+	numCores int
+	// cpuCalculator holds one calculator per cpu, keyed by the cpu name
+	// reported in cpu.TimesStat.CPU.
+	cpuCalculator map[string]*cpuStatsCalculator
+	// hostStats is the most recent sample; guarded by hostStatsLock.
+	hostStats     *HostStats
+	hostStatsLock sync.RWMutex
+	dataDir       string
+
+	metrics    Metrics
+	baseLabels []metrics.Label
+
+	logger hclog.Logger
+}
+
+// NewCollector returns a Collector and starts its collection loop in a new
+// goroutine that runs until ctx is done. The dataDir is passed in so that we
+// can present the disk related statistics for the mountpoint where the dataDir
+// exists. opts are applied to the Collector before the loop starts.
+func NewCollector(ctx context.Context, logger hclog.Logger, dataDir string, opts ...CollectorOption) *Collector {
+	logger = logger.Named("host_stats")
+	// opts must be forwarded here; initCollector is the only place they are
+	// applied, so dropping them would silently ignore WithMetrics and
+	// WithBaseLabels.
+	collector := initCollector(logger, dataDir, opts...)
+	go collector.loop(ctx)
+	return collector
+}
+
+// initCollector initializes the Collector but does not start the collection
+// loop. When no option sets a metrics sink, metrics.Default() is used.
+func initCollector(logger hclog.Logger, dataDir string, opts ...CollectorOption) *Collector {
+	collector := &Collector{
+		cpuCalculator: make(map[string]*cpuStatsCalculator),
+		numCores:      runtime.NumCPU(),
+		logger:        logger,
+		dataDir:       dataDir,
+	}
+
+	for _, opt := range opts {
+		opt(collector)
+	}
+
+	if collector.metrics == nil {
+		collector.metrics = metrics.Default()
+	}
+	return collector
+}
+
+// loop collects and emits host stats right away and then once every
+// hostStatsCollectionInterval until ctx is done.
+func (h *Collector) loop(ctx context.Context) {
+	// A zero-duration timer fires immediately, so the first sample does not
+	// wait for a full collection interval.
+	next := time.NewTimer(0)
+	defer next.Stop()
+	for {
+		select {
+		case <-next.C:
+			h.collect()
+			next.Reset(hostStatsCollectionInterval)
+			h.Stats().Emit(h.metrics, h.baseLabels)
+
+		case <-ctx.Done():
+			return
+		}
+	}
+}
+
+// collect will collect stats related to resource usage of the host and store
+// them as the latest sample. A failure in one category is logged and that
+// category is left empty; the remaining categories are still collected.
+func (h *Collector) collect() {
+	// The lock is held for the whole collection because collectCPUStats
+	// mutates h.cpuCalculator.
+	h.hostStatsLock.Lock()
+	defer h.hostStatsLock.Unlock()
+	hs := &HostStats{Timestamp: time.Now().UTC().UnixNano()}
+
+	// Determine up-time
+	uptime, err := host.Uptime()
+	if err != nil {
+		h.logger.Error("failed to collect uptime stats", "error", err)
+		uptime = 0
+	}
+	hs.Uptime = uptime
+
+	// Collect memory stats
+	mstats, err := h.collectMemoryStats()
+	if err != nil {
+		h.logger.Error("failed to collect memory stats", "error", err)
+		mstats = &MemoryStats{}
+	}
+	hs.Memory = mstats
+
+	// Collect cpu stats
+	cpus, err := h.collectCPUStats()
+	if err != nil {
+		h.logger.Error("failed to collect cpu stats", "error", err)
+		cpus = []*CPUStats{}
+	}
+	hs.CPU = cpus
+
+	// Collect disk stats; on error DataDirStats stays nil and Emit skips it.
+	diskStats, err := h.collectDiskStats(h.dataDir)
+	if err != nil {
+		h.logger.Error("failed to collect dataDir disk stats", "error", err)
+	}
+	hs.DataDirStats = diskStats
+
+	// Update the collected status object.
+	h.hostStats = hs
+}
+
+// collectDiskStats returns the disk usage reported for dir.
+func (h *Collector) collectDiskStats(dir string) (*DiskStats, error) {
+	usage, err := disk.Usage(dir)
+	if err != nil {
+		return nil, fmt.Errorf("failed to collect disk usage stats: %w", err)
+	}
+	return h.toDiskStats(usage), nil
+}
+
+// collectMemoryStats returns the current virtual memory usage of the host.
+func (h *Collector) collectMemoryStats() (*MemoryStats, error) {
+	memStats, err := mem.VirtualMemory()
+	if err != nil {
+		return nil, err
+	}
+
+	// Named ms rather than mem so the imported mem package is not shadowed.
+	ms := &MemoryStats{
+		Total:       memStats.Total,
+		Available:   memStats.Available,
+		Used:        memStats.Used,
+		UsedPercent: memStats.UsedPercent,
+		Free:        memStats.Free,
+	}
+	return ms, nil
+}
+
+// Stats returns a copy of the host stats that has been collected, or an empty
+// HostStats when nothing has been collected yet.
+func (h *Collector) Stats() *HostStats {
+	h.hostStatsLock.RLock()
+	defer h.hostStatsLock.RUnlock()
+
+	if h.hostStats == nil {
+		return &HostStats{}
+	}
+
+	return h.hostStats.Clone()
+}
+
+// toDiskStats converts a disk.UsageStat into a DiskStats, replacing NaN
+// percentages with 0.
+func (h *Collector) toDiskStats(usage *disk.UsageStat) *DiskStats {
+	ds := DiskStats{
+		Size:              usage.Total,
+		Used:              usage.Used,
+		Available:         usage.Free,
+		UsedPercent:       usage.UsedPercent,
+		InodesUsedPercent: usage.InodesUsedPercent,
+		Path:              usage.Path,
+	}
+	if math.IsNaN(ds.UsedPercent) {
+		ds.UsedPercent = 0.0
+	}
+	if math.IsNaN(ds.InodesUsedPercent) {
+		ds.InodesUsedPercent = 0.0
+	}
+
+	return &ds
+}
+
+// CollectorOption customizes a Collector while it is being initialized.
+type CollectorOption func(c *Collector)
+
+// WithMetrics makes the Collector emit to m instead of metrics.Default().
+// A nil m is ignored.
+func WithMetrics(m *metrics.Metrics) CollectorOption {
+	return func(c *Collector) {
+		// A nil *metrics.Metrics stored in the Metrics interface compares
+		// non-nil, which would bypass the metrics.Default() fallback in
+		// initCollector.
+		if m != nil {
+			c.metrics = m
+		}
+	}
+}
+
+// WithBaseLabels sets the labels passed to Emit for every collected sample.
+func WithBaseLabels(labels []metrics.Label) CollectorOption {
+	return func(c *Collector) {
+		c.baseLabels = labels
+	}
+}
diff --git a/lib/hoststats/cpu.go b/lib/hoststats/cpu.go
new file mode 100644
index 0000000000..0fc3fc28c4
--- /dev/null
+++ b/lib/hoststats/cpu.go
@@ -0,0 +1,126 @@
+package hoststats
+
+import (
+	"math"
+	"time"
+
+	"github.com/shirou/gopsutil/v3/cpu"
+)
+
+// cpuStatsCalculator turns the cumulative cpu times reported for one cpu into
+// usage percentages by diffing each sample against the previous one.
+type cpuStatsCalculator struct {
+	prevIdle   float64
+	prevUser   float64
+	prevSystem float64
+	prevBusy   float64
+	prevTotal  float64
+}
+
+// calculate returns the idle, user, system and total (busy) cpu usage as
+// percentages of the cpu time elapsed since the previous call, and remembers
+// times as the baseline for the next call. When a percentage cannot be
+// computed (for example no cpu time elapsed between samples) idle is reported
+// as 100 and the other values as 0.
+func (h *cpuStatsCalculator) calculate(times cpu.TimesStat) (idle float64, user float64, system float64, total float64) {
+	currentIdle := times.Idle
+	currentUser := times.User
+	currentSystem := times.System
+	currentTotal := times.Total()
+	currentBusy := times.User + times.System + times.Nice + times.Iowait + times.Irq +
+		times.Softirq + times.Steal + times.Guest + times.GuestNice
+
+	// A zero delta yields NaN or Inf below; sanitizePercent maps those to the
+	// "fully idle" defaults.
+	deltaTotal := currentTotal - h.prevTotal
+	idle = sanitizePercent((currentIdle-h.prevIdle)/deltaTotal*100, 100.0)
+	user = sanitizePercent((currentUser-h.prevUser)/deltaTotal*100, 0.0)
+	system = sanitizePercent((currentSystem-h.prevSystem)/deltaTotal*100, 0.0)
+	total = sanitizePercent((currentBusy-h.prevBusy)/deltaTotal*100, 0.0)
+
+	h.prevIdle = currentIdle
+	h.prevUser = currentUser
+	h.prevSystem = currentSystem
+	h.prevTotal = currentTotal
+	h.prevBusy = currentBusy
+	return idle, user, system, total
+}
+
+// sanitizePercent returns fallback when v is NaN or infinite and v otherwise.
+func sanitizePercent(v, fallback float64) float64 {
+	if math.IsNaN(v) || math.IsInf(v, 0) {
+		return fallback
+	}
+	return v
+}
+
+// cpuStats calculates cpu usage percentage from successive cumulative cpu
+// time readings.
+type cpuStats struct {
+	prevCpuTime float64
+	prevTime    time.Time
+
+	totalCpus int
+}
+
+// percent calculates the cpu usage percentage based on the current cpu usage
+// and the previous cpu usage, where usage is given as time in nanoseconds
+// spent in the cpu. The first call only records a baseline and returns 0.
+func (c *cpuStats) percent(cpuTime float64) float64 {
+	now := time.Now()
+
+	if c.prevCpuTime == 0.0 {
+		// invoked first time
+		c.prevCpuTime = cpuTime
+		c.prevTime = now
+		return 0.0
+	}
+
+	timeDelta := now.Sub(c.prevTime).Nanoseconds()
+	ret := c.calculatePercent(c.prevCpuTime, cpuTime, timeDelta)
+	c.prevCpuTime = cpuTime
+	c.prevTime = now
+	return ret
+}
+
+// calculatePercent returns the cpu time consumed between readings t1 and t2 as
+// a percentage of timeDelta (nanoseconds). It returns 0 when either delta is
+// not positive.
+func (c *cpuStats) calculatePercent(t1, t2 float64, timeDelta int64) float64 {
+	vDelta := t2 - t1
+	if timeDelta <= 0 || vDelta <= 0.0 {
+		return 0.0
+	}
+
+	return (vDelta / float64(timeDelta)) * 100.0
+}
+
+// collectCPUStats samples the per-cpu times and converts them to usage
+// percentages using one cpuStatsCalculator per cpu, created on first sight.
+// It must not be called concurrently: h.cpuCalculator is updated without its
+// own lock (collect calls it while holding hostStatsLock).
+func (h *Collector) collectCPUStats() ([]*CPUStats, error) {
+	times, err := cpu.Times(true)
+	if err != nil {
+		return nil, err
+	}
+
+	cs := make([]*CPUStats, len(times))
+	for idx, t := range times {
+		calc, ok := h.cpuCalculator[t.CPU]
+		if !ok {
+			calc = &cpuStatsCalculator{}
+			h.cpuCalculator[t.CPU] = calc
+		}
+		idle, user, system, total := calc.calculate(t)
+		cs[idx] = &CPUStats{
+			CPU:    t.CPU,
+			User:   user,
+			System: system,
+			Idle:   idle,
+			Total:  total,
+		}
+	}
+
+	return cs, nil
+}
diff --git a/lib/hoststats/cpu_test.go b/lib/hoststats/cpu_test.go
new file mode 100644
index 0000000000..6de0823a96
--- /dev/null
+++ b/lib/hoststats/cpu_test.go
@@ -0,0 +1,86 @@
+package hoststats
+
+import (
+	"math"
+	"os"
+	"runtime"
+	"testing"
+	"time"
+
+	"github.com/hashicorp/consul/sdk/testutil"
+	"github.com/shirou/gopsutil/v3/cpu"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// TestCpuStats_percent checks that the first reading only primes the
+// calculator and that the percentage math is correct for known deltas.
+func TestCpuStats_percent(t *testing.T) {
+	cs := &cpuStats{
+		totalCpus: runtime.NumCPU(),
+	}
+
+	// The first reading has nothing to diff against.
+	require.Zero(t, cs.percent(79.7))
+
+	// A later reading is diffed against wall-clock time, so only its sign and
+	// the recorded baseline can be asserted deterministically.
+	require.GreaterOrEqual(t, cs.percent(80.69), 0.0)
+	require.Equal(t, 80.69, cs.prevCpuTime)
+
+	// 0.5s of cpu time over a 1s window is 50%.
+	window := time.Second.Nanoseconds()
+	require.InDelta(t, 50.0, cs.calculatePercent(1e9, 1.5e9, window), 1e-9)
+
+	// No cpu time consumed, or no time elapsed, reports 0.
+	require.Zero(t, cs.calculatePercent(1e9, 1e9, window))
+	require.Zero(t, cs.calculatePercent(1e9, 2e9, 0))
+}
+
+// TestHostStats_CPU collects twice on the real host and checks that every
+// reported cpu percentage is a finite number.
+func TestHostStats_CPU(t *testing.T) {
+	a := assert.New(t)
+
+	logger := testutil.Logger(t)
+	cwd, err := os.Getwd()
+	require.NoError(t, err)
+	hs := initCollector(logger, cwd)
+
+	// Collect twice so we can calculate percents we need to generate some work
+	// so that the cpu values change
+	hs.collect()
+	total := 0
+	for i := 1; i < 1000000000; i++ {
+		total *= i
+		total = total % i
+	}
+	hs.collect()
+	stats := hs.Stats()
+	a.NotZero(len(stats.CPU))
+
+	for _, c := range stats.CPU {
+		for _, v := range []float64{c.Idle, c.Total, c.System, c.User} {
+			a.False(math.IsNaN(v))
+			a.False(math.IsInf(v, 0))
+		}
+	}
+}
+
+// TestCpuStatsCalculator_Nan feeds the same sample twice so the time delta is
+// zero and checks the calculator falls back to 100% idle instead of NaN.
+func TestCpuStatsCalculator_Nan(t *testing.T) {
+	times := cpu.TimesStat{
+		User:   0.0,
+		Idle:   100.0,
+		System: 0.0,
+	}
+
+	calculator := &cpuStatsCalculator{}
+	calculator.calculate(times)
+	idle, user, system, total := calculator.calculate(times)
+	require.Equal(t, 100.0, idle)
+	require.Zero(t, user)
+	require.Zero(t, system)
+	require.Zero(t, total)
+}
diff --git a/lib/hoststats/host.go b/lib/hoststats/host.go
new file mode 100644
index 0000000000..8ba04c734a
--- /dev/null
+++ b/lib/hoststats/host.go
@@ -0,0 +1,95 @@
+package hoststats
+
+import (
+	"time"
+
+	"github.com/armon/go-metrics"
+)
+
+var hostStatsCollectionInterval = 10 * time.Second
+
+// HostStats represents resource usage hoststats of the host running a Consul agent
+type HostStats struct {
+	Memory       *MemoryStats
+	CPU          []*CPUStats
+	DataDirStats *DiskStats
+	Uptime       uint64
+	// Timestamp is the collection time in nanoseconds since the Unix epoch.
+	Timestamp int64
+}
+
+// Clone returns a deep copy of hs: the Memory, DataDirStats and CPU entries of
+// the copy do not share memory with hs, so callers may modify the result
+// freely. The returned CPU slice is always non-nil; nil CPU entries stay nil.
+func (hs *HostStats) Clone() *HostStats {
+	clone := *hs
+
+	// The struct copy above only copies the pointers; duplicate the pointees
+	// so the clone cannot alias the collector's stored sample.
+	if hs.Memory != nil {
+		memory := *hs.Memory
+		clone.Memory = &memory
+	}
+	if hs.DataDirStats != nil {
+		dataDir := *hs.DataDirStats
+		clone.DataDirStats = &dataDir
+	}
+
+	clone.CPU = make([]*CPUStats, len(hs.CPU))
+	for i, src := range hs.CPU {
+		if src == nil {
+			continue
+		}
+		cpu := *src
+		clone.CPU[i] = &cpu
+	}
+	return &clone
+}
+
+// Emit sets one gauge on sink for every collected value. Memory and disk
+// gauges are skipped when the corresponding stats are nil. Cpu gauges carry an
+// extra "cpu" label and disk gauges an extra "path" label on top of
+// baseLabels. Values are converted to float32 because that is what the sink
+// accepts, so very large byte counts lose precision.
+func (hs *HostStats) Emit(sink Metrics, baseLabels []metrics.Label) {
+
+	if hs.Memory != nil {
+		sink.SetGaugeWithLabels([]string{"host", "memory", "total"}, float32(hs.Memory.Total), baseLabels)
+		sink.SetGaugeWithLabels([]string{"host", "memory", "available"}, float32(hs.Memory.Available), baseLabels)
+		sink.SetGaugeWithLabels([]string{"host", "memory", "used"}, float32(hs.Memory.Used), baseLabels)
+		sink.SetGaugeWithLabels([]string{"host", "memory", "used_percent"}, float32(hs.Memory.UsedPercent), baseLabels)
+		sink.SetGaugeWithLabels([]string{"host", "memory", "free"}, float32(hs.Memory.Free), baseLabels)
+	}
+
+	for _, cpu := range hs.CPU {
+		if cpu == nil {
+			continue
+		}
+		labels := labelsWith(baseLabels, metrics.Label{
+			Name:  "cpu",
+			Value: cpu.CPU,
+		})
+
+		sink.SetGaugeWithLabels([]string{"host", "cpu", "total"}, float32(cpu.Total), labels)
+		sink.SetGaugeWithLabels([]string{"host", "cpu", "user"}, float32(cpu.User), labels)
+		sink.SetGaugeWithLabels([]string{"host", "cpu", "idle"}, float32(cpu.Idle), labels)
+		sink.SetGaugeWithLabels([]string{"host", "cpu", "system"}, float32(cpu.System), labels)
+	}
+
+	if hs.DataDirStats != nil {
+		diskLabels := labelsWith(baseLabels, metrics.Label{
+			Name:  "path",
+			Value: hs.DataDirStats.Path,
+		})
+
+		sink.SetGaugeWithLabels([]string{"host", "disk", "size"}, float32(hs.DataDirStats.Size), diskLabels)
+		sink.SetGaugeWithLabels([]string{"host", "disk", "used"}, float32(hs.DataDirStats.Used), diskLabels)
+		sink.SetGaugeWithLabels([]string{"host", "disk", "available"}, float32(hs.DataDirStats.Available), diskLabels)
+		sink.SetGaugeWithLabels([]string{"host", "disk", "used_percent"}, float32(hs.DataDirStats.UsedPercent), diskLabels)
+		sink.SetGaugeWithLabels([]string{"host", "disk", "inodes_percent"}, float32(hs.DataDirStats.InodesUsedPercent), diskLabels)
+	}
+
+	sink.SetGaugeWithLabels([]string{"host", "uptime"}, float32(hs.Uptime), baseLabels)
+}
+
+// labelsWith returns a freshly allocated slice holding base followed by extra.
+// Unlike append(base, extra) it never writes into base's backing array, so
+// label slices built from the same base cannot overwrite each other.
+func labelsWith(base []metrics.Label, extra metrics.Label) []metrics.Label {
+	out := make([]metrics.Label, 0, len(base)+1)
+	out = append(out, base...)
+	return append(out, extra)
+}
+
+// CPUStats represents hoststats related to cpu usage. User, System, Idle and
+// Total are percentages.
+type CPUStats struct {
+	CPU    string
+	User   float64
+	System float64
+	Idle   float64
+	Total  float64
+}
+
+// MemoryStats represents hoststats related to virtual memory usage
+type MemoryStats struct {
+	Total       uint64
+	Available   uint64
+	Used        uint64
+	UsedPercent float64
+	Free        uint64
+}
+
+// DiskStats represents hoststats related to disk usage
+type DiskStats struct {
+	Path              string
+	Size              uint64
+	Used              uint64
+	Available         uint64
+	UsedPercent       float64
+	InodesUsedPercent float64
+}
diff --git a/lib/hoststats/metrics.go b/lib/hoststats/metrics.go
new file mode 100644
index 0000000000..5cedfa2962
--- /dev/null
+++ b/lib/hoststats/metrics.go
@@ -0,0 +1,75 @@
+package hoststats
+
+import (
+	"github.com/armon/go-metrics"
+	"github.com/armon/go-metrics/prometheus"
+)
+
+// Metrics defines an interface for the methods used to emit data to the go-metrics library.
+// `metrics.Default()` should always satisfy this interface.
+type Metrics interface {
+	SetGaugeWithLabels(key []string, val float32, labels []metrics.Label)
+}
+
+// Compile-time check that the type returned by metrics.Default() satisfies
+// Metrics, so a change in go-metrics breaks the build instead of the agent.
+var _ Metrics = (*metrics.Metrics)(nil)
+
+// Gauges holds the Prometheus definition of every gauge set by
+// HostStats.Emit. Each Name must match the key passed to SetGaugeWithLabels
+// there.
+var Gauges = []prometheus.GaugeDefinition{
+	// Memory gauges.
+	{
+		Name: []string{"host", "memory", "total"},
+		Help: "Total physical memory in bytes",
+	},
+	{
+		Name: []string{"host", "memory", "available"},
+		Help: "Available physical memory in bytes",
+	},
+	{
+		Name: []string{"host", "memory", "free"},
+		Help: "Free physical memory in bytes",
+	},
+	{
+		Name: []string{"host", "memory", "used"},
+		Help: "Used physical memory in bytes",
+	},
+	{
+		Name: []string{"host", "memory", "used_percent"},
+		Help: "Percentage of physical memory in use",
+	},
+	// Cpu gauges; emitted once per cpu with a "cpu" label.
+	{
+		Name: []string{"host", "cpu", "total"},
+		Help: "Total cpu utilization",
+	},
+	{
+		Name: []string{"host", "cpu", "user"},
+		Help: "User cpu utilization",
+	},
+	{
+		Name: []string{"host", "cpu", "idle"},
+		Help: "Idle cpu utilization",
+	},
+	{
+		Name: []string{"host", "cpu", "system"},
+		Help: "System cpu utilization",
+	},
+	// Disk gauges; emitted with a "path" label.
+	{
+		Name: []string{"host", "disk", "size"},
+		Help: "Size of disk in bytes",
+	},
+	{
+		Name: []string{"host", "disk", "used"},
+		Help: "Disk usage in bytes",
+	},
+	{
+		Name: []string{"host", "disk", "available"},
+		Help: "Available bytes on disk",
+	},
+	{
+		Name: []string{"host", "disk", "used_percent"},
+		Help: "Percentage of disk space usage",
+	},
+	{
+		Name: []string{"host", "disk", "inodes_percent"},
+		Help: "Percentage of disk inodes usage",
+	},
+	// Host uptime gauge.
+	{
+		Name: []string{"host", "uptime"},
+		Help: "System uptime",
+	},
+}