hoststats: add package for collecting host statistics including cpu, memory and disk usage

pull/17038/head
Nick Ethier 2023-04-18 22:32:31 -04:00
parent 85cfec6b16
commit fafb68b28d
6 changed files with 558 additions and 0 deletions

View File

@ -4,6 +4,7 @@
package agent
import (
"context"
"fmt"
"io"
"net"
@ -12,6 +13,7 @@ import (
"github.com/armon/go-metrics"
"github.com/armon/go-metrics/prometheus"
"github.com/hashicorp/consul/lib/hoststats"
"github.com/hashicorp/go-hclog"
wal "github.com/hashicorp/raft-wal"
"github.com/hashicorp/raft-wal/verifier"
@ -117,6 +119,7 @@ func NewBaseDeps(configLoader ConfigLoader, logOut io.Writer, providedLogger hcl
if err != nil {
return d, fmt.Errorf("failed to initialize telemetry: %w", err)
}
hoststats.NewCollector(context.Background(), d.Logger, cfg.DataDir)
d.TLSConfigurator, err = tlsutil.NewConfigurator(cfg.TLS, d.Logger)
if err != nil {
@ -295,6 +298,7 @@ func getPrometheusDefs(cfg *config.RuntimeConfig, isServer bool) ([]prometheus.G
Gauges,
raftGauges,
serverGauges,
hoststats.Gauges,
}
// TODO(ffmmm): conditionally add only leader specific metrics to gauges, counters, summaries, etc

189
lib/hoststats/collector.go Normal file
View File

@ -0,0 +1,189 @@
package hoststats
import (
"context"
"fmt"
"math"
"runtime"
"sync"
"time"
"github.com/armon/go-metrics"
"github.com/hashicorp/go-hclog"
"github.com/shirou/gopsutil/v3/disk"
"github.com/shirou/gopsutil/v3/host"
"github.com/shirou/gopsutil/v3/mem"
)
// Collector collects host resource usage stats
type Collector struct {
	// numCores is the value of runtime.NumCPU() captured when the collector is initialized.
	numCores int
	// cpuCalculator holds one usage calculator per CPU, keyed by the CPU name
	// returned in each cpu.Times entry.
	cpuCalculator map[string]*cpuStatsCalculator
	// hostStats is the most recently collected snapshot; it is nil until the
	// first collection has completed.
	hostStats *HostStats
	// hostStatsLock guards hostStats: collect holds the write lock, Stats the read lock.
	hostStatsLock sync.RWMutex
	// dataDir is the path whose disk usage is reported in HostStats.DataDirStats.
	dataDir string
	// metrics is the sink the collection loop emits gauges to.
	metrics Metrics
	// baseLabels are passed to the sink with every emitted gauge.
	baseLabels []metrics.Label
	// logger reports collection failures.
	logger hclog.Logger
}
// NewCollector returns a Collector and starts its collection loop in a
// background goroutine; the loop exits when ctx is done. The dataDir is passed
// in so that we can present the disk related statistics for the mountpoint
// where the dataDir exists. Any opts are applied to the Collector before the
// loop starts.
func NewCollector(ctx context.Context, logger hclog.Logger, dataDir string, opts ...CollectorOption) *Collector {
	logger = logger.Named("host_stats")
	// Forward the caller's options; without this every CollectorOption
	// (custom sink, base labels) would be silently ignored.
	collector := initCollector(logger, dataDir, opts...)
	go collector.loop(ctx)
	return collector
}
// initCollector builds a Collector without starting its collection loop.
// Options are applied in the order given; if none of them installed a metrics
// sink, the go-metrics default is used.
func initCollector(logger hclog.Logger, dataDir string, opts ...CollectorOption) *Collector {
	c := &Collector{
		cpuCalculator: make(map[string]*cpuStatsCalculator),
		numCores:      runtime.NumCPU(),
		logger:        logger,
		dataDir:       dataDir,
	}

	for i := range opts {
		opts[i](c)
	}

	if c.metrics != nil {
		return c
	}
	c.metrics = metrics.Default()
	return c
}
// loop collects and emits host stats until ctx is done. The timer starts at
// zero so the first collection happens immediately; afterwards it is re-armed
// for hostStatsCollectionInterval after each collection.
func (h *Collector) loop(ctx context.Context) {
	timer := time.NewTimer(0)
	defer timer.Stop()

	done := ctx.Done()
	for {
		select {
		case <-done:
			return
		case <-timer.C:
		}

		h.collect()
		timer.Reset(hostStatsCollectionInterval)
		h.Stats().Emit(h.metrics, h.baseLabels)
	}
}
// collect gathers uptime, memory, cpu and dataDir disk statistics and stores
// them as the collector's current snapshot. A failing source is logged and
// replaced by an empty value (the disk stats are left nil) so the remaining
// sources are still reported. The write lock is held for the whole call.
func (h *Collector) collect() {
	h.hostStatsLock.Lock()
	defer h.hostStatsLock.Unlock()

	snapshot := &HostStats{Timestamp: time.Now().UTC().UnixNano()}

	var err error

	// Up-time
	if snapshot.Uptime, err = host.Uptime(); err != nil {
		h.logger.Error("failed to collect uptime stats", "error", err)
		snapshot.Uptime = 0
	}

	// Memory
	if snapshot.Memory, err = h.collectMemoryStats(); err != nil {
		h.logger.Error("failed to collect memory stats", "error", err)
		snapshot.Memory = &MemoryStats{}
	}

	// CPU
	if snapshot.CPU, err = h.collectCPUStats(); err != nil {
		h.logger.Error("failed to collect cpu stats", "error", err)
		snapshot.CPU = []*CPUStats{}
	}

	// Disk; collectDiskStats returns nil stats on error, which Emit skips.
	if snapshot.DataDirStats, err = h.collectDiskStats(h.dataDir); err != nil {
		h.logger.Error("failed to collect dataDir disk stats", "error", err)
	}

	// Publish the new snapshot.
	h.hostStats = snapshot
}
// collectDiskStats reports disk usage for dir, wrapping any error returned by
// disk.Usage.
func (h *Collector) collectDiskStats(dir string) (*DiskStats, error) {
	usage, err := disk.Usage(dir)
	if err == nil {
		return h.toDiskStats(usage), nil
	}
	return nil, fmt.Errorf("failed to collect disk usage stats: %w", err)
}
// collectMemoryStats copies the fields of interest from mem.VirtualMemory into
// a MemoryStats. The error from mem.VirtualMemory is returned unchanged.
func (h *Collector) collectMemoryStats() (*MemoryStats, error) {
	vm, err := mem.VirtualMemory()
	if err != nil {
		return nil, err
	}

	return &MemoryStats{
		Total:       vm.Total,
		Available:   vm.Available,
		Used:        vm.Used,
		UsedPercent: vm.UsedPercent,
		Free:        vm.Free,
	}, nil
}
// Stats returns a copy of the most recently collected host stats, or an empty
// HostStats when nothing has been collected yet.
func (h *Collector) Stats() *HostStats {
	h.hostStatsLock.RLock()
	defer h.hostStatsLock.RUnlock()

	if current := h.hostStats; current != nil {
		return current.Clone()
	}
	return &HostStats{}
}
// toDiskStats converts a disk.UsageStat into a DiskStats. NaN percentages are
// reported as 0.
func (h *Collector) toDiskStats(usage *disk.UsageStat) *DiskStats {
	usedPct := usage.UsedPercent
	if math.IsNaN(usedPct) {
		usedPct = 0.0
	}

	inodesPct := usage.InodesUsedPercent
	if math.IsNaN(inodesPct) {
		inodesPct = 0.0
	}

	return &DiskStats{
		Size:              usage.Total,
		Used:              usage.Used,
		Available:         usage.Free,
		UsedPercent:       usedPct,
		InodesUsedPercent: inodesPct,
		Path:              usage.Path,
	}
}
// CollectorOption customizes a Collector. initCollector applies each option,
// in order, to the newly built Collector before filling in defaults.
type CollectorOption func(c *Collector)
// WithMetrics makes the Collector emit to m instead of the go-metrics default.
// A nil m is ignored: storing a nil *metrics.Metrics in the Metrics interface
// field would yield a non-nil interface, defeating the collector's
// "metrics == nil" default and leaving it with an unusable sink.
func WithMetrics(m *metrics.Metrics) CollectorOption {
	return func(c *Collector) {
		if m == nil {
			return
		}
		c.metrics = m
	}
}
// WithBaseLabels sets the labels the Collector passes along with every gauge
// it emits. The slice is stored as given, not copied.
func WithBaseLabels(labels []metrics.Label) CollectorOption {
	apply := func(c *Collector) {
		c.baseLabels = labels
	}
	return apply
}

118
lib/hoststats/cpu.go Normal file
View File

@ -0,0 +1,118 @@
package hoststats
import (
"math"
"time"
"github.com/shirou/gopsutil/v3/cpu"
)
// cpuStatsCalculator calculates cpu usage percentages
//
// It remembers the cumulative counters passed to the previous calculate call
// so that each call reports usage over the interval between the two samples.
type cpuStatsCalculator struct {
	// prevIdle is the Idle counter from the previous sample.
	prevIdle float64
	// prevUser is the User counter from the previous sample.
	prevUser float64
	// prevSystem is the System counter from the previous sample.
	prevSystem float64
	// prevBusy is the sum of all non-idle counters from the previous sample.
	prevBusy float64
	// prevTotal is the value of TimesStat.Total() from the previous sample.
	prevTotal float64
}
// calculate returns the idle, user, system and total (busy) cpu usage as
// percentages of the total time elapsed since the previous call, then records
// times as the new previous sample. When a percentage is NaN or infinite (for
// example no time elapsed between samples) idle is reported as 100 and the
// others as 0.
func (h *cpuStatsCalculator) calculate(times cpu.TimesStat) (idle float64, user float64, system float64, total float64) {
	busyNow := times.User + times.System + times.Nice + times.Iowait + times.Irq +
		times.Softirq + times.Steal + times.Guest + times.GuestNice
	totalNow := times.Total()
	elapsed := totalNow - h.prevTotal

	// pct expresses the change of one counter as a percentage of elapsed,
	// substituting fallback when the result is not a finite number.
	pct := func(now, prev, fallback float64) float64 {
		v := ((now - prev) / elapsed) * 100
		if math.IsNaN(v) || math.IsInf(v, 0) {
			return fallback
		}
		return v
	}

	idle = pct(times.Idle, h.prevIdle, 100.0)
	user = pct(times.User, h.prevUser, 0.0)
	system = pct(times.System, h.prevSystem, 0.0)
	total = pct(busyNow, h.prevBusy, 0.0)

	h.prevIdle, h.prevUser, h.prevSystem = times.Idle, times.User, times.System
	h.prevTotal, h.prevBusy = totalNow, busyNow

	return idle, user, system, total
}
// cpuStats calculates cpu usage percentage from successive samples of
// cumulative cpu time.
type cpuStats struct {
	// prevCpuTime is the cpu time passed to the previous percent call.
	prevCpuTime float64
	// prevTime is the wall-clock time of the previous percent call; the zero
	// value means percent has not been called yet.
	prevTime time.Time
	// totalCpus is set by callers; it is not read by the methods below.
	totalCpus int
}

// percent calculates the cpu usage percentage based on the current cpu usage
// and the previous cpu usage where usage is given as time in nanoseconds spent
// in the cpu. The first call only records the sample and returns 0.
func (c *cpuStats) percent(cpuTime float64) float64 {
	now := time.Now()

	// Detect the first invocation from prevTime rather than prevCpuTime: a cpu
	// time of 0 is a legitimate first sample, and treating it as "never
	// called" would discard the following sample as well.
	if c.prevTime.IsZero() {
		c.prevCpuTime = cpuTime
		c.prevTime = now
		return 0.0
	}

	timeDelta := now.Sub(c.prevTime).Nanoseconds()
	ret := c.calculatePercent(c.prevCpuTime, cpuTime, timeDelta)
	c.prevCpuTime = cpuTime
	c.prevTime = now
	return ret
}

// calculatePercent returns (t2-t1)/timeDelta as a percentage. It returns 0
// when timeDelta is not positive or when the cpu time did not increase.
func (c *cpuStats) calculatePercent(t1, t2 float64, timeDelta int64) float64 {
	vDelta := t2 - t1
	if timeDelta <= 0 || vDelta <= 0.0 {
		return 0.0
	}

	return (vDelta / float64(timeDelta)) * 100.0
}
// collectCPUStats samples per-CPU times and converts them to usage
// percentages, creating a calculator for any CPU name seen for the first time.
// The error from cpu.Times is returned unchanged.
func (h *Collector) collectCPUStats() (cpus []*CPUStats, err error) {
	perCPU, err := cpu.Times(true)
	if err != nil {
		return nil, err
	}

	result := make([]*CPUStats, 0, len(perCPU))
	for _, sample := range perCPU {
		calc, known := h.cpuCalculator[sample.CPU]
		if !known {
			calc = &cpuStatsCalculator{}
			h.cpuCalculator[sample.CPU] = calc
		}

		idle, user, system, total := calc.calculate(sample)
		result = append(result, &CPUStats{
			CPU:    sample.CPU,
			User:   user,
			System: system,
			Idle:   idle,
			Total:  total,
		})
	}
	return result, nil
}

77
lib/hoststats/cpu_test.go Normal file
View File

@ -0,0 +1,77 @@
package hoststats
import (
"math"
"os"
"runtime"
"testing"
"time"
"github.com/hashicorp/consul/sdk/testutil"
"github.com/shirou/gopsutil/v3/cpu"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestCpuStats_percent checks that percent relates the cpu time consumed
// between two samples to the wall-clock time between them.
func TestCpuStats_percent(t *testing.T) {
	// Seed the previous sample one second in the past instead of sleeping.
	// percent takes cpu time in nanoseconds, so 79.7e9 is 79.7s of cpu time.
	cs := &cpuStats{
		totalCpus:   runtime.NumCPU(),
		prevCpuTime: 79.7e9,
		prevTime:    time.Now().Add(-1 * time.Second),
	}

	// 0.99s of cpu time over slightly more than 1s of wall time is just under
	// 99%; the lower bound leaves room for scheduling delay.
	percent := cs.percent(80.69e9)

	lower, upper := 95.00, 99.01
	// The bounds must be joined with ||: a value cannot be below the lower
	// bound and above the upper bound at once, so && could never fail.
	if percent < lower || percent > upper {
		t.Fatalf("expected between %v and %v, actual: %v", lower, upper, percent)
	}
}
// TestHostStats_CPU collects twice around a busy loop and verifies every
// reported per-CPU percentage is a finite number.
func TestHostStats_CPU(t *testing.T) {
	check := assert.New(t)

	logger := testutil.Logger(t)
	workDir, err := os.Getwd()
	check.Nil(err)
	collector := initCollector(logger, workDir)

	// Percentages are computed between two samples, so burn some cpu between
	// the collections to make the counters move.
	collector.collect()
	acc := 0
	for i := 1; i < 1000000000; i++ {
		acc *= i
		acc = acc % i
	}
	collector.collect()

	stats := collector.Stats()
	check.NotZero(len(stats.CPU))

	for _, c := range stats.CPU {
		values := []float64{c.Idle, c.Total, c.System, c.User}
		for _, v := range values {
			check.False(math.IsNaN(v))
		}
		for _, v := range values {
			check.False(math.IsInf(v, 0))
		}
	}
}
// TestCpuStatsCalculator_Nan feeds the calculator the same sample twice, so no
// time elapses between samples, and expects the fallback values rather than NaN.
func TestCpuStatsCalculator_Nan(t *testing.T) {
	sample := cpu.TimesStat{User: 0.0, Idle: 100.0, System: 0.0}
	calc := new(cpuStatsCalculator)

	// The first call only primes the previous-sample state.
	calc.calculate(sample)

	idle, user, system, total := calc.calculate(sample)
	require.Equal(t, 100.0, idle)
	require.Zero(t, user)
	require.Zero(t, system)
	require.Zero(t, total)
}

95
lib/hoststats/host.go Normal file
View File

@ -0,0 +1,95 @@
package hoststats
import (
"time"
"github.com/armon/go-metrics"
)
// hostStatsCollectionInterval is how long the collection loop waits after one
// collection before starting the next.
var hostStatsCollectionInterval = 10 * time.Second
// HostStats represents resource usage hoststats of the host running a Consul agent
type HostStats struct {
	// Memory holds the virtual memory statistics; Emit skips it when nil.
	Memory *MemoryStats
	// CPU holds one entry per CPU reported by the collector.
	CPU []*CPUStats
	// DataDirStats holds disk usage for the collector's dataDir; it is nil
	// when disk collection failed, in which case Emit skips it.
	DataDirStats *DiskStats
	// Uptime is the host uptime as returned by gopsutil's host.Uptime.
	Uptime uint64
	// Timestamp is the collection time in UTC as Unix nanoseconds.
	Timestamp int64
}
// Clone returns a deep copy of hs: the Memory, DataDirStats and CPU entries
// are copied rather than shared, so mutating the clone never affects hs. The
// returned CPU slice is always non-nil and nil CPU entries stay nil.
func (hs *HostStats) Clone() *HostStats {
	clone := *hs

	// The struct copy above only duplicated the pointers; copy the pointees
	// so the clone does not alias the original's data.
	if hs.Memory != nil {
		memory := *hs.Memory
		clone.Memory = &memory
	}
	if hs.DataDirStats != nil {
		dataDir := *hs.DataDirStats
		clone.DataDirStats = &dataDir
	}

	clone.CPU = make([]*CPUStats, len(hs.CPU))
	for i, src := range hs.CPU {
		if src == nil {
			continue
		}
		cpu := *src
		clone.CPU[i] = &cpu
	}
	return &clone
}
// Emit publishes the stats as gauges on sink. Memory, uptime, per-CPU and disk
// gauges all carry baseLabels; per-CPU gauges additionally carry a "cpu" label
// and disk gauges a "path" label. Memory and disk gauges are skipped when the
// corresponding stats are nil. baseLabels itself is never modified.
func (hs *HostStats) Emit(sink Metrics, baseLabels []metrics.Label) {
	// Clip the capacity so that every append below allocates a fresh backing
	// array. Appending to baseLabels directly could write into the caller's
	// spare capacity, making all per-CPU and disk label slices share (and
	// overwrite) the same element if the sink holds on to them.
	base := baseLabels[:len(baseLabels):len(baseLabels)]

	if hs.Memory != nil {
		sink.SetGaugeWithLabels([]string{"host", "memory", "total"}, float32(hs.Memory.Total), baseLabels)
		sink.SetGaugeWithLabels([]string{"host", "memory", "available"}, float32(hs.Memory.Available), baseLabels)
		sink.SetGaugeWithLabels([]string{"host", "memory", "used"}, float32(hs.Memory.Used), baseLabels)
		sink.SetGaugeWithLabels([]string{"host", "memory", "used_percent"}, float32(hs.Memory.UsedPercent), baseLabels)
		sink.SetGaugeWithLabels([]string{"host", "memory", "free"}, float32(hs.Memory.Free), baseLabels)
	}

	for _, cpu := range hs.CPU {
		labels := append(base, metrics.Label{
			Name:  "cpu",
			Value: cpu.CPU,
		})

		sink.SetGaugeWithLabels([]string{"host", "cpu", "total"}, float32(cpu.Total), labels)
		sink.SetGaugeWithLabels([]string{"host", "cpu", "user"}, float32(cpu.User), labels)
		sink.SetGaugeWithLabels([]string{"host", "cpu", "idle"}, float32(cpu.Idle), labels)
		sink.SetGaugeWithLabels([]string{"host", "cpu", "system"}, float32(cpu.System), labels)
	}

	if hs.DataDirStats != nil {
		diskLabels := append(base, metrics.Label{
			Name:  "path",
			Value: hs.DataDirStats.Path,
		})

		sink.SetGaugeWithLabels([]string{"host", "disk", "size"}, float32(hs.DataDirStats.Size), diskLabels)
		sink.SetGaugeWithLabels([]string{"host", "disk", "used"}, float32(hs.DataDirStats.Used), diskLabels)
		sink.SetGaugeWithLabels([]string{"host", "disk", "available"}, float32(hs.DataDirStats.Available), diskLabels)
		sink.SetGaugeWithLabels([]string{"host", "disk", "used_percent"}, float32(hs.DataDirStats.UsedPercent), diskLabels)
		sink.SetGaugeWithLabels([]string{"host", "disk", "inodes_percent"}, float32(hs.DataDirStats.InodesUsedPercent), diskLabels)
	}

	sink.SetGaugeWithLabels([]string{"host", "uptime"}, float32(hs.Uptime), baseLabels)
}
// CPUStats represents hoststats related to cpu usage
//
// User, System, Idle and Total are percentages of the time elapsed between
// two consecutive collections.
type CPUStats struct {
	// CPU is the CPU name as returned in the cpu.Times entry; it is emitted as the "cpu" label.
	CPU string
	// User is the share of time spent in user mode.
	User float64
	// System is the share of time spent in system mode.
	System float64
	// Idle is the share of time spent idle.
	Idle float64
	// Total is the share of time spent in any non-idle state.
	Total float64
}
// MemoryStats represents hoststats related to virtual memory usage
//
// The fields are copied from the like-named fields of the value returned by
// gopsutil's mem.VirtualMemory.
type MemoryStats struct {
	Total       uint64
	Available   uint64
	Used        uint64
	UsedPercent float64
	Free        uint64
}
// DiskStats represents hoststats related to disk usage
type DiskStats struct {
	// Path is the path reported by disk.Usage; it is emitted as the "path" label.
	Path string
	// Size is taken from UsageStat.Total.
	Size uint64
	// Used is taken from UsageStat.Used.
	Used uint64
	// Available is taken from UsageStat.Free.
	Available uint64
	// UsedPercent is the percentage of space in use; NaN is stored as 0.
	UsedPercent float64
	// InodesUsedPercent is the percentage of inodes in use; NaN is stored as 0.
	InodesUsedPercent float64
}

75
lib/hoststats/metrics.go Normal file
View File

@ -0,0 +1,75 @@
package hoststats
import (
"github.com/armon/go-metrics"
"github.com/armon/go-metrics/prometheus"
)
// Metrics defines an interface for the methods used to emit data to the go-metrics library.
// `metrics.Default()` should always satisfy this interface.
type Metrics interface {
	// SetGaugeWithLabels records val for the gauge identified by key and labels.
	SetGaugeWithLabels(key []string, val float32, labels []metrics.Label)
}
// Gauges lists the Prometheus definitions (name and help text) of the gauges
// emitted by HostStats.Emit. Keep the names in sync with the keys used there.
var Gauges = []prometheus.GaugeDefinition{
	{
		Name: []string{"host", "memory", "total"},
		Help: "Total physical memory in bytes",
	},
	{
		Name: []string{"host", "memory", "available"},
		Help: "Available physical memory in bytes",
	},
	{
		Name: []string{"host", "memory", "free"},
		Help: "Free physical memory in bytes",
	},
	{
		Name: []string{"host", "memory", "used"},
		Help: "Used physical memory in bytes",
	},
	{
		Name: []string{"host", "memory", "used_percent"},
		Help: "Percentage of physical memory in use",
	},
	{
		Name: []string{"host", "cpu", "total"},
		Help: "Total cpu utilization",
	},
	{
		Name: []string{"host", "cpu", "user"},
		Help: "User cpu utilization",
	},
	{
		Name: []string{"host", "cpu", "idle"},
		Help: "Idle cpu utilization",
	},
	{
		Name: []string{"host", "cpu", "system"},
		Help: "System cpu utilization",
	},
	{
		Name: []string{"host", "disk", "size"},
		Help: "Size of disk in bytes",
	},
	{
		Name: []string{"host", "disk", "used"},
		Help: "Disk usage in bytes",
	},
	{
		Name: []string{"host", "disk", "available"},
		Help: "Available bytes on disk",
	},
	{
		Name: []string{"host", "disk", "used_percent"},
		Help: "Percentage of disk space usage",
	},
	{
		Name: []string{"host", "disk", "inodes_percent"},
		Help: "Percentage of disk inodes usage",
	},
	{
		Name: []string{"host", "uptime"},
		Help: "System uptime",
	},
}