mirror of https://github.com/hashicorp/consul
Nick Ethier
2 years ago
committed by
GitHub
17 changed files with 598 additions and 11 deletions
@ -0,0 +1,3 @@
|
||||
```release-note:improvement |
||||
agent: add new metrics to track cpu, disk, and memory usage for server hosts (defaults to: enabled) |
||||
``` |
@ -0,0 +1,189 @@
|
||||
package hoststats |
||||
|
||||
import ( |
||||
"context" |
||||
"fmt" |
||||
"math" |
||||
"runtime" |
||||
"sync" |
||||
"time" |
||||
|
||||
"github.com/armon/go-metrics" |
||||
"github.com/hashicorp/go-hclog" |
||||
"github.com/shirou/gopsutil/v3/disk" |
||||
"github.com/shirou/gopsutil/v3/host" |
||||
"github.com/shirou/gopsutil/v3/mem" |
||||
) |
||||
|
||||
// Collector collects host resource usage stats
|
||||
type Collector struct { |
||||
numCores int |
||||
cpuCalculator map[string]*cpuStatsCalculator |
||||
hostStats *HostStats |
||||
hostStatsLock sync.RWMutex |
||||
dataDir string |
||||
|
||||
metrics Metrics |
||||
baseLabels []metrics.Label |
||||
|
||||
logger hclog.Logger |
||||
} |
||||
|
||||
// NewCollector returns a Collector. The dataDir is passed in
|
||||
// so that we can present the disk related statistics for the mountpoint where the dataDir exists
|
||||
func NewCollector(ctx context.Context, logger hclog.Logger, dataDir string, opts ...CollectorOption) *Collector { |
||||
logger = logger.Named("host_stats") |
||||
collector := initCollector(logger, dataDir) |
||||
go collector.loop(ctx) |
||||
return collector |
||||
} |
||||
|
||||
// initCollector initializes the Collector but does not start the collection loop
|
||||
func initCollector(logger hclog.Logger, dataDir string, opts ...CollectorOption) *Collector { |
||||
numCores := runtime.NumCPU() |
||||
statsCalculator := make(map[string]*cpuStatsCalculator) |
||||
collector := &Collector{ |
||||
cpuCalculator: statsCalculator, |
||||
numCores: numCores, |
||||
logger: logger, |
||||
dataDir: dataDir, |
||||
} |
||||
|
||||
for _, opt := range opts { |
||||
opt(collector) |
||||
} |
||||
|
||||
if collector.metrics == nil { |
||||
collector.metrics = metrics.Default() |
||||
} |
||||
return collector |
||||
} |
||||
|
||||
func (c *Collector) loop(ctx context.Context) { |
||||
// Start collecting host stats right away and then keep collecting every
|
||||
// collection interval
|
||||
next := time.NewTimer(0) |
||||
defer next.Stop() |
||||
for { |
||||
select { |
||||
case <-next.C: |
||||
c.collect() |
||||
next.Reset(hostStatsCollectionInterval) |
||||
c.Stats().Emit(c.metrics, c.baseLabels) |
||||
|
||||
case <-ctx.Done(): |
||||
return |
||||
} |
||||
} |
||||
} |
||||
|
||||
// collect will collect stats related to resource usage of the host
|
||||
func (c *Collector) collect() { |
||||
hs := &HostStats{Timestamp: time.Now().UTC().UnixNano()} |
||||
|
||||
// Determine up-time
|
||||
uptime, err := host.Uptime() |
||||
if err != nil { |
||||
c.logger.Error("failed to collect uptime stats", "error", err) |
||||
uptime = 0 |
||||
} |
||||
hs.Uptime = uptime |
||||
|
||||
// Collect memory stats
|
||||
mstats, err := c.collectMemoryStats() |
||||
if err != nil { |
||||
c.logger.Error("failed to collect memory stats", "error", err) |
||||
mstats = &MemoryStats{} |
||||
} |
||||
hs.Memory = mstats |
||||
|
||||
// Collect cpu stats
|
||||
cpus, err := c.collectCPUStats() |
||||
if err != nil { |
||||
c.logger.Error("failed to collect cpu stats", "error", err) |
||||
cpus = []*CPUStats{} |
||||
} |
||||
hs.CPU = cpus |
||||
|
||||
// Collect disk stats
|
||||
diskStats, err := c.collectDiskStats(c.dataDir) |
||||
if err != nil { |
||||
c.logger.Error("failed to collect dataDir disk stats", "error", err) |
||||
} |
||||
hs.DataDirStats = diskStats |
||||
|
||||
// Update the collected status object.
|
||||
c.hostStatsLock.Lock() |
||||
c.hostStats = hs |
||||
c.hostStatsLock.Unlock() |
||||
} |
||||
|
||||
func (c *Collector) collectDiskStats(dir string) (*DiskStats, error) { |
||||
usage, err := disk.Usage(dir) |
||||
if err != nil { |
||||
return nil, fmt.Errorf("failed to collect disk usage stats: %w", err) |
||||
} |
||||
return c.toDiskStats(usage), nil |
||||
} |
||||
|
||||
func (c *Collector) collectMemoryStats() (*MemoryStats, error) { |
||||
memStats, err := mem.VirtualMemory() |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
mem := &MemoryStats{ |
||||
Total: memStats.Total, |
||||
Available: memStats.Available, |
||||
Used: memStats.Used, |
||||
UsedPercent: memStats.UsedPercent, |
||||
Free: memStats.Free, |
||||
} |
||||
|
||||
return mem, nil |
||||
} |
||||
|
||||
// Stats returns the host stats that has been collected
|
||||
func (c *Collector) Stats() *HostStats { |
||||
c.hostStatsLock.RLock() |
||||
defer c.hostStatsLock.RUnlock() |
||||
|
||||
if c.hostStats == nil { |
||||
return &HostStats{} |
||||
} |
||||
|
||||
return c.hostStats.Clone() |
||||
} |
||||
|
||||
// toDiskStats merges UsageStat and PartitionStat to create a DiskStat
|
||||
func (c *Collector) toDiskStats(usage *disk.UsageStat) *DiskStats { |
||||
ds := DiskStats{ |
||||
Size: usage.Total, |
||||
Used: usage.Used, |
||||
Available: usage.Free, |
||||
UsedPercent: usage.UsedPercent, |
||||
InodesUsedPercent: usage.InodesUsedPercent, |
||||
Path: usage.Path, |
||||
} |
||||
if math.IsNaN(ds.UsedPercent) { |
||||
ds.UsedPercent = 0.0 |
||||
} |
||||
if math.IsNaN(ds.InodesUsedPercent) { |
||||
ds.InodesUsedPercent = 0.0 |
||||
} |
||||
|
||||
return &ds |
||||
} |
||||
|
||||
type CollectorOption func(c *Collector) |
||||
|
||||
func WithMetrics(m *metrics.Metrics) CollectorOption { |
||||
return func(c *Collector) { |
||||
c.metrics = m |
||||
} |
||||
} |
||||
|
||||
func WithBaseLabels(labels []metrics.Label) CollectorOption { |
||||
return func(c *Collector) { |
||||
c.baseLabels = labels |
||||
} |
||||
} |
@ -0,0 +1,85 @@
|
||||
package hoststats |
||||
|
||||
import ( |
||||
"math" |
||||
|
||||
"github.com/shirou/gopsutil/v3/cpu" |
||||
) |
||||
|
||||
// cpuStatsCalculator calculates cpu usage percentages
|
||||
type cpuStatsCalculator struct { |
||||
prev cpu.TimesStat |
||||
prevBusy float64 |
||||
prevTotal float64 |
||||
} |
||||
|
||||
// calculate the current cpu usage percentages.
|
||||
// Since the cpu.TimesStat captures the total time a cpu spent in various states
|
||||
// this function tracks the last seen stat and derives each cpu state's utilization
|
||||
// as a percentage of the total change in cpu time between calls.
|
||||
// The first time calculate is called CPUStats will report %100 idle
|
||||
// usage since there is not a previous value to calculate against
|
||||
func (h *cpuStatsCalculator) calculate(times cpu.TimesStat) *CPUStats { |
||||
|
||||
// sum all none idle counters to get the total busy cpu time
|
||||
currentBusy := times.User + times.System + times.Nice + times.Iowait + times.Irq + |
||||
times.Softirq + times.Steal + times.Guest + times.GuestNice |
||||
// sum of the total cpu time
|
||||
currentTotal := currentBusy + times.Idle |
||||
|
||||
// calculate how much cpu time has passed since last calculation
|
||||
deltaTotal := currentTotal - h.prevTotal |
||||
|
||||
stats := &CPUStats{ |
||||
CPU: times.CPU, |
||||
|
||||
// calculate each percentage as the ratio of the change
|
||||
// in each state's time to the total change in cpu time
|
||||
Idle: ((times.Idle - h.prev.Idle) / deltaTotal) * 100, |
||||
User: ((times.User - h.prev.User) / deltaTotal) * 100, |
||||
System: ((times.System - h.prev.System) / deltaTotal) * 100, |
||||
Iowait: ((times.Iowait - h.prev.Iowait) / deltaTotal) * 100, |
||||
Total: ((currentBusy - h.prevBusy) / deltaTotal) * 100, |
||||
} |
||||
|
||||
// Protect against any invalid values
|
||||
if math.IsNaN(stats.Idle) || math.IsInf(stats.Idle, 0) { |
||||
stats.Idle = 100.0 |
||||
} |
||||
if math.IsNaN(stats.User) || math.IsInf(stats.User, 0) { |
||||
stats.User = 0.0 |
||||
} |
||||
if math.IsNaN(stats.System) || math.IsInf(stats.System, 0) { |
||||
stats.System = 0.0 |
||||
} |
||||
if math.IsNaN(stats.Iowait) || math.IsInf(stats.Iowait, 0) { |
||||
stats.Iowait = 0.0 |
||||
} |
||||
if math.IsNaN(stats.Total) || math.IsInf(stats.Total, 0) { |
||||
stats.Total = 0.0 |
||||
} |
||||
|
||||
h.prev = times |
||||
h.prevTotal = currentTotal |
||||
h.prevBusy = currentBusy |
||||
return stats |
||||
} |
||||
|
||||
func (c *Collector) collectCPUStats() (cpus []*CPUStats, err error) { |
||||
|
||||
cpuStats, err := cpu.Times(true) |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
cs := make([]*CPUStats, len(cpuStats)) |
||||
for idx, cpuStat := range cpuStats { |
||||
percentCalculator, ok := c.cpuCalculator[cpuStat.CPU] |
||||
if !ok { |
||||
percentCalculator = &cpuStatsCalculator{} |
||||
c.cpuCalculator[cpuStat.CPU] = percentCalculator |
||||
} |
||||
cs[idx] = percentCalculator.calculate(cpuStat) |
||||
} |
||||
|
||||
return cs, nil |
||||
} |
@ -0,0 +1,58 @@
|
||||
package hoststats |
||||
|
||||
import ( |
||||
"math" |
||||
"os" |
||||
"testing" |
||||
"time" |
||||
|
||||
"github.com/hashicorp/consul/sdk/testutil" |
||||
"github.com/shirou/gopsutil/v3/cpu" |
||||
"github.com/stretchr/testify/assert" |
||||
"github.com/stretchr/testify/require" |
||||
) |
||||
|
||||
func TestHostStats_CPU(t *testing.T) { |
||||
logger := testutil.Logger(t) |
||||
cwd, err := os.Getwd() |
||||
assert.Nil(t, err) |
||||
hs := initCollector(logger, cwd) |
||||
|
||||
// Collect twice so we can calculate percents we need to generate some work
|
||||
// so that the cpu values change
|
||||
hs.collect() |
||||
for begin := time.Now(); time.Now().Sub(begin) < 100*time.Millisecond; { |
||||
} |
||||
hs.collect() |
||||
stats := hs.Stats() |
||||
assert.NotZero(t, len(stats.CPU)) |
||||
|
||||
for _, cpu := range stats.CPU { |
||||
assert.False(t, math.IsNaN(cpu.Idle)) |
||||
assert.False(t, math.IsNaN(cpu.Total)) |
||||
assert.False(t, math.IsNaN(cpu.System)) |
||||
assert.False(t, math.IsNaN(cpu.User)) |
||||
|
||||
assert.False(t, math.IsInf(cpu.Idle, 0)) |
||||
assert.False(t, math.IsInf(cpu.Total, 0)) |
||||
assert.False(t, math.IsInf(cpu.System, 0)) |
||||
assert.False(t, math.IsInf(cpu.User, 0)) |
||||
} |
||||
} |
||||
|
||||
func TestCpuStatsCalculator_Nan(t *testing.T) { |
||||
times := cpu.TimesStat{ |
||||
User: 0.0, |
||||
Idle: 100.0, |
||||
System: 0.0, |
||||
} |
||||
|
||||
calculator := &cpuStatsCalculator{} |
||||
calculator.calculate(times) |
||||
stats := calculator.calculate(times) |
||||
require.Equal(t, 100.0, stats.Idle) |
||||
require.Zero(t, stats.User) |
||||
require.Zero(t, stats.System) |
||||
require.Zero(t, stats.Iowait) |
||||
require.Zero(t, stats.Total) |
||||
} |
@ -0,0 +1,92 @@
|
||||
package hoststats |
||||
|
||||
import ( |
||||
"time" |
||||
|
||||
"github.com/armon/go-metrics" |
||||
) |
||||
|
||||
var hostStatsCollectionInterval = 10 * time.Second |
||||
|
||||
// HostStats represents resource usage hoststats of the host running a Consul agent
|
||||
type HostStats struct { |
||||
Memory *MemoryStats |
||||
CPU []*CPUStats |
||||
DataDirStats *DiskStats |
||||
Uptime uint64 |
||||
Timestamp int64 |
||||
} |
||||
|
||||
func (hs *HostStats) Clone() *HostStats { |
||||
clone := &HostStats{} |
||||
*clone = *hs |
||||
return clone |
||||
} |
||||
|
||||
func (hs *HostStats) Emit(sink Metrics, baseLabels []metrics.Label) { |
||||
|
||||
if hs.Memory != nil { |
||||
sink.SetGaugeWithLabels([]string{"host", "memory", "total"}, float32(hs.Memory.Total), baseLabels) |
||||
sink.SetGaugeWithLabels([]string{"host", "memory", "available"}, float32(hs.Memory.Available), baseLabels) |
||||
sink.SetGaugeWithLabels([]string{"host", "memory", "used"}, float32(hs.Memory.Used), baseLabels) |
||||
sink.SetGaugeWithLabels([]string{"host", "memory", "used_percent"}, float32(hs.Memory.UsedPercent), baseLabels) |
||||
sink.SetGaugeWithLabels([]string{"host", "memory", "free"}, float32(hs.Memory.Free), baseLabels) |
||||
} |
||||
|
||||
for _, cpu := range hs.CPU { |
||||
labels := append(baseLabels, metrics.Label{ |
||||
Name: "cpu", |
||||
Value: cpu.CPU, |
||||
}) |
||||
|
||||
sink.SetGaugeWithLabels([]string{"host", "cpu", "total"}, float32(cpu.Total), labels) |
||||
sink.SetGaugeWithLabels([]string{"host", "cpu", "user"}, float32(cpu.User), labels) |
||||
sink.SetGaugeWithLabels([]string{"host", "cpu", "idle"}, float32(cpu.Idle), labels) |
||||
sink.SetGaugeWithLabels([]string{"host", "cpu", "iowait"}, float32(cpu.Iowait), labels) |
||||
sink.SetGaugeWithLabels([]string{"host", "cpu", "system"}, float32(cpu.System), labels) |
||||
} |
||||
|
||||
if hs.DataDirStats != nil { |
||||
diskLabels := append(baseLabels, metrics.Label{ |
||||
Name: "path", |
||||
Value: hs.DataDirStats.Path, |
||||
}) |
||||
|
||||
sink.SetGaugeWithLabels([]string{"host", "disk", "size"}, float32(hs.DataDirStats.Size), diskLabels) |
||||
sink.SetGaugeWithLabels([]string{"host", "disk", "used"}, float32(hs.DataDirStats.Used), diskLabels) |
||||
sink.SetGaugeWithLabels([]string{"host", "disk", "available"}, float32(hs.DataDirStats.Available), diskLabels) |
||||
sink.SetGaugeWithLabels([]string{"host", "disk", "used_percent"}, float32(hs.DataDirStats.UsedPercent), diskLabels) |
||||
sink.SetGaugeWithLabels([]string{"host", "disk", "inodes_percent"}, float32(hs.DataDirStats.InodesUsedPercent), diskLabels) |
||||
} |
||||
|
||||
sink.SetGaugeWithLabels([]string{"host", "uptime"}, float32(hs.Uptime), baseLabels) |
||||
} |
||||
|
||||
// CPUStats holds the utilization of a single CPU. Apart from CPU, every field
// is a percentage of the CPU time that elapsed between two samples.
type CPUStats struct {
	// CPU is the CPU name taken from the sampled cpu.TimesStat.
	CPU string
	// User, System, Idle and Iowait are the shares of time spent in the
	// corresponding states.
	User   float64
	System float64
	Idle   float64
	Iowait float64
	// Total is the share of time spent in any non-idle state.
	Total float64
}
||||
|
||||
// MemoryStats holds the virtual memory figures reported for the host. The
// values are copied from gopsutil's mem.VirtualMemory result.
type MemoryStats struct {
	// Total, Available, Used and Free are amounts of memory; the gauge
	// definitions in this package describe them as bytes.
	Total     uint64
	Available uint64
	Used      uint64
	// UsedPercent is the percentage of memory in use.
	UsedPercent float64
	Free        uint64
}
||||
|
||||
// DiskStats holds disk usage figures for the filesystem containing Path.
type DiskStats struct {
	// Path is the path the usage was looked up for.
	Path string
	// Size, Used and Available are amounts of disk space; the gauge
	// definitions in this package describe them as bytes.
	Size      uint64
	Used      uint64
	Available uint64
	// UsedPercent is the percentage of disk space in use.
	UsedPercent float64
	// InodesUsedPercent is the percentage of inodes in use.
	InodesUsedPercent float64
}
@ -0,0 +1,79 @@
|
||||
package hoststats |
||||
|
||||
import ( |
||||
"github.com/armon/go-metrics" |
||||
"github.com/armon/go-metrics/prometheus" |
||||
) |
||||
|
||||
// Metrics defines an interface for the methods used to emit data to the go-metrics library.
|
||||
// `metrics.Default()` should always satisfy this interface.
|
||||
type Metrics interface { |
||||
SetGaugeWithLabels(key []string, val float32, labels []metrics.Label) |
||||
} |
||||
|
||||
var Gauges = []prometheus.GaugeDefinition{ |
||||
{ |
||||
Name: []string{"host", "memory", "total"}, |
||||
Help: "Total physical memory in bytes", |
||||
}, |
||||
{ |
||||
Name: []string{"host", "memory", "available"}, |
||||
Help: "Available physical memory in bytes", |
||||
}, |
||||
{ |
||||
Name: []string{"host", "memory", "free"}, |
||||
Help: "Free physical memory in bytes", |
||||
}, |
||||
{ |
||||
Name: []string{"host", "memory", "used"}, |
||||
Help: "Used physical memory in bytes", |
||||
}, |
||||
{ |
||||
Name: []string{"host", "memory", "used_percent"}, |
||||
Help: "Percentage of physical memory in use", |
||||
}, |
||||
{ |
||||
Name: []string{"host", "cpu", "total"}, |
||||
Help: "Total cpu utilization", |
||||
}, |
||||
{ |
||||
Name: []string{"host", "cpu", "user"}, |
||||
Help: "User cpu utilization", |
||||
}, |
||||
{ |
||||
Name: []string{"host", "cpu", "idle"}, |
||||
Help: "Idle cpu utilization", |
||||
}, |
||||
{ |
||||
Name: []string{"host", "cpu", "iowait"}, |
||||
Help: "Iowait cpu utilization", |
||||
}, |
||||
{ |
||||
Name: []string{"host", "cpu", "system"}, |
||||
Help: "System cpu utilization", |
||||
}, |
||||
{ |
||||
Name: []string{"host", "disk", "size"}, |
||||
Help: "Size of disk in bytes", |
||||
}, |
||||
{ |
||||
Name: []string{"host", "disk", "used"}, |
||||
Help: "Disk usage in bytes", |
||||
}, |
||||
{ |
||||
Name: []string{"host", "disk", "available"}, |
||||
Help: "Available bytes on disk", |
||||
}, |
||||
{ |
||||
Name: []string{"host", "disk", "used_percent"}, |
||||
Help: "Percentage of disk space usage", |
||||
}, |
||||
{ |
||||
Name: []string{"host", "disk", "inodes_percent"}, |
||||
Help: "Percentage of disk inodes usage", |
||||
}, |
||||
{ |
||||
Name: []string{"host", "uptime"}, |
||||
Help: "System uptime", |
||||
}, |
||||
} |
Loading…
Reference in new issue