pr review fixes and documentation

pull/17038/head
Nick Ethier 2023-05-22 17:02:56 -04:00
parent ac6071721c
commit 2a3dd5066a
7 changed files with 50 additions and 52 deletions

View File

@ -1,3 +1,3 @@
```release-note:improvement
agent: add new metrics to track cpu disk and memory usage for server hosts
agent: add new metrics to track cpu, disk, and memory usage for server hosts (defaults to: enabled)
```

View File

@ -13,7 +13,6 @@ import (
"github.com/armon/go-metrics"
"github.com/armon/go-metrics/prometheus"
"github.com/hashicorp/consul/lib/hoststats"
"github.com/hashicorp/go-hclog"
wal "github.com/hashicorp/raft-wal"
"github.com/hashicorp/raft-wal/verifier"
@ -43,6 +42,7 @@ import (
"github.com/hashicorp/consul/agent/xds"
"github.com/hashicorp/consul/ipaddr"
"github.com/hashicorp/consul/lib"
"github.com/hashicorp/consul/lib/hoststats"
"github.com/hashicorp/consul/logging"
"github.com/hashicorp/consul/tlsutil"
)
@ -61,7 +61,7 @@ type BaseDeps struct {
WatchedFiles []string
deregisterBalancer, deregisterResolver func()
stopHostCollector func()
stopHostCollector context.CancelFunc
}
type ConfigLoader func(source config.Source) (config.LoadResult, error)

View File

@ -59,7 +59,7 @@ func initCollector(logger hclog.Logger, dataDir string, opts ...CollectorOption)
return collector
}
func (h *Collector) loop(ctx context.Context) {
func (c *Collector) loop(ctx context.Context) {
// Start collecting host stats right away and then keep collecting every
// collection interval
next := time.NewTimer(0)
@ -67,9 +67,9 @@ func (h *Collector) loop(ctx context.Context) {
for {
select {
case <-next.C:
h.collect()
c.collect()
next.Reset(hostStatsCollectionInterval)
h.Stats().Emit(h.metrics, h.baseLabels)
c.Stats().Emit(c.metrics, c.baseLabels)
case <-ctx.Done():
return
@ -78,55 +78,55 @@ func (h *Collector) loop(ctx context.Context) {
}
// collect will collect stats related to resource usage of the host
func (h *Collector) collect() {
h.hostStatsLock.Lock()
defer h.hostStatsLock.Unlock()
func (c *Collector) collect() {
hs := &HostStats{Timestamp: time.Now().UTC().UnixNano()}
// Determine up-time
uptime, err := host.Uptime()
if err != nil {
h.logger.Error("failed to collect uptime stats", "error", err)
c.logger.Error("failed to collect uptime stats", "error", err)
uptime = 0
}
hs.Uptime = uptime
// Collect memory stats
mstats, err := h.collectMemoryStats()
mstats, err := c.collectMemoryStats()
if err != nil {
h.logger.Error("failed to collect memory stats", "error", err)
c.logger.Error("failed to collect memory stats", "error", err)
mstats = &MemoryStats{}
}
hs.Memory = mstats
// Collect cpu stats
cpus, err := h.collectCPUStats()
cpus, err := c.collectCPUStats()
if err != nil {
h.logger.Error("failed to collect cpu stats", "error", err)
c.logger.Error("failed to collect cpu stats", "error", err)
cpus = []*CPUStats{}
}
hs.CPU = cpus
// Collect disk stats
diskStats, err := h.collectDiskStats(h.dataDir)
diskStats, err := c.collectDiskStats(c.dataDir)
if err != nil {
h.logger.Error("failed to collect dataDir disk stats", "error", err)
c.logger.Error("failed to collect dataDir disk stats", "error", err)
}
hs.DataDirStats = diskStats
// Update the collected status object.
h.hostStats = hs
c.hostStatsLock.Lock()
c.hostStats = hs
c.hostStatsLock.Unlock()
}
func (h *Collector) collectDiskStats(dir string) (*DiskStats, error) {
func (c *Collector) collectDiskStats(dir string) (*DiskStats, error) {
usage, err := disk.Usage(dir)
if err != nil {
return nil, fmt.Errorf("failed to collect disk usage stats: %w", err)
}
return h.toDiskStats(usage), nil
return c.toDiskStats(usage), nil
}
func (h *Collector) collectMemoryStats() (*MemoryStats, error) {
func (c *Collector) collectMemoryStats() (*MemoryStats, error) {
memStats, err := mem.VirtualMemory()
if err != nil {
return nil, err
@ -143,19 +143,19 @@ func (h *Collector) collectMemoryStats() (*MemoryStats, error) {
}
// Stats returns the host stats that have been collected
func (h *Collector) Stats() *HostStats {
h.hostStatsLock.RLock()
defer h.hostStatsLock.RUnlock()
func (c *Collector) Stats() *HostStats {
c.hostStatsLock.RLock()
defer c.hostStatsLock.RUnlock()
if h.hostStats == nil {
if c.hostStats == nil {
return &HostStats{}
}
return h.hostStats.Clone()
return c.hostStats.Clone()
}
// toDiskStats merges UsageStat and PartitionStat to create a DiskStat
func (h *Collector) toDiskStats(usage *disk.UsageStat) *DiskStats {
func (c *Collector) toDiskStats(usage *disk.UsageStat) *DiskStats {
ds := DiskStats{
Size: usage.Total,
Used: usage.Used,

View File

@ -2,7 +2,6 @@ package hoststats
import (
"math"
"time"
"github.com/shirou/gopsutil/v3/cpu"
)
@ -14,17 +13,28 @@ type cpuStatsCalculator struct {
prevTotal float64
}
// calculate calculates the current cpu usage percentages
// calculate the current cpu usage percentages.
// Since the cpu.TimesStat captures the total time a cpu spent in various states,
// this function tracks the last seen stat and derives each cpu state's utilization
// as a percentage of the total change in cpu time between calls.
// The first time calculate is called, CPUStats will report 100% idle
// usage since there is no previous value to calculate against.
func (h *cpuStatsCalculator) calculate(times cpu.TimesStat) *CPUStats {
// sum all non-idle counters to get the total busy cpu time
currentBusy := times.User + times.System + times.Nice + times.Iowait + times.Irq +
times.Softirq + times.Steal + times.Guest + times.GuestNice
// sum of the total cpu time
currentTotal := currentBusy + times.Idle
// calculate how much cpu time has passed since last calculation
deltaTotal := currentTotal - h.prevTotal
stats := &CPUStats{
CPU: times.CPU,
// calculate each percentage as the ratio of the change
// in each state's time to the total change in cpu time
Idle: ((times.Idle - h.prev.Idle) / deltaTotal) * 100,
User: ((times.User - h.prev.User) / deltaTotal) * 100,
System: ((times.System - h.prev.System) / deltaTotal) * 100,
@ -55,15 +65,7 @@ func (h *cpuStatsCalculator) calculate(times cpu.TimesStat) *CPUStats {
return stats
}
// cpuStats calculates cpu usage percentage
type cpuStats struct {
prevCpuTime float64
prevTime time.Time
totalCpus int
}
func (h *Collector) collectCPUStats() (cpus []*CPUStats, err error) {
func (c *Collector) collectCPUStats() (cpus []*CPUStats, err error) {
cpuStats, err := cpu.Times(true)
if err != nil {
@ -71,10 +73,10 @@ func (h *Collector) collectCPUStats() (cpus []*CPUStats, err error) {
}
cs := make([]*CPUStats, len(cpuStats))
for idx, cpuStat := range cpuStats {
percentCalculator, ok := h.cpuCalculator[cpuStat.CPU]
percentCalculator, ok := c.cpuCalculator[cpuStat.CPU]
if !ok {
percentCalculator = &cpuStatsCalculator{}
h.cpuCalculator[cpuStat.CPU] = percentCalculator
c.cpuCalculator[cpuStat.CPU] = percentCalculator
}
cs[idx] = percentCalculator.calculate(cpuStat)
}

View File

@ -4,6 +4,7 @@ import (
"math"
"os"
"testing"
"time"
"github.com/hashicorp/consul/sdk/testutil"
"github.com/shirou/gopsutil/v3/cpu"
@ -20,10 +21,7 @@ func TestHostStats_CPU(t *testing.T) {
// Collect twice so we can calculate percents; we need to generate some work
// in between so that the cpu values change
hs.collect()
total := 0
for i := 1; i < 1000000000; i++ {
total *= i
total = total % i
for begin := time.Now(); time.Now().Sub(begin) < 100*time.Millisecond; {
}
hs.collect()
stats := hs.Stats()

View File

@ -18,14 +18,9 @@ type HostStats struct {
}
func (hs *HostStats) Clone() *HostStats {
clone := *hs
clone.CPU = make([]*CPUStats, len(hs.CPU))
for i := range hs.CPU {
cpu := *hs.CPU[i]
clone.CPU[i] = &cpu
}
return &clone
clone := &HostStats{}
*clone = *hs
return clone
}
func (hs *HostStats) Emit(sink Metrics, baseLabels []metrics.Label) {

View File

@ -757,7 +757,10 @@ Consul attaches the following labels to metric values.
## Server Host Metrics
Consul servers report the following metrics about the host's system resources
Consul servers report the following metrics about the host's system resources.
Note that if the Consul server is operating inside a container, these metrics
still report host resource usage and do not report any resource limits placed
on the container.
**Requirements:**
- Consul 1.15.3+