mirror of https://github.com/hashicorp/consul
pr review fixes and documentation
parent
ac6071721c
commit
2a3dd5066a
|
@ -1,3 +1,3 @@
|
||||||
```release-note:improvement
|
```release-note:improvement
|
||||||
agent: add new metrics to track cpu disk and memory usage for server hosts
|
agent: add new metrics to track cpu, disk, and memory usage for server hosts (defaults to: enabled)
|
||||||
```
|
```
|
||||||
|
|
|
@ -13,7 +13,6 @@ import (
|
||||||
|
|
||||||
"github.com/armon/go-metrics"
|
"github.com/armon/go-metrics"
|
||||||
"github.com/armon/go-metrics/prometheus"
|
"github.com/armon/go-metrics/prometheus"
|
||||||
"github.com/hashicorp/consul/lib/hoststats"
|
|
||||||
"github.com/hashicorp/go-hclog"
|
"github.com/hashicorp/go-hclog"
|
||||||
wal "github.com/hashicorp/raft-wal"
|
wal "github.com/hashicorp/raft-wal"
|
||||||
"github.com/hashicorp/raft-wal/verifier"
|
"github.com/hashicorp/raft-wal/verifier"
|
||||||
|
@ -43,6 +42,7 @@ import (
|
||||||
"github.com/hashicorp/consul/agent/xds"
|
"github.com/hashicorp/consul/agent/xds"
|
||||||
"github.com/hashicorp/consul/ipaddr"
|
"github.com/hashicorp/consul/ipaddr"
|
||||||
"github.com/hashicorp/consul/lib"
|
"github.com/hashicorp/consul/lib"
|
||||||
|
"github.com/hashicorp/consul/lib/hoststats"
|
||||||
"github.com/hashicorp/consul/logging"
|
"github.com/hashicorp/consul/logging"
|
||||||
"github.com/hashicorp/consul/tlsutil"
|
"github.com/hashicorp/consul/tlsutil"
|
||||||
)
|
)
|
||||||
|
@ -61,7 +61,7 @@ type BaseDeps struct {
|
||||||
WatchedFiles []string
|
WatchedFiles []string
|
||||||
|
|
||||||
deregisterBalancer, deregisterResolver func()
|
deregisterBalancer, deregisterResolver func()
|
||||||
stopHostCollector func()
|
stopHostCollector context.CancelFunc
|
||||||
}
|
}
|
||||||
|
|
||||||
type ConfigLoader func(source config.Source) (config.LoadResult, error)
|
type ConfigLoader func(source config.Source) (config.LoadResult, error)
|
||||||
|
|
|
@ -59,7 +59,7 @@ func initCollector(logger hclog.Logger, dataDir string, opts ...CollectorOption)
|
||||||
return collector
|
return collector
|
||||||
}
|
}
|
||||||
|
|
||||||
func (h *Collector) loop(ctx context.Context) {
|
func (c *Collector) loop(ctx context.Context) {
|
||||||
// Start collecting host stats right away and then keep collecting every
|
// Start collecting host stats right away and then keep collecting every
|
||||||
// collection interval
|
// collection interval
|
||||||
next := time.NewTimer(0)
|
next := time.NewTimer(0)
|
||||||
|
@ -67,9 +67,9 @@ func (h *Collector) loop(ctx context.Context) {
|
||||||
for {
|
for {
|
||||||
select {
|
select {
|
||||||
case <-next.C:
|
case <-next.C:
|
||||||
h.collect()
|
c.collect()
|
||||||
next.Reset(hostStatsCollectionInterval)
|
next.Reset(hostStatsCollectionInterval)
|
||||||
h.Stats().Emit(h.metrics, h.baseLabels)
|
c.Stats().Emit(c.metrics, c.baseLabels)
|
||||||
|
|
||||||
case <-ctx.Done():
|
case <-ctx.Done():
|
||||||
return
|
return
|
||||||
|
@ -78,55 +78,55 @@ func (h *Collector) loop(ctx context.Context) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// collect will collect stats related to resource usage of the host
|
// collect will collect stats related to resource usage of the host
|
||||||
func (h *Collector) collect() {
|
func (c *Collector) collect() {
|
||||||
h.hostStatsLock.Lock()
|
|
||||||
defer h.hostStatsLock.Unlock()
|
|
||||||
hs := &HostStats{Timestamp: time.Now().UTC().UnixNano()}
|
hs := &HostStats{Timestamp: time.Now().UTC().UnixNano()}
|
||||||
|
|
||||||
// Determine up-time
|
// Determine up-time
|
||||||
uptime, err := host.Uptime()
|
uptime, err := host.Uptime()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
h.logger.Error("failed to collect uptime stats", "error", err)
|
c.logger.Error("failed to collect uptime stats", "error", err)
|
||||||
uptime = 0
|
uptime = 0
|
||||||
}
|
}
|
||||||
hs.Uptime = uptime
|
hs.Uptime = uptime
|
||||||
|
|
||||||
// Collect memory stats
|
// Collect memory stats
|
||||||
mstats, err := h.collectMemoryStats()
|
mstats, err := c.collectMemoryStats()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
h.logger.Error("failed to collect memory stats", "error", err)
|
c.logger.Error("failed to collect memory stats", "error", err)
|
||||||
mstats = &MemoryStats{}
|
mstats = &MemoryStats{}
|
||||||
}
|
}
|
||||||
hs.Memory = mstats
|
hs.Memory = mstats
|
||||||
|
|
||||||
// Collect cpu stats
|
// Collect cpu stats
|
||||||
cpus, err := h.collectCPUStats()
|
cpus, err := c.collectCPUStats()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
h.logger.Error("failed to collect cpu stats", "error", err)
|
c.logger.Error("failed to collect cpu stats", "error", err)
|
||||||
cpus = []*CPUStats{}
|
cpus = []*CPUStats{}
|
||||||
}
|
}
|
||||||
hs.CPU = cpus
|
hs.CPU = cpus
|
||||||
|
|
||||||
// Collect disk stats
|
// Collect disk stats
|
||||||
diskStats, err := h.collectDiskStats(h.dataDir)
|
diskStats, err := c.collectDiskStats(c.dataDir)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
h.logger.Error("failed to collect dataDir disk stats", "error", err)
|
c.logger.Error("failed to collect dataDir disk stats", "error", err)
|
||||||
}
|
}
|
||||||
hs.DataDirStats = diskStats
|
hs.DataDirStats = diskStats
|
||||||
|
|
||||||
// Update the collected status object.
|
// Update the collected status object.
|
||||||
h.hostStats = hs
|
c.hostStatsLock.Lock()
|
||||||
|
c.hostStats = hs
|
||||||
|
c.hostStatsLock.Unlock()
|
||||||
}
|
}
|
||||||
|
|
||||||
func (h *Collector) collectDiskStats(dir string) (*DiskStats, error) {
|
func (c *Collector) collectDiskStats(dir string) (*DiskStats, error) {
|
||||||
usage, err := disk.Usage(dir)
|
usage, err := disk.Usage(dir)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("failed to collect disk usage stats: %w", err)
|
return nil, fmt.Errorf("failed to collect disk usage stats: %w", err)
|
||||||
}
|
}
|
||||||
return h.toDiskStats(usage), nil
|
return c.toDiskStats(usage), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (h *Collector) collectMemoryStats() (*MemoryStats, error) {
|
func (c *Collector) collectMemoryStats() (*MemoryStats, error) {
|
||||||
memStats, err := mem.VirtualMemory()
|
memStats, err := mem.VirtualMemory()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
|
@ -143,19 +143,19 @@ func (h *Collector) collectMemoryStats() (*MemoryStats, error) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Stats returns the host stats that have been collected
|
// Stats returns the host stats that have been collected
|
||||||
func (h *Collector) Stats() *HostStats {
|
func (c *Collector) Stats() *HostStats {
|
||||||
h.hostStatsLock.RLock()
|
c.hostStatsLock.RLock()
|
||||||
defer h.hostStatsLock.RUnlock()
|
defer c.hostStatsLock.RUnlock()
|
||||||
|
|
||||||
if h.hostStats == nil {
|
if c.hostStats == nil {
|
||||||
return &HostStats{}
|
return &HostStats{}
|
||||||
}
|
}
|
||||||
|
|
||||||
return h.hostStats.Clone()
|
return c.hostStats.Clone()
|
||||||
}
|
}
|
||||||
|
|
||||||
// toDiskStats merges UsageStat and PartitionStat to create a DiskStat
|
// toDiskStats merges UsageStat and PartitionStat to create a DiskStat
|
||||||
func (h *Collector) toDiskStats(usage *disk.UsageStat) *DiskStats {
|
func (c *Collector) toDiskStats(usage *disk.UsageStat) *DiskStats {
|
||||||
ds := DiskStats{
|
ds := DiskStats{
|
||||||
Size: usage.Total,
|
Size: usage.Total,
|
||||||
Used: usage.Used,
|
Used: usage.Used,
|
||||||
|
|
|
@ -2,7 +2,6 @@ package hoststats
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"math"
|
"math"
|
||||||
"time"
|
|
||||||
|
|
||||||
"github.com/shirou/gopsutil/v3/cpu"
|
"github.com/shirou/gopsutil/v3/cpu"
|
||||||
)
|
)
|
||||||
|
@ -14,17 +13,28 @@ type cpuStatsCalculator struct {
|
||||||
prevTotal float64
|
prevTotal float64
|
||||||
}
|
}
|
||||||
|
|
||||||
// calculate calculates the current cpu usage percentages
|
// calculate the current cpu usage percentages.
|
||||||
|
// Since the cpu.TimesStat captures the total time a cpu spent in various states,
|
||||||
|
// this function tracks the last seen stat and derives each cpu state's utilization
|
||||||
|
// as a percentage of the total change in cpu time between calls.
|
||||||
|
// The first time calculate is called, CPUStats will report 100% idle
|
||||||
|
// usage since there is not a previous value to calculate against
|
||||||
func (h *cpuStatsCalculator) calculate(times cpu.TimesStat) *CPUStats {
|
func (h *cpuStatsCalculator) calculate(times cpu.TimesStat) *CPUStats {
|
||||||
|
|
||||||
|
// sum all non-idle counters to get the total busy cpu time
|
||||||
currentBusy := times.User + times.System + times.Nice + times.Iowait + times.Irq +
|
currentBusy := times.User + times.System + times.Nice + times.Iowait + times.Irq +
|
||||||
times.Softirq + times.Steal + times.Guest + times.GuestNice
|
times.Softirq + times.Steal + times.Guest + times.GuestNice
|
||||||
|
// sum of the total cpu time
|
||||||
currentTotal := currentBusy + times.Idle
|
currentTotal := currentBusy + times.Idle
|
||||||
|
|
||||||
|
// calculate how much cpu time has passed since last calculation
|
||||||
deltaTotal := currentTotal - h.prevTotal
|
deltaTotal := currentTotal - h.prevTotal
|
||||||
|
|
||||||
stats := &CPUStats{
|
stats := &CPUStats{
|
||||||
CPU: times.CPU,
|
CPU: times.CPU,
|
||||||
|
|
||||||
|
// calculate each percentage as the ratio of the change
|
||||||
|
// in each state's time to the total change in cpu time
|
||||||
Idle: ((times.Idle - h.prev.Idle) / deltaTotal) * 100,
|
Idle: ((times.Idle - h.prev.Idle) / deltaTotal) * 100,
|
||||||
User: ((times.User - h.prev.User) / deltaTotal) * 100,
|
User: ((times.User - h.prev.User) / deltaTotal) * 100,
|
||||||
System: ((times.System - h.prev.System) / deltaTotal) * 100,
|
System: ((times.System - h.prev.System) / deltaTotal) * 100,
|
||||||
|
@ -55,15 +65,7 @@ func (h *cpuStatsCalculator) calculate(times cpu.TimesStat) *CPUStats {
|
||||||
return stats
|
return stats
|
||||||
}
|
}
|
||||||
|
|
||||||
// cpuStats calculates cpu usage percentage
|
func (c *Collector) collectCPUStats() (cpus []*CPUStats, err error) {
|
||||||
type cpuStats struct {
|
|
||||||
prevCpuTime float64
|
|
||||||
prevTime time.Time
|
|
||||||
|
|
||||||
totalCpus int
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *Collector) collectCPUStats() (cpus []*CPUStats, err error) {
|
|
||||||
|
|
||||||
cpuStats, err := cpu.Times(true)
|
cpuStats, err := cpu.Times(true)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -71,10 +73,10 @@ func (h *Collector) collectCPUStats() (cpus []*CPUStats, err error) {
|
||||||
}
|
}
|
||||||
cs := make([]*CPUStats, len(cpuStats))
|
cs := make([]*CPUStats, len(cpuStats))
|
||||||
for idx, cpuStat := range cpuStats {
|
for idx, cpuStat := range cpuStats {
|
||||||
percentCalculator, ok := h.cpuCalculator[cpuStat.CPU]
|
percentCalculator, ok := c.cpuCalculator[cpuStat.CPU]
|
||||||
if !ok {
|
if !ok {
|
||||||
percentCalculator = &cpuStatsCalculator{}
|
percentCalculator = &cpuStatsCalculator{}
|
||||||
h.cpuCalculator[cpuStat.CPU] = percentCalculator
|
c.cpuCalculator[cpuStat.CPU] = percentCalculator
|
||||||
}
|
}
|
||||||
cs[idx] = percentCalculator.calculate(cpuStat)
|
cs[idx] = percentCalculator.calculate(cpuStat)
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,6 +4,7 @@ import (
|
||||||
"math"
|
"math"
|
||||||
"os"
|
"os"
|
||||||
"testing"
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
"github.com/hashicorp/consul/sdk/testutil"
|
"github.com/hashicorp/consul/sdk/testutil"
|
||||||
"github.com/shirou/gopsutil/v3/cpu"
|
"github.com/shirou/gopsutil/v3/cpu"
|
||||||
|
@ -20,10 +21,7 @@ func TestHostStats_CPU(t *testing.T) {
|
||||||
// Collect twice so we can calculate percents; we need to generate some work
|
// Collect twice so we can calculate percents; we need to generate some work
|
||||||
// so that the cpu values change
|
// so that the cpu values change
|
||||||
hs.collect()
|
hs.collect()
|
||||||
total := 0
|
for begin := time.Now(); time.Now().Sub(begin) < 100*time.Millisecond; {
|
||||||
for i := 1; i < 1000000000; i++ {
|
|
||||||
total *= i
|
|
||||||
total = total % i
|
|
||||||
}
|
}
|
||||||
hs.collect()
|
hs.collect()
|
||||||
stats := hs.Stats()
|
stats := hs.Stats()
|
||||||
|
|
|
@ -18,14 +18,9 @@ type HostStats struct {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (hs *HostStats) Clone() *HostStats {
|
func (hs *HostStats) Clone() *HostStats {
|
||||||
clone := *hs
|
clone := &HostStats{}
|
||||||
|
*clone = *hs
|
||||||
clone.CPU = make([]*CPUStats, len(hs.CPU))
|
return clone
|
||||||
for i := range hs.CPU {
|
|
||||||
cpu := *hs.CPU[i]
|
|
||||||
clone.CPU[i] = &cpu
|
|
||||||
}
|
|
||||||
return &clone
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (hs *HostStats) Emit(sink Metrics, baseLabels []metrics.Label) {
|
func (hs *HostStats) Emit(sink Metrics, baseLabels []metrics.Label) {
|
||||||
|
|
|
@ -757,7 +757,10 @@ Consul attaches the following labels to metric values.
|
||||||
|
|
||||||
## Server Host Metrics
|
## Server Host Metrics
|
||||||
|
|
||||||
Consul servers report the following metrics about the host's system resources
|
Consul servers report the following metrics about the host's system resources.
|
||||||
|
Note that if the Consul server is operating inside a container, these metrics
|
||||||
|
still report host resource usage and do not report any resource limits placed
|
||||||
|
on the container.
|
||||||
|
|
||||||
**Requirements:**
|
**Requirements:**
|
||||||
- Consul 1.15.3+
|
- Consul 1.15.3+
|
||||||
|
|
Loading…
Reference in New Issue