Expose /proc/pressure (#1261)
This enables the collection of pressure stall information as exposed by the `/proc/pressure` interface added in the 4.20 release of the Linux kernel. Closes #1174 Signed-off-by: Daniele Sluijters <daenney@users.noreply.github.com>
pull/1325/head
parent
4e5c4d464f
commit
cc2fd82008
|
@ -29,6 +29,7 @@
|
||||||
* [FEATURE] Add uname collector for FreeBSD #1239
|
* [FEATURE] Add uname collector for FreeBSD #1239
|
||||||
* [FEATURE] Add diskstats collector for OpenBSD #1250
|
* [FEATURE] Add diskstats collector for OpenBSD #1250
|
||||||
* [CHANGE] Bonding state uses mii_status #1124
|
* [CHANGE] Bonding state uses mii_status #1124
|
||||||
|
* [FEATURE] Add pressure collector exposing pressure stall information for Linux #1174
|
||||||
|
|
||||||
## 0.17.0 / 2018-11-30
|
## 0.17.0 / 2018-11-30
|
||||||
|
|
||||||
|
|
|
@ -73,6 +73,7 @@ logind | Exposes session counts from [logind](http://www.freedesktop.org/wiki/So
|
||||||
meminfo\_numa | Exposes memory statistics from `/proc/meminfo_numa`. | Linux
|
meminfo\_numa | Exposes memory statistics from `/proc/meminfo_numa`. | Linux
|
||||||
mountstats | Exposes filesystem statistics from `/proc/self/mountstats`. Exposes detailed NFS client statistics. | Linux
|
mountstats | Exposes filesystem statistics from `/proc/self/mountstats`. Exposes detailed NFS client statistics. | Linux
|
||||||
ntp | Exposes local NTP daemon health to check [time](./docs/TIME.md) | _any_
|
ntp | Exposes local NTP daemon health to check [time](./docs/TIME.md) | _any_
|
||||||
|
pressure | Exposes pressure stall statistics from `/proc/pressure/`. | Linux (kernel 4.20+ and/or [CONFIG\_PSI](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/accounting/psi.txt))
|
||||||
processes | Exposes aggregate process statistics from `/proc`. | Linux
|
processes | Exposes aggregate process statistics from `/proc`. | Linux
|
||||||
qdisc | Exposes [queuing discipline](https://en.wikipedia.org/wiki/Network_scheduler#Linux_kernel) statistics | Linux
|
qdisc | Exposes [queuing discipline](https://en.wikipedia.org/wiki/Network_scheduler#Linux_kernel) statistics | Linux
|
||||||
runit | Exposes service status from [runit](http://smarden.org/runit/). | _any_
|
runit | Exposes service status from [runit](http://smarden.org/runit/). | _any_
|
||||||
|
|
|
@ -2289,6 +2289,21 @@ node_nfsd_server_rpcs_total 18628
|
||||||
# HELP node_nfsd_server_threads Total number of NFSd kernel threads that are running.
|
# HELP node_nfsd_server_threads Total number of NFSd kernel threads that are running.
|
||||||
# TYPE node_nfsd_server_threads gauge
|
# TYPE node_nfsd_server_threads gauge
|
||||||
node_nfsd_server_threads 8
|
node_nfsd_server_threads 8
|
||||||
|
# HELP node_pressure_cpu_waiting_seconds_total Total time in seconds that processes have waited for CPU time
|
||||||
|
# TYPE node_pressure_cpu_waiting_seconds_total counter
|
||||||
|
node_pressure_cpu_waiting_seconds_total 14.036781000000001
|
||||||
|
# HELP node_pressure_io_stalled_seconds_total Total time in seconds no process could make progress due to IO congestion
|
||||||
|
# TYPE node_pressure_io_stalled_seconds_total counter
|
||||||
|
node_pressure_io_stalled_seconds_total 159.229614
|
||||||
|
# HELP node_pressure_io_waiting_seconds_total Total time in seconds that processes have waited due to IO congestion
|
||||||
|
# TYPE node_pressure_io_waiting_seconds_total counter
|
||||||
|
node_pressure_io_waiting_seconds_total 159.886802
|
||||||
|
# HELP node_pressure_memory_stalled_seconds_total Total time in seconds no process could make progress due to memory congestion
|
||||||
|
# TYPE node_pressure_memory_stalled_seconds_total counter
|
||||||
|
node_pressure_memory_stalled_seconds_total 0
|
||||||
|
# HELP node_pressure_memory_waiting_seconds_total Total time in seconds that processes have waited for memory
|
||||||
|
# TYPE node_pressure_memory_waiting_seconds_total counter
|
||||||
|
node_pressure_memory_waiting_seconds_total 0
|
||||||
# HELP node_processes_max_processes Number of max PIDs limit
|
# HELP node_processes_max_processes Number of max PIDs limit
|
||||||
# TYPE node_processes_max_processes gauge
|
# TYPE node_processes_max_processes gauge
|
||||||
node_processes_max_processes 123
|
node_processes_max_processes 123
|
||||||
|
@ -2361,6 +2376,7 @@ node_scrape_collector_success{collector="netdev"} 1
|
||||||
node_scrape_collector_success{collector="netstat"} 1
|
node_scrape_collector_success{collector="netstat"} 1
|
||||||
node_scrape_collector_success{collector="nfs"} 1
|
node_scrape_collector_success{collector="nfs"} 1
|
||||||
node_scrape_collector_success{collector="nfsd"} 1
|
node_scrape_collector_success{collector="nfsd"} 1
|
||||||
|
node_scrape_collector_success{collector="pressure"} 1
|
||||||
node_scrape_collector_success{collector="processes"} 1
|
node_scrape_collector_success{collector="processes"} 1
|
||||||
node_scrape_collector_success{collector="qdisc"} 1
|
node_scrape_collector_success{collector="qdisc"} 1
|
||||||
node_scrape_collector_success{collector="sockstat"} 1
|
node_scrape_collector_success{collector="sockstat"} 1
|
||||||
|
|
|
@ -2289,6 +2289,21 @@ node_nfsd_server_rpcs_total 18628
|
||||||
# HELP node_nfsd_server_threads Total number of NFSd kernel threads that are running.
|
# HELP node_nfsd_server_threads Total number of NFSd kernel threads that are running.
|
||||||
# TYPE node_nfsd_server_threads gauge
|
# TYPE node_nfsd_server_threads gauge
|
||||||
node_nfsd_server_threads 8
|
node_nfsd_server_threads 8
|
||||||
|
# HELP node_pressure_cpu_waiting_seconds_total Total time in seconds that processes have waited for CPU time
|
||||||
|
# TYPE node_pressure_cpu_waiting_seconds_total counter
|
||||||
|
node_pressure_cpu_waiting_seconds_total 14.036781000000001
|
||||||
|
# HELP node_pressure_io_stalled_seconds_total Total time in seconds no process could make progress due to IO congestion
|
||||||
|
# TYPE node_pressure_io_stalled_seconds_total counter
|
||||||
|
node_pressure_io_stalled_seconds_total 159.229614
|
||||||
|
# HELP node_pressure_io_waiting_seconds_total Total time in seconds that processes have waited due to IO congestion
|
||||||
|
# TYPE node_pressure_io_waiting_seconds_total counter
|
||||||
|
node_pressure_io_waiting_seconds_total 159.886802
|
||||||
|
# HELP node_pressure_memory_stalled_seconds_total Total time in seconds no process could make progress due to memory congestion
|
||||||
|
# TYPE node_pressure_memory_stalled_seconds_total counter
|
||||||
|
node_pressure_memory_stalled_seconds_total 0
|
||||||
|
# HELP node_pressure_memory_waiting_seconds_total Total time in seconds that processes have waited for memory
|
||||||
|
# TYPE node_pressure_memory_waiting_seconds_total counter
|
||||||
|
node_pressure_memory_waiting_seconds_total 0
|
||||||
# HELP node_processes_max_processes Number of max PIDs limit
|
# HELP node_processes_max_processes Number of max PIDs limit
|
||||||
# TYPE node_processes_max_processes gauge
|
# TYPE node_processes_max_processes gauge
|
||||||
node_processes_max_processes 123
|
node_processes_max_processes 123
|
||||||
|
@ -2361,6 +2376,7 @@ node_scrape_collector_success{collector="netdev"} 1
|
||||||
node_scrape_collector_success{collector="netstat"} 1
|
node_scrape_collector_success{collector="netstat"} 1
|
||||||
node_scrape_collector_success{collector="nfs"} 1
|
node_scrape_collector_success{collector="nfs"} 1
|
||||||
node_scrape_collector_success{collector="nfsd"} 1
|
node_scrape_collector_success{collector="nfsd"} 1
|
||||||
|
node_scrape_collector_success{collector="pressure"} 1
|
||||||
node_scrape_collector_success{collector="processes"} 1
|
node_scrape_collector_success{collector="processes"} 1
|
||||||
node_scrape_collector_success{collector="qdisc"} 1
|
node_scrape_collector_success{collector="qdisc"} 1
|
||||||
node_scrape_collector_success{collector="sockstat"} 1
|
node_scrape_collector_success{collector="sockstat"} 1
|
||||||
|
|
|
@ -0,0 +1 @@
|
||||||
|
some avg10=0.00 avg60=0.00 avg300=0.00 total=14036781
|
|
@ -0,0 +1,2 @@
|
||||||
|
some avg10=0.18 avg60=0.34 avg300=0.10 total=159886802
|
||||||
|
full avg10=0.18 avg60=0.34 avg300=0.10 total=159229614
|
|
@ -0,0 +1,2 @@
|
||||||
|
some avg10=0.00 avg60=0.00 avg300=0.00 total=0
|
||||||
|
full avg10=0.00 avg60=0.00 avg300=0.00 total=0
|
|
@ -0,0 +1,105 @@
|
||||||
|
// Copyright 2019 The Prometheus Authors
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
// +build !nopressure
|
||||||
|
|
||||||
|
package collector
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
|
||||||
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
|
"github.com/prometheus/common/log"
|
||||||
|
"github.com/prometheus/procfs"
|
||||||
|
)
|
||||||
|
|
||||||
|
var (
	// psiResources lists the /proc/pressure/<resource> files this
	// collector reads; each entry is passed to NewPSIStatsForResource.
	psiResources = []string{"cpu", "io", "memory"}
)
|
||||||
|
|
||||||
|
// pressureStatsCollector collects pressure stall information (PSI) from
// /proc/pressure/{cpu,io,memory} and exposes it as counters.
type pressureStatsCollector struct {
	cpu     *prometheus.Desc // "some" waiting total for CPU
	io      *prometheus.Desc // "some" waiting total for IO
	ioFull  *prometheus.Desc // "full" stalled total for IO
	mem     *prometheus.Desc // "some" waiting total for memory
	memFull *prometheus.Desc // "full" stalled total for memory

	// fs is the procfs mount used to read the pressure files.
	fs procfs.FS
}
|
||||||
|
|
||||||
|
// init registers this collector under the name "pressure", enabled by default.
func init() {
	registerCollector("pressure", defaultEnabled, NewPressureStatsCollector)
}
|
||||||
|
|
||||||
|
// NewPressureStatsCollector returns a Collector exposing pressure stall information
|
||||||
|
func NewPressureStatsCollector() (Collector, error) {
|
||||||
|
fs, err := procfs.NewFS(*procPath)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to open procfs: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return &pressureStatsCollector{
|
||||||
|
cpu: prometheus.NewDesc(
|
||||||
|
prometheus.BuildFQName(namespace, "pressure", "cpu_waiting_seconds_total"),
|
||||||
|
"Total time in seconds that processes have waited for CPU time",
|
||||||
|
nil, nil,
|
||||||
|
),
|
||||||
|
io: prometheus.NewDesc(
|
||||||
|
prometheus.BuildFQName(namespace, "pressure", "io_waiting_seconds_total"),
|
||||||
|
"Total time in seconds that processes have waited due to IO congestion",
|
||||||
|
nil, nil,
|
||||||
|
),
|
||||||
|
ioFull: prometheus.NewDesc(
|
||||||
|
prometheus.BuildFQName(namespace, "pressure", "io_stalled_seconds_total"),
|
||||||
|
"Total time in seconds no process could make progress due to IO congestion",
|
||||||
|
nil, nil,
|
||||||
|
),
|
||||||
|
mem: prometheus.NewDesc(
|
||||||
|
prometheus.BuildFQName(namespace, "pressure", "memory_waiting_seconds_total"),
|
||||||
|
"Total time in seconds that processes have waited for memory",
|
||||||
|
nil, nil,
|
||||||
|
),
|
||||||
|
memFull: prometheus.NewDesc(
|
||||||
|
prometheus.BuildFQName(namespace, "pressure", "memory_stalled_seconds_total"),
|
||||||
|
"Total time in seconds no process could make progress due to memory congestion",
|
||||||
|
nil, nil,
|
||||||
|
),
|
||||||
|
fs: fs,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update calls procfs.NewPSIStatsForResource for the different resources and updates the values
|
||||||
|
func (c *pressureStatsCollector) Update(ch chan<- prometheus.Metric) error {
|
||||||
|
for _, res := range psiResources {
|
||||||
|
log.Debugf("collecting statistics for resource: %s", res)
|
||||||
|
vals, err := c.fs.NewPSIStatsForResource(res)
|
||||||
|
if err != nil {
|
||||||
|
log.Debug("pressure information is unavailable, you need a Linux kernel >= 4.20 and/or CONFIG_PSI enabled for your kernel")
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
switch res {
|
||||||
|
case "cpu":
|
||||||
|
ch <- prometheus.MustNewConstMetric(c.cpu, prometheus.CounterValue, float64(vals.Some.Total)/1000.0/1000.0)
|
||||||
|
case "io":
|
||||||
|
ch <- prometheus.MustNewConstMetric(c.io, prometheus.CounterValue, float64(vals.Some.Total)/1000.0/1000.0)
|
||||||
|
ch <- prometheus.MustNewConstMetric(c.ioFull, prometheus.CounterValue, float64(vals.Full.Total)/1000.0/1000.0)
|
||||||
|
case "memory":
|
||||||
|
ch <- prometheus.MustNewConstMetric(c.mem, prometheus.CounterValue, float64(vals.Some.Total)/1000.0/1000.0)
|
||||||
|
ch <- prometheus.MustNewConstMetric(c.memFull, prometheus.CounterValue, float64(vals.Full.Total)/1000.0/1000.0)
|
||||||
|
default:
|
||||||
|
log.Debugf("did not account for resource: %s", res)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
|
@ -28,6 +28,7 @@ enabled_collectors=$(cat << COLLECTORS
|
||||||
netstat
|
netstat
|
||||||
nfs
|
nfs
|
||||||
nfsd
|
nfsd
|
||||||
|
pressure
|
||||||
qdisc
|
qdisc
|
||||||
sockstat
|
sockstat
|
||||||
stat
|
stat
|
||||||
|
|
Loading…
Reference in New Issue