From cc2fd82008d51b0f40abe69f8993011a533de2f3 Mon Sep 17 00:00:00 2001 From: Daniele Sluijters Date: Thu, 18 Apr 2019 11:19:20 +0100 Subject: [PATCH] Expose /proc/pressure (#1261) This enables the collection of pressure stall information as exposed by the `/proc/pressure` interface added in the 4.20 release of the Linux kernel. Closes #1174 Signed-off-by: Daniele Sluijters --- CHANGELOG.md | 1 + README.md | 1 + collector/fixtures/e2e-64k-page-output.txt | 16 ++++ collector/fixtures/e2e-output.txt | 16 ++++ collector/fixtures/proc/pressure/cpu | 1 + collector/fixtures/proc/pressure/io | 2 + collector/fixtures/proc/pressure/memory | 2 + collector/pressure_linux.go | 105 +++++++++++++++++++++ end-to-end-test.sh | 1 + 9 files changed, 145 insertions(+) create mode 100644 collector/fixtures/proc/pressure/cpu create mode 100644 collector/fixtures/proc/pressure/io create mode 100644 collector/fixtures/proc/pressure/memory create mode 100644 collector/pressure_linux.go diff --git a/CHANGELOG.md b/CHANGELOG.md index 154b4b75..d86dbcfc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,7 @@ * [FEATURE] Add uname collector for FreeBSD #1239 * [FEATURE] Add diskstats collector for OpenBSD #1250 * [CHANGE] Bonding state uses mii_status #1124 +* [FEATURE] Add pressure collector exposing pressure stall information for Linux #1174 ## 0.17.0 / 2018-11-30 diff --git a/README.md b/README.md index 65898d0a..4d97efce 100644 --- a/README.md +++ b/README.md @@ -73,6 +73,7 @@ logind | Exposes session counts from [logind](http://www.freedesktop.org/wiki/So meminfo\_numa | Exposes memory statistics from `/proc/meminfo_numa`. | Linux mountstats | Exposes filesystem statistics from `/proc/self/mountstats`. Exposes detailed NFS client statistics. | Linux ntp | Exposes local NTP daemon health to check [time](./docs/TIME.md) | _any_ +pressure | Exposes pressure stall statistics from `/proc/pressure/`. 
| Linux (kernel 4.20+ and/or [CONFIG\_PSI](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/accounting/psi.txt)) processes | Exposes aggregate process statistics from `/proc`. | Linux qdisc | Exposes [queuing discipline](https://en.wikipedia.org/wiki/Network_scheduler#Linux_kernel) statistics | Linux runit | Exposes service status from [runit](http://smarden.org/runit/). | _any_ diff --git a/collector/fixtures/e2e-64k-page-output.txt b/collector/fixtures/e2e-64k-page-output.txt index ea597310..47e8f9ad 100644 --- a/collector/fixtures/e2e-64k-page-output.txt +++ b/collector/fixtures/e2e-64k-page-output.txt @@ -2289,6 +2289,21 @@ node_nfsd_server_rpcs_total 18628 # HELP node_nfsd_server_threads Total number of NFSd kernel threads that are running. # TYPE node_nfsd_server_threads gauge node_nfsd_server_threads 8 +# HELP node_pressure_cpu_waiting_seconds_total Total time in seconds that processes have waited for CPU time +# TYPE node_pressure_cpu_waiting_seconds_total counter +node_pressure_cpu_waiting_seconds_total 14.036781000000001 +# HELP node_pressure_io_stalled_seconds_total Total time in seconds no process could make progress due to IO congestion +# TYPE node_pressure_io_stalled_seconds_total counter +node_pressure_io_stalled_seconds_total 159.229614 +# HELP node_pressure_io_waiting_seconds_total Total time in seconds that processes have waited due to IO congestion +# TYPE node_pressure_io_waiting_seconds_total counter +node_pressure_io_waiting_seconds_total 159.886802 +# HELP node_pressure_memory_stalled_seconds_total Total time in seconds no process could make progress due to memory congestion +# TYPE node_pressure_memory_stalled_seconds_total counter +node_pressure_memory_stalled_seconds_total 0 +# HELP node_pressure_memory_waiting_seconds_total Total time in seconds that processes have waited for memory +# TYPE node_pressure_memory_waiting_seconds_total counter +node_pressure_memory_waiting_seconds_total 0 # HELP 
node_processes_max_processes Number of max PIDs limit # TYPE node_processes_max_processes gauge node_processes_max_processes 123 @@ -2361,6 +2376,7 @@ node_scrape_collector_success{collector="netdev"} 1 node_scrape_collector_success{collector="netstat"} 1 node_scrape_collector_success{collector="nfs"} 1 node_scrape_collector_success{collector="nfsd"} 1 +node_scrape_collector_success{collector="pressure"} 1 node_scrape_collector_success{collector="processes"} 1 node_scrape_collector_success{collector="qdisc"} 1 node_scrape_collector_success{collector="sockstat"} 1 diff --git a/collector/fixtures/e2e-output.txt b/collector/fixtures/e2e-output.txt index f4c6d56e..147113e4 100644 --- a/collector/fixtures/e2e-output.txt +++ b/collector/fixtures/e2e-output.txt @@ -2289,6 +2289,21 @@ node_nfsd_server_rpcs_total 18628 # HELP node_nfsd_server_threads Total number of NFSd kernel threads that are running. # TYPE node_nfsd_server_threads gauge node_nfsd_server_threads 8 +# HELP node_pressure_cpu_waiting_seconds_total Total time in seconds that processes have waited for CPU time +# TYPE node_pressure_cpu_waiting_seconds_total counter +node_pressure_cpu_waiting_seconds_total 14.036781000000001 +# HELP node_pressure_io_stalled_seconds_total Total time in seconds no process could make progress due to IO congestion +# TYPE node_pressure_io_stalled_seconds_total counter +node_pressure_io_stalled_seconds_total 159.229614 +# HELP node_pressure_io_waiting_seconds_total Total time in seconds that processes have waited due to IO congestion +# TYPE node_pressure_io_waiting_seconds_total counter +node_pressure_io_waiting_seconds_total 159.886802 +# HELP node_pressure_memory_stalled_seconds_total Total time in seconds no process could make progress due to memory congestion +# TYPE node_pressure_memory_stalled_seconds_total counter +node_pressure_memory_stalled_seconds_total 0 +# HELP node_pressure_memory_waiting_seconds_total Total time in seconds that processes have waited for memory +# 
TYPE node_pressure_memory_waiting_seconds_total counter +node_pressure_memory_waiting_seconds_total 0 # HELP node_processes_max_processes Number of max PIDs limit # TYPE node_processes_max_processes gauge node_processes_max_processes 123 @@ -2361,6 +2376,7 @@ node_scrape_collector_success{collector="netdev"} 1 node_scrape_collector_success{collector="netstat"} 1 node_scrape_collector_success{collector="nfs"} 1 node_scrape_collector_success{collector="nfsd"} 1 +node_scrape_collector_success{collector="pressure"} 1 node_scrape_collector_success{collector="processes"} 1 node_scrape_collector_success{collector="qdisc"} 1 node_scrape_collector_success{collector="sockstat"} 1 diff --git a/collector/fixtures/proc/pressure/cpu b/collector/fixtures/proc/pressure/cpu new file mode 100644 index 00000000..14acc3a3 --- /dev/null +++ b/collector/fixtures/proc/pressure/cpu @@ -0,0 +1 @@ +some avg10=0.00 avg60=0.00 avg300=0.00 total=14036781 diff --git a/collector/fixtures/proc/pressure/io b/collector/fixtures/proc/pressure/io new file mode 100644 index 00000000..4cdc4135 --- /dev/null +++ b/collector/fixtures/proc/pressure/io @@ -0,0 +1,2 @@ +some avg10=0.18 avg60=0.34 avg300=0.10 total=159886802 +full avg10=0.18 avg60=0.34 avg300=0.10 total=159229614 diff --git a/collector/fixtures/proc/pressure/memory b/collector/fixtures/proc/pressure/memory new file mode 100644 index 00000000..30c03cc4 --- /dev/null +++ b/collector/fixtures/proc/pressure/memory @@ -0,0 +1,2 @@ +some avg10=0.00 avg60=0.00 avg300=0.00 total=0 +full avg10=0.00 avg60=0.00 avg300=0.00 total=0 diff --git a/collector/pressure_linux.go b/collector/pressure_linux.go new file mode 100644 index 00000000..90b20f88 --- /dev/null +++ b/collector/pressure_linux.go @@ -0,0 +1,105 @@ +// Copyright 2019 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build !nopressure + +package collector + +import ( + "fmt" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/common/log" + "github.com/prometheus/procfs" +) + +var ( + psiResources = []string{"cpu", "io", "memory"} +) + +type pressureStatsCollector struct { + cpu *prometheus.Desc + io *prometheus.Desc + ioFull *prometheus.Desc + mem *prometheus.Desc + memFull *prometheus.Desc + + fs procfs.FS +} + +func init() { + registerCollector("pressure", defaultEnabled, NewPressureStatsCollector) +} + +// NewPressureStatsCollector returns a Collector exposing pressure stall information (PSI) read through a procfs.FS opened at *procPath; it returns an error if that procfs mount cannot be opened. +func NewPressureStatsCollector() (Collector, error) { + fs, err := procfs.NewFS(*procPath) + if err != nil { + return nil, fmt.Errorf("failed to open procfs: %v", err) + } + + return &pressureStatsCollector{ + cpu: prometheus.NewDesc( + prometheus.BuildFQName(namespace, "pressure", "cpu_waiting_seconds_total"), + "Total time in seconds that processes have waited for CPU time", + nil, nil, + ), + io: prometheus.NewDesc( + prometheus.BuildFQName(namespace, "pressure", "io_waiting_seconds_total"), + "Total time in seconds that processes have waited due to IO congestion", + nil, nil, + ), + ioFull: prometheus.NewDesc( + prometheus.BuildFQName(namespace, "pressure", "io_stalled_seconds_total"), + "Total time in seconds no process could make progress due to IO congestion", + nil, nil, + ), + mem: prometheus.NewDesc( + prometheus.BuildFQName(namespace, "pressure", "memory_waiting_seconds_total"), + "Total time in seconds that processes have waited 
for memory", + nil, nil, + ), + memFull: prometheus.NewDesc( + prometheus.BuildFQName(namespace, "pressure", "memory_stalled_seconds_total"), + "Total time in seconds no process could make progress due to memory congestion", + nil, nil, + ), + fs: fs, + }, nil +} + +// Update calls procfs.NewPSIStatsForResource for each entry in psiResources and sends the Some (and, for io and memory, Full) totals on ch as counters, dividing each total by 1,000,000 to match the *_seconds_total metric names. On the first read error it logs at debug level and returns nil, so the remaining resources are skipped and no scrape error is reported. +func (c *pressureStatsCollector) Update(ch chan<- prometheus.Metric) error { + for _, res := range psiResources { + log.Debugf("collecting statistics for resource: %s", res) + vals, err := c.fs.NewPSIStatsForResource(res) + if err != nil { + log.Debug("pressure information is unavailable, you need a Linux kernel >= 4.20 and/or CONFIG_PSI enabled for your kernel") + return nil + } + switch res { + case "cpu": + ch <- prometheus.MustNewConstMetric(c.cpu, prometheus.CounterValue, float64(vals.Some.Total)/1000.0/1000.0) + case "io": + ch <- prometheus.MustNewConstMetric(c.io, prometheus.CounterValue, float64(vals.Some.Total)/1000.0/1000.0) + ch <- prometheus.MustNewConstMetric(c.ioFull, prometheus.CounterValue, float64(vals.Full.Total)/1000.0/1000.0) + case "memory": + ch <- prometheus.MustNewConstMetric(c.mem, prometheus.CounterValue, float64(vals.Some.Total)/1000.0/1000.0) + ch <- prometheus.MustNewConstMetric(c.memFull, prometheus.CounterValue, float64(vals.Full.Total)/1000.0/1000.0) + default: + log.Debugf("did not account for resource: %s", res) + } + } + + return nil +} diff --git a/end-to-end-test.sh b/end-to-end-test.sh index ea24cf51..6d3c9f44 100755 --- a/end-to-end-test.sh +++ b/end-to-end-test.sh @@ -28,6 +28,7 @@ enabled_collectors=$(cat << COLLECTORS netstat nfs nfsd + pressure qdisc sockstat stat