From fe5ffa916f93e88d4ca9f9d5d5c30fb2d5f70a01 Mon Sep 17 00:00:00 2001
From: mchtech
Date: Tue, 11 Jun 2024 10:31:42 +0000
Subject: [PATCH] add IRQ PSI metrics

Signed-off-by: mchtech
---
 collector/pressure_linux.go | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/collector/pressure_linux.go b/collector/pressure_linux.go
index f64d004f..17502981 100644
--- a/collector/pressure_linux.go
+++ b/collector/pressure_linux.go
@@ -29,7 +29,7 @@ import (
 )
 
 var (
-	psiResources = []string{"cpu", "io", "memory"}
+	psiResources = []string{"cpu", "io", "memory", "irq"}
 )
 
 type pressureStatsCollector struct {
@@ -38,6 +38,7 @@ type pressureStatsCollector struct {
 	ioFull  *prometheus.Desc
 	mem     *prometheus.Desc
 	memFull *prometheus.Desc
+	irqFull *prometheus.Desc
 
 	fs procfs.FS
 
@@ -81,6 +82,11 @@ func NewPressureStatsCollector(logger log.Logger) (Collector, error) {
 			"Total time in seconds no process could make progress due to memory congestion",
 			nil, nil,
 		),
+		irqFull: prometheus.NewDesc(
+			prometheus.BuildFQName(namespace, "pressure", "irq_stalled_seconds_total"),
+			"Total time in seconds no process could make progress due to IRQ congestion",
+			nil, nil,
+		),
 		fs:     fs,
 		logger: logger,
 	}, nil
@@ -102,7 +108,9 @@ func (c *pressureStatsCollector) Update(ch chan<- prometheus.Metric) error {
 			}
 			return fmt.Errorf("failed to retrieve pressure stats: %w", err)
 		}
-		if vals.Some == nil {
+		// IRQ pressure does not have 'some' data.
+		// See https://github.com/torvalds/linux/blob/v6.9/kernel/sched/psi.c#L1243
+		if vals.Some == nil && res != "irq" {
 			level.Debug(c.logger).Log("msg", "pressure information returned no 'some' data")
 			return ErrNoData
 		}
@@ -119,6 +127,8 @@ func (c *pressureStatsCollector) Update(ch chan<- prometheus.Metric) error {
 		case "memory":
 			ch <- prometheus.MustNewConstMetric(c.mem, prometheus.CounterValue, float64(vals.Some.Total)/1000.0/1000.0)
 			ch <- prometheus.MustNewConstMetric(c.memFull, prometheus.CounterValue, float64(vals.Full.Total)/1000.0/1000.0)
+		case "irq":
+			ch <- prometheus.MustNewConstMetric(c.irqFull, prometheus.CounterValue, float64(vals.Full.Total)/1000.0/1000.0)
 		default:
 			level.Debug(c.logger).Log("msg", "did not account for resource", "resource", res)
 		}