|
|
|
// Copyright 2019 The Prometheus Authors
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
// you may not use this file except in compliance with the License.
|
|
|
|
// You may obtain a copy of the License at
|
|
|
|
//
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
//
|
|
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
// See the License for the specific language governing permissions and
|
|
|
|
// limitations under the License.
|
|
|
|
|
|
|
|
//go:build !nopressure
|
|
|
|
// +build !nopressure
|
|
|
|
|
|
|
|
package collector
|
|
|
|
|
|
|
|
import (
|
|
|
|
"errors"
|
|
|
|
"fmt"
|
|
|
|
"log/slog"
|
|
|
|
"os"
|
|
|
|
"syscall"
|
|
|
|
|
|
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
|
|
"github.com/prometheus/procfs"
|
|
|
|
)
|
|
|
|
|
|
|
|
// psiResources lists the pressure stall information (PSI) resources that the
// collector queries, in the order in which they are processed.
var psiResources = []string{"cpu", "io", "memory", "irq"}
|
|
|
|
|
|
|
|
type pressureStatsCollector struct {
|
|
|
|
cpu *prometheus.Desc
|
|
|
|
io *prometheus.Desc
|
|
|
|
ioFull *prometheus.Desc
|
|
|
|
mem *prometheus.Desc
|
|
|
|
memFull *prometheus.Desc
|
|
|
|
irqFull *prometheus.Desc
|
|
|
|
|
|
|
|
fs procfs.FS
|
|
|
|
|
|
|
|
logger *slog.Logger
|
|
|
|
}
|
|
|
|
|
|
|
|
func init() {
|
|
|
|
registerCollector("pressure", defaultEnabled, NewPressureStatsCollector)
|
|
|
|
}
|
|
|
|
|
|
|
|
// NewPressureStatsCollector returns a Collector exposing pressure stall information
|
|
|
|
func NewPressureStatsCollector(logger *slog.Logger) (Collector, error) {
|
|
|
|
fs, err := procfs.NewFS(*procPath)
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("failed to open procfs: %w", err)
|
|
|
|
}
|
|
|
|
|
|
|
|
return &pressureStatsCollector{
|
|
|
|
cpu: prometheus.NewDesc(
|
|
|
|
prometheus.BuildFQName(namespace, "pressure", "cpu_waiting_seconds_total"),
|
|
|
|
"Total time in seconds that processes have waited for CPU time",
|
|
|
|
nil, nil,
|
|
|
|
),
|
|
|
|
io: prometheus.NewDesc(
|
|
|
|
prometheus.BuildFQName(namespace, "pressure", "io_waiting_seconds_total"),
|
|
|
|
"Total time in seconds that processes have waited due to IO congestion",
|
|
|
|
nil, nil,
|
|
|
|
),
|
|
|
|
ioFull: prometheus.NewDesc(
|
|
|
|
prometheus.BuildFQName(namespace, "pressure", "io_stalled_seconds_total"),
|
|
|
|
"Total time in seconds no process could make progress due to IO congestion",
|
|
|
|
nil, nil,
|
|
|
|
),
|
|
|
|
mem: prometheus.NewDesc(
|
|
|
|
prometheus.BuildFQName(namespace, "pressure", "memory_waiting_seconds_total"),
|
|
|
|
"Total time in seconds that processes have waited for memory",
|
|
|
|
nil, nil,
|
|
|
|
),
|
|
|
|
memFull: prometheus.NewDesc(
|
|
|
|
prometheus.BuildFQName(namespace, "pressure", "memory_stalled_seconds_total"),
|
|
|
|
"Total time in seconds no process could make progress due to memory congestion",
|
|
|
|
nil, nil,
|
|
|
|
),
|
|
|
|
irqFull: prometheus.NewDesc(
|
|
|
|
prometheus.BuildFQName(namespace, "pressure", "irq_stalled_seconds_total"),
|
|
|
|
"Total time in seconds no process could make progress due to IRQ congestion",
|
|
|
|
nil, nil,
|
|
|
|
),
|
|
|
|
fs: fs,
|
|
|
|
logger: logger,
|
|
|
|
}, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// Update calls procfs.NewPSIStatsForResource for the different resources and updates the values
|
|
|
|
func (c *pressureStatsCollector) Update(ch chan<- prometheus.Metric) error {
|
|
|
|
for _, res := range psiResources {
|
|
|
|
c.logger.Debug("collecting statistics for resource", "resource", res)
|
|
|
|
vals, err := c.fs.PSIStatsForResource(res)
|
|
|
|
if err != nil {
|
|
|
|
if errors.Is(err, os.ErrNotExist) {
|
|
|
|
c.logger.Debug("pressure information is unavailable, you need a Linux kernel >= 4.20 and/or CONFIG_PSI enabled for your kernel")
|
|
|
|
return ErrNoData
|
|
|
|
}
|
|
|
|
if errors.Is(err, syscall.ENOTSUP) {
|
|
|
|
c.logger.Debug("pressure information is disabled, add psi=1 kernel command line to enable it")
|
|
|
|
return ErrNoData
|
|
|
|
}
|
|
|
|
return fmt.Errorf("failed to retrieve pressure stats: %w", err)
|
|
|
|
}
|
|
|
|
// IRQ pressure does not have 'some' data.
|
|
|
|
// See https://github.com/torvalds/linux/blob/v6.9/include/linux/psi_types.h#L65
|
|
|
|
if vals.Some == nil && res != "irq" {
|
|
|
|
c.logger.Debug("pressure information returned no 'some' data")
|
|
|
|
return ErrNoData
|
|
|
|
}
|
|
|
|
if vals.Full == nil && res != "cpu" {
|
|
|
|
c.logger.Debug("pressure information returned no 'full' data")
|
|
|
|
return ErrNoData
|
|
|
|
}
|
|
|
|
switch res {
|
|
|
|
case "cpu":
|
|
|
|
ch <- prometheus.MustNewConstMetric(c.cpu, prometheus.CounterValue, float64(vals.Some.Total)/1000.0/1000.0)
|
|
|
|
case "io":
|
|
|
|
ch <- prometheus.MustNewConstMetric(c.io, prometheus.CounterValue, float64(vals.Some.Total)/1000.0/1000.0)
|
|
|
|
ch <- prometheus.MustNewConstMetric(c.ioFull, prometheus.CounterValue, float64(vals.Full.Total)/1000.0/1000.0)
|
|
|
|
case "memory":
|
|
|
|
ch <- prometheus.MustNewConstMetric(c.mem, prometheus.CounterValue, float64(vals.Some.Total)/1000.0/1000.0)
|
|
|
|
ch <- prometheus.MustNewConstMetric(c.memFull, prometheus.CounterValue, float64(vals.Full.Total)/1000.0/1000.0)
|
|
|
|
case "irq":
|
|
|
|
ch <- prometheus.MustNewConstMetric(c.irqFull, prometheus.CounterValue, float64(vals.Full.Total)/1000.0/1000.0)
|
|
|
|
default:
|
|
|
|
c.logger.Debug("did not account for resource", "resource", res)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|