prometheusmetricshost-metricsmachine-metricsnode-metricsprocfsprometheus-exportersystem-informationsystem-metrics
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
435 lines
12 KiB
435 lines
12 KiB
// Copyright 2019 The Prometheus Authors |
|
// Licensed under the Apache License, Version 2.0 (the "License"); |
|
// you may not use this file except in compliance with the License. |
|
// You may obtain a copy of the License at |
|
// |
|
// http://www.apache.org/licenses/LICENSE-2.0 |
|
// |
|
// Unless required by applicable law or agreed to in writing, software |
|
// distributed under the License is distributed on an "AS IS" BASIS, |
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
// See the License for the specific language governing permissions and |
|
// limitations under the License. |
|
|
|
//go:build !nobtrfs |
|
// +build !nobtrfs |
|
|
|
package collector |
|
|
|
import (
	"errors"
	"fmt"
	"log/slog"
	"path"
	"strings"
	"syscall"

	dennwc "github.com/dennwc/btrfs"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/procfs/btrfs"
)
|
|
|
// A btrfsCollector is a Collector which gathers metrics from Btrfs filesystems. |
|
type btrfsCollector struct { |
|
fs btrfs.FS |
|
logger *slog.Logger |
|
} |
|
|
|
func init() { |
|
registerCollector("btrfs", defaultEnabled, NewBtrfsCollector) |
|
} |
|
|
|
// NewBtrfsCollector returns a new Collector exposing Btrfs statistics. |
|
func NewBtrfsCollector(logger *slog.Logger) (Collector, error) { |
|
fs, err := btrfs.NewFS(*sysPath) |
|
if err != nil { |
|
return nil, fmt.Errorf("failed to open sysfs: %w", err) |
|
} |
|
|
|
return &btrfsCollector{ |
|
fs: fs, |
|
logger: logger, |
|
}, nil |
|
} |
|
|
|
// Update retrieves and exports Btrfs statistics. |
|
// It implements Collector. |
|
func (c *btrfsCollector) Update(ch chan<- prometheus.Metric) error { |
|
stats, err := c.fs.Stats() |
|
if err != nil { |
|
return fmt.Errorf("failed to retrieve Btrfs stats from procfs: %w", err) |
|
} |
|
|
|
ioctlStatsMap, err := c.getIoctlStats() |
|
if err != nil { |
|
c.logger.Debug( |
|
"Error querying btrfs device stats with ioctl", |
|
"err", err) |
|
ioctlStatsMap = make(map[string]*btrfsIoctlFsStats) |
|
} |
|
|
|
for _, s := range stats { |
|
// match up procfs and ioctl info by filesystem UUID (without dashes) |
|
var fsUUID = strings.Replace(s.UUID, "-", "", -1) |
|
ioctlStats := ioctlStatsMap[fsUUID] |
|
c.updateBtrfsStats(ch, s, ioctlStats) |
|
} |
|
|
|
return nil |
|
} |
|
|
|
// btrfsIoctlFsDevStats holds the information collected through ioctl for one
// device of a Btrfs filesystem.
type btrfsIoctlFsDevStats struct {
	// path and uuid identify the device as reported by the device info query.
	path, uuid string

	// bytesUsed and totalBytes are the usage figures from the device info query.
	bytesUsed, totalBytes uint64

	// The error counters below mirror the following upstream lists:
	// https://github.com/dennwc/btrfs/blob/b3db0b2dedac3bf580f412034d77e0bf4b420167/btrfs.go#L132-L140
	// https://github.com/torvalds/linux/blob/70d605cbeecb408dd884b1f0cd3963eeeaac144c/include/uapi/linux/btrfs.h#L680-L692
	writeErrs      uint64
	readErrs       uint64
	flushErrs      uint64
	corruptionErrs uint64
	generationErrs uint64
}
|
|
|
type btrfsIoctlFsStats struct { |
|
uuid string |
|
devices []btrfsIoctlFsDevStats |
|
} |
|
|
|
func (c *btrfsCollector) getIoctlStats() (map[string]*btrfsIoctlFsStats, error) { |
|
// Instead of introducing more ioctl calls to scan for all btrfs |
|
// filesystems re-use our mount point utils to find known mounts |
|
mountsList, err := mountPointDetails(c.logger) |
|
if err != nil { |
|
return nil, err |
|
} |
|
|
|
// Track devices we have successfully scanned, by device path. |
|
devicesDone := make(map[string]struct{}) |
|
// Filesystems scann results by UUID. |
|
fsStats := make(map[string]*btrfsIoctlFsStats) |
|
|
|
for _, mount := range mountsList { |
|
if mount.fsType != "btrfs" { |
|
continue |
|
} |
|
|
|
if _, found := devicesDone[mount.device]; found { |
|
// We already found this filesystem by another mount point. |
|
continue |
|
} |
|
|
|
mountPath := rootfsFilePath(mount.mountPoint) |
|
|
|
fs, err := dennwc.Open(mountPath, true) |
|
if err != nil { |
|
// Failed to open this mount point, maybe we didn't have permission |
|
// maybe we'll find another mount point for this FS later. |
|
c.logger.Debug( |
|
"Error inspecting btrfs mountpoint", |
|
"mountPoint", mountPath, |
|
"err", err) |
|
continue |
|
} |
|
defer fs.Close() |
|
|
|
fsInfo, err := fs.Info() |
|
if err != nil { |
|
// Failed to get the FS info for some reason, |
|
// perhaps it'll work with a different mount point |
|
c.logger.Debug( |
|
"Error querying btrfs filesystem", |
|
"mountPoint", mountPath, |
|
"err", err) |
|
continue |
|
} |
|
|
|
fsID := fsInfo.FSID.String() |
|
if _, found := fsStats[fsID]; found { |
|
// We already found this filesystem by another mount point |
|
continue |
|
} |
|
|
|
deviceStats, err := c.getIoctlDeviceStats(fs, &fsInfo) |
|
if err != nil { |
|
c.logger.Debug( |
|
"Error querying btrfs device stats", |
|
"mountPoint", mountPath, |
|
"err", err) |
|
continue |
|
} |
|
|
|
devicesDone[mount.device] = struct{}{} |
|
fsStats[fsID] = &btrfsIoctlFsStats{ |
|
uuid: fsID, |
|
devices: deviceStats, |
|
} |
|
} |
|
|
|
return fsStats, nil |
|
} |
|
|
|
func (c *btrfsCollector) getIoctlDeviceStats(fs *dennwc.FS, fsInfo *dennwc.Info) ([]btrfsIoctlFsDevStats, error) { |
|
devices := make([]btrfsIoctlFsDevStats, 0, fsInfo.NumDevices) |
|
|
|
for i := uint64(0); i <= fsInfo.MaxID; i++ { |
|
deviceInfo, err := fs.GetDevInfo(i) |
|
|
|
if err != nil { |
|
if errno, ok := err.(syscall.Errno); ok && errno == syscall.ENODEV { |
|
// Device IDs do not consistently start at 0, nor are ranges contiguous, so we expect this. |
|
continue |
|
} |
|
return nil, err |
|
} |
|
|
|
deviceStats, err := fs.GetDevStats(i) |
|
if err != nil { |
|
return nil, err |
|
} |
|
|
|
devices = append(devices, btrfsIoctlFsDevStats{ |
|
path: deviceInfo.Path, |
|
uuid: deviceInfo.UUID.String(), |
|
bytesUsed: deviceInfo.BytesUsed, |
|
totalBytes: deviceInfo.TotalBytes, |
|
|
|
writeErrs: deviceStats.WriteErrs, |
|
readErrs: deviceStats.ReadErrs, |
|
flushErrs: deviceStats.FlushErrs, |
|
corruptionErrs: deviceStats.CorruptionErrs, |
|
generationErrs: deviceStats.GenerationErrs, |
|
}) |
|
|
|
if uint64(len(devices)) == fsInfo.NumDevices { |
|
break |
|
} |
|
} |
|
|
|
return devices, nil |
|
} |
|
|
|
// btrfsMetric represents a single Btrfs metric that is converted into a Prometheus Metric. |
|
type btrfsMetric struct { |
|
name string |
|
metricType prometheus.ValueType |
|
desc string |
|
value float64 |
|
extraLabel []string |
|
extraLabelValue []string |
|
} |
|
|
|
// updateBtrfsStats collects statistics for one bcache ID. |
|
func (c *btrfsCollector) updateBtrfsStats(ch chan<- prometheus.Metric, s *btrfs.Stats, ioctlStats *btrfsIoctlFsStats) { |
|
const subsystem = "btrfs" |
|
|
|
// Basic information about the filesystem. |
|
devLabels := []string{"uuid"} |
|
|
|
// Retrieve the metrics. |
|
metrics := c.getMetrics(s, ioctlStats) |
|
|
|
// Convert all gathered metrics to Prometheus Metrics and add to channel. |
|
for _, m := range metrics { |
|
labels := append(devLabels, m.extraLabel...) |
|
|
|
desc := prometheus.NewDesc( |
|
prometheus.BuildFQName(namespace, subsystem, m.name), |
|
m.desc, |
|
labels, |
|
nil, |
|
) |
|
|
|
labelValues := []string{s.UUID} |
|
if len(m.extraLabelValue) > 0 { |
|
labelValues = append(labelValues, m.extraLabelValue...) |
|
} |
|
|
|
ch <- prometheus.MustNewConstMetric( |
|
desc, |
|
m.metricType, |
|
m.value, |
|
labelValues..., |
|
) |
|
} |
|
} |
|
|
|
// getMetrics returns metrics for the given Btrfs statistics. |
|
func (c *btrfsCollector) getMetrics(s *btrfs.Stats, ioctlStats *btrfsIoctlFsStats) []btrfsMetric { |
|
metrics := []btrfsMetric{ |
|
{ |
|
name: "info", |
|
desc: "Filesystem information", |
|
value: 1, |
|
metricType: prometheus.GaugeValue, |
|
extraLabel: []string{"label"}, |
|
extraLabelValue: []string{s.Label}, |
|
}, |
|
{ |
|
name: "global_rsv_size_bytes", |
|
desc: "Size of global reserve.", |
|
metricType: prometheus.GaugeValue, |
|
value: float64(s.Allocation.GlobalRsvSize), |
|
}, |
|
{ |
|
name: "commits_total", |
|
desc: "The total number of commits that have occurred.", |
|
metricType: prometheus.CounterValue, |
|
value: float64(s.CommitStats.Commits), |
|
}, |
|
{ |
|
name: "last_commit_seconds", |
|
desc: "Duration of the most recent commit, in seconds.", |
|
metricType: prometheus.GaugeValue, |
|
value: float64(s.CommitStats.LastCommitMs) / 1000, |
|
}, |
|
{ |
|
name: "max_commit_seconds", |
|
desc: "Duration of the slowest commit, in seconds.", |
|
metricType: prometheus.GaugeValue, |
|
value: float64(s.CommitStats.MaxCommitMs) / 1000, |
|
}, |
|
{ |
|
name: "commit_seconds_total", |
|
desc: "Sum of the duration of all commits, in seconds.", |
|
metricType: prometheus.CounterValue, |
|
value: float64(s.CommitStats.TotalCommitMs) / 1000, |
|
}, |
|
} |
|
|
|
// Information about data, metadata and system data. |
|
metrics = append(metrics, c.getAllocationStats("data", s.Allocation.Data)...) |
|
metrics = append(metrics, c.getAllocationStats("metadata", s.Allocation.Metadata)...) |
|
metrics = append(metrics, c.getAllocationStats("system", s.Allocation.System)...) |
|
|
|
// Information about devices. |
|
if ioctlStats == nil { |
|
for n, dev := range s.Devices { |
|
metrics = append(metrics, btrfsMetric{ |
|
name: "device_size_bytes", |
|
desc: "Size of a device that is part of the filesystem.", |
|
metricType: prometheus.GaugeValue, |
|
value: float64(dev.Size), |
|
extraLabel: []string{"device"}, |
|
extraLabelValue: []string{n}, |
|
}) |
|
} |
|
return metrics |
|
} |
|
|
|
for _, dev := range ioctlStats.devices { |
|
// trim the path prefix from the device name so the value should match |
|
// the value used in the fallback branch above. |
|
// e.g. /dev/sda -> sda, /rootfs/dev/md1 -> md1 |
|
_, device := path.Split(dev.path) |
|
|
|
extraLabels := []string{"device", "btrfs_dev_uuid"} |
|
extraLabelValues := []string{device, dev.uuid} |
|
|
|
metrics = append(metrics, |
|
btrfsMetric{ |
|
name: "device_size_bytes", |
|
desc: "Size of a device that is part of the filesystem.", |
|
metricType: prometheus.GaugeValue, |
|
value: float64(dev.totalBytes), |
|
extraLabel: extraLabels, |
|
extraLabelValue: extraLabelValues, |
|
}, |
|
// A bytes available metric is probably more useful than a |
|
// bytes used metric, because large numbers of bytes will |
|
// suffer from floating point representation issues |
|
// and we probably care more about the number when it's low anyway |
|
btrfsMetric{ |
|
name: "device_unused_bytes", |
|
desc: "Unused bytes unused on a device that is part of the filesystem.", |
|
metricType: prometheus.GaugeValue, |
|
value: float64(dev.totalBytes - dev.bytesUsed), |
|
extraLabel: extraLabels, |
|
extraLabelValue: extraLabelValues, |
|
}) |
|
|
|
errorLabels := append([]string{"type"}, extraLabels...) |
|
values := []uint64{ |
|
dev.writeErrs, |
|
dev.readErrs, |
|
dev.flushErrs, |
|
dev.corruptionErrs, |
|
dev.generationErrs, |
|
} |
|
btrfsErrorTypeNames := []string{ |
|
"write", |
|
"read", |
|
"flush", |
|
"corruption", |
|
"generation", |
|
} |
|
|
|
for i, errorType := range btrfsErrorTypeNames { |
|
metrics = append(metrics, |
|
btrfsMetric{ |
|
name: "device_errors_total", |
|
desc: "Errors reported for the device", |
|
metricType: prometheus.CounterValue, |
|
value: float64(values[i]), |
|
extraLabel: errorLabels, |
|
extraLabelValue: append([]string{errorType}, extraLabelValues...), |
|
}) |
|
} |
|
} |
|
|
|
return metrics |
|
} |
|
|
|
// getAllocationStats returns allocation metrics for the given Btrfs Allocation statistics. |
|
func (c *btrfsCollector) getAllocationStats(a string, s *btrfs.AllocationStats) []btrfsMetric { |
|
metrics := []btrfsMetric{ |
|
{ |
|
name: "reserved_bytes", |
|
desc: "Amount of space reserved for a data type", |
|
metricType: prometheus.GaugeValue, |
|
value: float64(s.ReservedBytes), |
|
extraLabel: []string{"block_group_type"}, |
|
extraLabelValue: []string{a}, |
|
}, |
|
} |
|
|
|
// Add all layout statistics. |
|
for layout, stats := range s.Layouts { |
|
metrics = append(metrics, c.getLayoutStats(a, layout, stats)...) |
|
} |
|
|
|
return metrics |
|
} |
|
|
|
// getLayoutStats returns metrics for a data layout. |
|
func (c *btrfsCollector) getLayoutStats(a, l string, s *btrfs.LayoutUsage) []btrfsMetric { |
|
return []btrfsMetric{ |
|
{ |
|
name: "used_bytes", |
|
desc: "Amount of used space by a layout/data type", |
|
metricType: prometheus.GaugeValue, |
|
value: float64(s.UsedBytes), |
|
extraLabel: []string{"block_group_type", "mode"}, |
|
extraLabelValue: []string{a, l}, |
|
}, |
|
{ |
|
name: "size_bytes", |
|
desc: "Amount of space allocated for a layout/data type", |
|
metricType: prometheus.GaugeValue, |
|
value: float64(s.TotalBytes), |
|
extraLabel: []string{"block_group_type", "mode"}, |
|
extraLabelValue: []string{a, l}, |
|
}, |
|
{ |
|
name: "allocation_ratio", |
|
desc: "Data allocation ratio for a layout/data type", |
|
metricType: prometheus.GaugeValue, |
|
value: s.Ratio, |
|
extraLabel: []string{"block_group_type", "mode"}, |
|
extraLabelValue: []string{a, l}, |
|
}, |
|
} |
|
}
|
|
|