// Copyright 2015 The Prometheus Authors // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // +build !nosystemd package collector import ( "fmt" "math" "regexp" "strings" "github.com/coreos/go-systemd/dbus" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/common/log" kingpin "gopkg.in/alecthomas/kingpin.v2" ) var ( unitWhitelist = kingpin.Flag("collector.systemd.unit-whitelist", "Regexp of systemd units to whitelist. Units must both match whitelist and not match blacklist to be included.").Default(".+").String() unitBlacklist = kingpin.Flag("collector.systemd.unit-blacklist", "Regexp of systemd units to blacklist. Units must both match whitelist and not match blacklist to be included.").Default(".+\\.scope").String() systemdPrivate = kingpin.Flag("collector.systemd.private", "Establish a private, direct connection to systemd without dbus.").Bool() ) type systemdCollector struct { unitDesc *prometheus.Desc unitStartTimeDesc *prometheus.Desc unitTasksCurrentDesc *prometheus.Desc unitTasksMaxDesc *prometheus.Desc systemRunningDesc *prometheus.Desc summaryDesc *prometheus.Desc nRestartsDesc *prometheus.Desc timerLastTriggerDesc *prometheus.Desc socketAcceptedConnectionsDesc *prometheus.Desc socketCurrentConnectionsDesc *prometheus.Desc socketRefusedConnectionsDesc *prometheus.Desc unitWhitelistPattern *regexp.Regexp unitBlacklistPattern *regexp.Regexp } var unitStatesName = []string{"active", "activating", "deactivating", "inactive", "failed"} func init() { registerCollector("systemd", defaultDisabled, NewSystemdCollector) } // NewSystemdCollector returns a new Collector exposing systemd statistics. func NewSystemdCollector() (Collector, error) { const subsystem = "systemd" unitDesc := prometheus.NewDesc( prometheus.BuildFQName(namespace, subsystem, "unit_state"), "Systemd unit", []string{"name", "state", "type"}, nil, ) unitStartTimeDesc := prometheus.NewDesc( prometheus.BuildFQName(namespace, subsystem, "unit_start_time_seconds"), "Start time of the unit since unix epoch in seconds.", []string{"name"}, nil, ) unitTasksCurrentDesc := prometheus.NewDesc( prometheus.BuildFQName(namespace, subsystem, "unit_tasks_current"), "Current number of tasks per Systemd unit", []string{"name"}, nil, ) unitTasksMaxDesc := prometheus.NewDesc( prometheus.BuildFQName(namespace, subsystem, "unit_tasks_max"), "Maximum number of tasks per Systemd unit", []string{"name"}, nil, ) systemRunningDesc := prometheus.NewDesc( prometheus.BuildFQName(namespace, subsystem, "system_running"), "Whether the system is operational (see 'systemctl is-system-running')", nil, nil, ) summaryDesc := prometheus.NewDesc( prometheus.BuildFQName(namespace, subsystem, "units"), "Summary of systemd unit states", []string{"state"}, nil) nRestartsDesc := prometheus.NewDesc( prometheus.BuildFQName(namespace, subsystem, "service_restart_total"), "Service unit count of Restart triggers", []string{"state"}, nil) timerLastTriggerDesc := prometheus.NewDesc( prometheus.BuildFQName(namespace, subsystem, "timer_last_trigger_seconds"), "Seconds since epoch of last trigger.", []string{"name"}, nil) socketAcceptedConnectionsDesc := prometheus.NewDesc( prometheus.BuildFQName(namespace, subsystem, "socket_accepted_connections_total"), "Total number of accepted socket connections", []string{"name"}, nil) socketCurrentConnectionsDesc := prometheus.NewDesc( prometheus.BuildFQName(namespace, subsystem, "socket_current_connections"), "Current number of socket connections", []string{"name"}, nil) socketRefusedConnectionsDesc := prometheus.NewDesc( prometheus.BuildFQName(namespace, subsystem, "socket_refused_connections_total"), "Total number of refused socket connections", []string{"name"}, nil) unitWhitelistPattern := regexp.MustCompile(fmt.Sprintf("^(?:%s)$", *unitWhitelist)) unitBlacklistPattern := regexp.MustCompile(fmt.Sprintf("^(?:%s)$", *unitBlacklist)) return &systemdCollector{ unitDesc: unitDesc, unitStartTimeDesc: unitStartTimeDesc, unitTasksCurrentDesc: unitTasksCurrentDesc, unitTasksMaxDesc: unitTasksMaxDesc, systemRunningDesc: systemRunningDesc, summaryDesc: summaryDesc, nRestartsDesc: nRestartsDesc, timerLastTriggerDesc: timerLastTriggerDesc, socketAcceptedConnectionsDesc: socketAcceptedConnectionsDesc, socketCurrentConnectionsDesc: socketCurrentConnectionsDesc, socketRefusedConnectionsDesc: socketRefusedConnectionsDesc, unitWhitelistPattern: unitWhitelistPattern, unitBlacklistPattern: unitBlacklistPattern, }, nil } func (c *systemdCollector) Update(ch chan<- prometheus.Metric) error { allUnits, err := c.getAllUnits() if err != nil { return fmt.Errorf("couldn't get units: %s", err) } summary := summarizeUnits(allUnits) c.collectSummaryMetrics(ch, summary) units := filterUnits(allUnits, c.unitWhitelistPattern, c.unitBlacklistPattern) c.collectUnitStatusMetrics(ch, units) c.collectUnitStartTimeMetrics(ch, units) c.collectUnitTasksCurrentMetrics(ch, units) c.collectUnitTasksMaxMetrics(ch, units) c.collectTimers(ch, units) c.collectSockets(ch, units) systemState, err := c.getSystemState() if err != nil { return fmt.Errorf("couldn't get system state: %s", err) } c.collectSystemState(ch, systemState) return nil } func (c *systemdCollector) collectUnitStatusMetrics(ch chan<- prometheus.Metric, units []unit) { for _, unit := range units { for _, stateName := range unitStatesName { isActive := 0.0 if stateName == unit.ActiveState { isActive = 1.0 } ch <- prometheus.MustNewConstMetric( c.unitDesc, prometheus.GaugeValue, isActive, unit.Name, stateName, unit.serviceType) } if strings.HasSuffix(unit.Name, ".service") && unit.nRestarts != nil { ch <- prometheus.MustNewConstMetric( c.nRestartsDesc, prometheus.CounterValue, float64(*unit.nRestarts), unit.Name) } } } func (c *systemdCollector) collectSockets(ch chan<- prometheus.Metric, units []unit) { for _, unit := range units { if !strings.HasSuffix(unit.Name, ".socket") { continue } ch <- prometheus.MustNewConstMetric( c.socketAcceptedConnectionsDesc, prometheus.CounterValue, float64(unit.acceptedConnections), unit.Name) ch <- prometheus.MustNewConstMetric( c.socketCurrentConnectionsDesc, prometheus.GaugeValue, float64(unit.currentConnections), unit.Name) if unit.refusedConnections != nil { ch <- prometheus.MustNewConstMetric( c.socketRefusedConnectionsDesc, prometheus.GaugeValue, float64(*unit.refusedConnections), unit.Name) } } } func (c *systemdCollector) collectUnitStartTimeMetrics(ch chan<- prometheus.Metric, units []unit) { for _, unit := range units { ch <- prometheus.MustNewConstMetric( c.unitStartTimeDesc, prometheus.GaugeValue, float64(unit.startTimeUsec)/1e6, unit.Name) } } func (c *systemdCollector) collectUnitTasksCurrentMetrics(ch chan<- prometheus.Metric, units []unit) { for _, unit := range units { if unit.tasksCurrent != nil { ch <- prometheus.MustNewConstMetric( c.unitTasksCurrentDesc, prometheus.GaugeValue, float64(*unit.tasksCurrent), unit.Name) } } } func (c *systemdCollector) collectUnitTasksMaxMetrics(ch chan<- prometheus.Metric, units []unit) { for _, unit := range units { if unit.tasksMax != nil { ch <- prometheus.MustNewConstMetric( c.unitTasksMaxDesc, prometheus.GaugeValue, float64(*unit.tasksMax), unit.Name) } } } func (c *systemdCollector) collectTimers(ch chan<- prometheus.Metric, units []unit) { for _, unit := range units { if !strings.HasSuffix(unit.Name, ".timer") { continue } ch <- prometheus.MustNewConstMetric( c.timerLastTriggerDesc, prometheus.GaugeValue, float64(unit.lastTriggerUsec)/1e6, unit.Name) } } func (c *systemdCollector) collectSummaryMetrics(ch chan<- prometheus.Metric, summary map[string]float64) { for stateName, count := range summary { ch <- prometheus.MustNewConstMetric( c.summaryDesc, prometheus.GaugeValue, count, stateName) } } func (c *systemdCollector) collectSystemState(ch chan<- prometheus.Metric, systemState string) { isSystemRunning := 0.0 if systemState == `"running"` { isSystemRunning = 1.0 } ch <- prometheus.MustNewConstMetric(c.systemRunningDesc, prometheus.GaugeValue, isSystemRunning) } func (c *systemdCollector) newDbus() (*dbus.Conn, error) { if *systemdPrivate { return dbus.NewSystemdConnection() } return dbus.New() } type unit struct { dbus.UnitStatus lastTriggerUsec uint64 startTimeUsec uint64 tasksCurrent *uint64 tasksMax *uint64 nRestarts *uint32 serviceType string acceptedConnections uint32 currentConnections uint32 refusedConnections *uint32 } // unitType gets the suffix after the last "." in the // unit name and capitalizes the first letter func (u *unit) unitType() string { suffixIndex := strings.LastIndex(u.Name, ".") + 1 if suffixIndex < 1 || suffixIndex > len(u.Name) { return "" } return strings.Title(u.Name[suffixIndex:]) } func (c *systemdCollector) getAllUnits() ([]unit, error) { conn, err := c.newDbus() if err != nil { return nil, fmt.Errorf("couldn't get dbus connection: %s", err) } defer conn.Close() // Filter out any units that are not installed and are pulled in only as dependencies. allUnits, err := conn.ListUnits() if err != nil { return nil, err } result := make([]unit, 0, len(allUnits)) for _, status := range allUnits { unit := unit{ UnitStatus: status, } unitType := unit.unitType() if unitType == "Service" || unitType == "Mount" { serviceType, err := conn.GetUnitTypeProperty(unit.Name, unitType, "Type") if err != nil { log.Debugf("couldn't get type for unit '%s': %s", unit.Name, err) } else { unit.serviceType = serviceType.Value.Value().(string) } } if strings.HasSuffix(unit.Name, ".timer") { lastTriggerValue, err := conn.GetUnitTypeProperty(unit.Name, "Timer", "LastTriggerUSec") if err != nil { log.Debugf("couldn't get unit '%s' LastTriggerUSec: %s", unit.Name, err) continue } unit.lastTriggerUsec = lastTriggerValue.Value.Value().(uint64) } if strings.HasSuffix(unit.Name, ".service") { // NRestarts wasn't added until systemd 235. restartsCount, err := conn.GetUnitTypeProperty(unit.Name, "Service", "NRestarts") if err != nil { log.Debugf("couldn't get unit '%s' NRestarts: %s", unit.Name, err) } else { nRestarts := restartsCount.Value.Value().(uint32) unit.nRestarts = &nRestarts } tasksCurrentCount, err := conn.GetUnitTypeProperty(unit.Name, "Service", "TasksCurrent") if err != nil { log.Debugf("couldn't get unit '%s' TasksCurrent: %s", unit.Name, err) } else { val := tasksCurrentCount.Value.Value().(uint64) // Don't set if tasksCurrent if dbus reports MaxUint64. if val != math.MaxUint64 { unit.tasksCurrent = &val } } tasksMaxCount, err := conn.GetUnitTypeProperty(unit.Name, "Service", "TasksMax") if err != nil { log.Debugf("couldn't get unit '%s' TasksMax: %s", unit.Name, err) } else { val := tasksMaxCount.Value.Value().(uint64) // Don't set if tasksMax if dbus reports MaxUint64. if val != math.MaxUint64 { unit.tasksMax = &val } } } if strings.HasSuffix(unit.Name, ".socket") { acceptedConnectionCount, err := conn.GetUnitTypeProperty(unit.Name, "Socket", "NAccepted") if err != nil { log.Debugf("couldn't get unit '%s' NAccepted: %s", unit.Name, err) continue } unit.acceptedConnections = acceptedConnectionCount.Value.Value().(uint32) currentConnectionCount, err := conn.GetUnitTypeProperty(unit.Name, "Socket", "NConnections") if err != nil { log.Debugf("couldn't get unit '%s' NConnections: %s", unit.Name, err) continue } unit.currentConnections = currentConnectionCount.Value.Value().(uint32) // NRefused wasn't added until systemd 239. refusedConnectionCount, err := conn.GetUnitTypeProperty(unit.Name, "Socket", "NRefused") if err != nil { log.Debugf("couldn't get unit '%s' NRefused: %s", unit.Name, err) } else { nRefused := refusedConnectionCount.Value.Value().(uint32) unit.refusedConnections = &nRefused } } if unit.ActiveState != "active" { unit.startTimeUsec = 0 } else { timestampValue, err := conn.GetUnitProperty(unit.Name, "ActiveEnterTimestamp") if err != nil { log.Debugf("couldn't get unit '%s' StartTimeUsec: %s", unit.Name, err) continue } unit.startTimeUsec = timestampValue.Value.Value().(uint64) } result = append(result, unit) } return result, nil } func summarizeUnits(units []unit) map[string]float64 { summarized := make(map[string]float64) for _, unitStateName := range unitStatesName { summarized[unitStateName] = 0.0 } for _, unit := range units { summarized[unit.ActiveState] += 1.0 } return summarized } func filterUnits(units []unit, whitelistPattern, blacklistPattern *regexp.Regexp) []unit { filtered := make([]unit, 0, len(units)) for _, unit := range units { if whitelistPattern.MatchString(unit.Name) && !blacklistPattern.MatchString(unit.Name) && unit.LoadState == "loaded" { log.Debugf("Adding unit: %s", unit.Name) filtered = append(filtered, unit) } else { log.Debugf("Ignoring unit: %s", unit.Name) } } return filtered } func (c *systemdCollector) getSystemState() (state string, err error) { conn, err := c.newDbus() if err != nil { return "", fmt.Errorf("couldn't get dbus connection: %s", err) } state, err = conn.GetManagerProperty("SystemState") conn.Close() return state, err }