node_exporter/collector/ntp.go

169 lines
5.9 KiB
Go

// Copyright 2015 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//go:build !nontp
// +build !nontp
package collector
import (
"fmt"
"log/slog"
"net"
"sync"
"time"
"github.com/beevik/ntp"
"github.com/prometheus/client_golang/prometheus"
)
const (
	// ntpSubsystem is the subsystem component used when building the
	// fully-qualified names of the NTP metrics.
	ntpSubsystem = "ntp"

	// hour24 is a fixed span of exactly 24 hours. The `time` package does not
	// export a `Day` constant because a calendar day is not always 24h long
	// (DST), so the span is spelled out here.
	hour24 = time.Hour * 24
)
// Package-level leap-second state; it outlives any single scrape.
var (
	// leapMidnightMutex serialises every read and write of leapMidnight.
	leapMidnightMutex = new(sync.Mutex)

	// leapMidnight remembers the midnight at which an announced leap second
	// takes effect. It starts out as the zero time.Time.
	leapMidnight time.Time
)
// ntpCollector holds one metric descriptor per exported NTP value, together
// with the logger and configuration handed to its constructor.
type ntpCollector struct {
	stratum        typedDesc
	leap           typedDesc
	rtt            typedDesc
	offset         typedDesc
	reftime        typedDesc
	rootDelay      typedDesc
	rootDispersion typedDesc
	sanity         typedDesc

	logger *slog.Logger
	config *NodeCollectorConfig
}
// init registers NewNtpCollector under the name "ntp", passing
// defaultDisabled as its default state.
func init() {
	const collectorName = "ntp"
	registerCollector(collectorName, defaultDisabled, NewNtpCollector)
}
// NewNtpCollector returns a new Collector exposing sanity of the local NTP
// server.
//
// The configuration is validated before anything is built; an error is
// returned when:
//   - config.NTP.Server is not a loopback IP address and
//     config.NTP.ServerIsLocal is not set;
//   - config.NTP.ProtocolVersion is outside the range 2..4;
//   - config.NTP.OffsetTolerance is negative;
//   - config.NTP.ServerPort is outside the range 1..65535.
//
// When validation passes, a deprecation warning is written to logger and a
// collector carrying one gauge descriptor per NTP metric is returned.
func NewNtpCollector(config *NodeCollectorConfig, logger *slog.Logger) (Collector, error) {
	serverIP := net.ParseIP(*config.NTP.Server)
	isLoopback := serverIP != nil && serverIP.IsLoopback()
	if !*config.NTP.ServerIsLocal && !isLoopback {
		return nil, fmt.Errorf("only IP address of local NTP server is valid for --collector.ntp.server")
	}

	if version := *config.NTP.ProtocolVersion; version < 2 || version > 4 {
		return nil, fmt.Errorf("invalid NTP protocol version %d; must be 2, 3, or 4", version)
	}

	if tolerance := *config.NTP.OffsetTolerance; tolerance < 0 {
		return nil, fmt.Errorf("offset tolerance must be non-negative")
	}

	if port := *config.NTP.ServerPort; port < 1 || port > 65535 {
		return nil, fmt.Errorf("invalid NTP port number %d; must be between 1 and 65535 inclusive", port)
	}

	logger.Warn("This collector is deprecated and will be removed in the next major version release.")

	// gauge builds a label-less gauge descriptor inside the NTP subsystem.
	gauge := func(name, help string) typedDesc {
		return typedDesc{
			prometheus.NewDesc(
				prometheus.BuildFQName(namespace, ntpSubsystem, name),
				help,
				nil, nil,
			),
			prometheus.GaugeValue,
		}
	}

	collector := &ntpCollector{
		stratum:        gauge("stratum", "NTPD stratum."),
		leap:           gauge("leap", "NTPD leap second indicator, 2 bits."),
		rtt:            gauge("rtt_seconds", "RTT to NTPD."),
		offset:         gauge("offset_seconds", "ClockOffset between NTP and local clock."),
		reftime:        gauge("reference_timestamp_seconds", "NTPD ReferenceTime, UNIX timestamp."),
		rootDelay:      gauge("root_delay_seconds", "NTPD RootDelay."),
		rootDispersion: gauge("root_dispersion_seconds", "NTPD RootDispersion."),
		sanity:         gauge("sanity", "NTPD sanity according to RFC5905 heuristics and configured limits."),
		logger:         logger,
		config:         config,
	}
	return collector, nil
}
// Update sends a single query to the configured NTP server and emits, in
// order, the stratum, leap, RTT, offset, reference timestamp, root delay,
// root dispersion and sanity gauges on ch.
//
// If the query fails, the error is wrapped and returned and no metric is
// sent.
func (c *ntpCollector) Update(ch chan<- prometheus.Metric) error {
	server := *c.config.NTP.Server
	opts := ntp.QueryOptions{
		Version: *c.config.NTP.ProtocolVersion,
		TTL:     *c.config.NTP.IPTTL,
		Timeout: time.Second, // default `ntpdate` timeout
		Port:    *c.config.NTP.ServerPort,
	}

	resp, err := ntp.QueryWithOptions(server, opts)
	if err != nil {
		return fmt.Errorf("couldn't get SNTP reply: %w", err)
	}

	ch <- c.stratum.mustNewConstMetric(float64(resp.Stratum))
	ch <- c.leap.mustNewConstMetric(float64(resp.Leap))
	ch <- c.rtt.mustNewConstMetric(resp.RTT.Seconds())
	ch <- c.offset.mustNewConstMetric(resp.ClockOffset.Seconds())

	// Go Zero is 0001-01-01 00:00:00 UTC
	// NTP Zero is 1900-01-01 00:00:00 UTC
	// UNIX Zero is 1970-01-01 00:00:00 UTC
	// so every `reftime` at or before the UNIX epoch is reported as zero.
	var refSeconds float64
	if ref := resp.ReferenceTime; ref.Unix() > 0 {
		refSeconds = float64(ref.UnixNano()) / 1e9
	}
	ch <- c.reftime.mustNewConstMetric(refSeconds)

	ch <- c.rootDelay.mustNewConstMetric(resp.RootDelay.Seconds())
	ch <- c.rootDispersion.mustNewConstMetric(resp.RootDispersion.Seconds())

	// The SNTP packet sanity check is exposed as a metric to move the burden
	// of configuration from the node_exporter user to the developer.
	tolerance := *c.config.NTP.OffsetTolerance
	func() {
		leapMidnightMutex.Lock()
		defer leapMidnightMutex.Unlock()

		switch resp.Leap {
		case ntp.LeapAddSecond, ntp.LeapDelSecond:
			// The leap flag is dropped right after midnight, so the
			// upcoming midnight is cached while the flag is still set.
			leapMidnight = resp.Time.Truncate(hour24).Add(hour24)
		}

		windowStart := leapMidnight.Add(-hour24)
		windowEnd := leapMidnight.Add(hour24)
		if windowStart.Before(resp.Time) && resp.Time.Before(windowEnd) {
			// tolerate leap smearing
			tolerance += time.Second
		}
	}()

	var sane float64
	if resp.Validate() == nil && resp.RootDistance <= *c.config.NTP.MaxDistance && resp.MinError <= tolerance {
		sane = 1
	}
	ch <- c.sanity.mustNewConstMetric(sane)

	return nil
}