mirror of https://github.com/hashicorp/consul
Merge pull request #9924 from hashicorp/dnephin/cert-expiration-metric
connect: emit a metric for the seconds until root CA expiry (pull/10437/head)
commit
d81f527be8
|
@ -0,0 +1,4 @@
|
||||||
|
```release-note:improvement
|
||||||
|
telemetry: add a new `mesh.active-root-ca.expiry` metric for tracking when the root certificate expires.
|
||||||
|
```
|
||||||
|
|
|
@ -36,6 +36,7 @@ func (s *Server) startConnectLeader(ctx context.Context) error {
|
||||||
|
|
||||||
s.caManager.Start(ctx)
|
s.caManager.Start(ctx)
|
||||||
s.leaderRoutineManager.Start(ctx, caRootPruningRoutineName, s.runCARootPruning)
|
s.leaderRoutineManager.Start(ctx, caRootPruningRoutineName, s.runCARootPruning)
|
||||||
|
s.leaderRoutineManager.Start(ctx, caRootMetricRoutineName, rootCAExpiryMonitor(s).monitor)
|
||||||
|
|
||||||
return s.startIntentionConfigEntryMigration(ctx)
|
return s.startIntentionConfigEntryMigration(ctx)
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,74 @@
|
||||||
|
package consul
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/armon/go-metrics"
|
||||||
|
"github.com/armon/go-metrics/prometheus"
|
||||||
|
"github.com/hashicorp/go-hclog"
|
||||||
|
|
||||||
|
"github.com/hashicorp/consul/logging"
|
||||||
|
)
|
||||||
|
|
||||||
|
var CertExpirationGauges = []prometheus.GaugeDefinition{
|
||||||
|
{
|
||||||
|
Name: metricsKeyMeshRootCAExpiry,
|
||||||
|
Help: "Seconds until the service mesh root certificate expires.",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
var metricsKeyMeshRootCAExpiry = []string{"mesh", "active-root-ca", "expiry"}
|
||||||
|
|
||||||
|
func rootCAExpiryMonitor(s *Server) certExpirationMonitor {
|
||||||
|
return certExpirationMonitor{
|
||||||
|
Key: metricsKeyMeshRootCAExpiry,
|
||||||
|
Labels: []metrics.Label{
|
||||||
|
{Name: "datacenter", Value: s.config.Datacenter},
|
||||||
|
},
|
||||||
|
Logger: s.logger.Named(logging.Connect),
|
||||||
|
Query: func() (time.Duration, error) {
|
||||||
|
state := s.fsm.State()
|
||||||
|
_, root, err := state.CARootActive(nil)
|
||||||
|
switch {
|
||||||
|
case err != nil:
|
||||||
|
return 0, fmt.Errorf("failed to retrieve root CA: %w", err)
|
||||||
|
case root == nil:
|
||||||
|
return 0, fmt.Errorf("no active root CA")
|
||||||
|
}
|
||||||
|
|
||||||
|
return time.Until(root.NotAfter), nil
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
type certExpirationMonitor struct {
|
||||||
|
Key []string
|
||||||
|
Labels []metrics.Label
|
||||||
|
Logger hclog.Logger
|
||||||
|
// Query is called at each interval. It should return the duration until the
|
||||||
|
// certificate expires, or an error if the query failed.
|
||||||
|
Query func() (time.Duration, error)
|
||||||
|
}
|
||||||
|
|
||||||
|
// certExpirationMonitorInterval is the ticker period used by
// certExpirationMonitor.monitor between two queries of the certificate expiry.
const certExpirationMonitorInterval = time.Hour
|
||||||
|
|
||||||
|
func (m certExpirationMonitor) monitor(ctx context.Context) error {
|
||||||
|
ticker := time.NewTicker(certExpirationMonitorInterval)
|
||||||
|
defer ticker.Stop()
|
||||||
|
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
return nil
|
||||||
|
case <-ticker.C:
|
||||||
|
d, err := m.Query()
|
||||||
|
if err != nil {
|
||||||
|
m.Logger.Warn("failed to emit certificate expiry metric", "error", err)
|
||||||
|
}
|
||||||
|
expiry := d / time.Second
|
||||||
|
metrics.SetGaugeWithLabels(m.Key, float32(expiry), m.Labels)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -102,6 +102,7 @@ const (
|
||||||
aclTokenReapingRoutineName = "acl token reaping"
|
aclTokenReapingRoutineName = "acl token reaping"
|
||||||
aclUpgradeRoutineName = "legacy ACL token upgrade"
|
aclUpgradeRoutineName = "legacy ACL token upgrade"
|
||||||
caRootPruningRoutineName = "CA root pruning"
|
caRootPruningRoutineName = "CA root pruning"
|
||||||
|
caRootMetricRoutineName = "CA root expiration metric"
|
||||||
configReplicationRoutineName = "config entry replication"
|
configReplicationRoutineName = "config entry replication"
|
||||||
federationStateReplicationRoutineName = "federation state replication"
|
federationStateReplicationRoutineName = "federation state replication"
|
||||||
federationStateAntiEntropyRoutineName = "federation state anti-entropy"
|
federationStateAntiEntropyRoutineName = "federation state anti-entropy"
|
||||||
|
|
|
@ -194,6 +194,7 @@ func getPrometheusDefs(cfg lib.TelemetryConfig) ([]prometheus.GaugeDefinition, [
|
||||||
xds.StatsGauges,
|
xds.StatsGauges,
|
||||||
usagemetrics.Gauges,
|
usagemetrics.Gauges,
|
||||||
consul.ReplicationGauges,
|
consul.ReplicationGauges,
|
||||||
|
consul.CertExpirationGauges,
|
||||||
Gauges,
|
Gauges,
|
||||||
raftGauges,
|
raftGauges,
|
||||||
}
|
}
|
||||||
|
|
|
@ -478,6 +478,7 @@ These metrics give insight into the health of the cluster as a whole.
|
||||||
| `consul.catalog.connect.query-tag.<service>.<tag>` | Increments for each connect-based catalog query for the given service with the given tag. | queries | counter |
|
| `consul.catalog.connect.query-tag.<service>.<tag>` | Increments for each connect-based catalog query for the given service with the given tag. | queries | counter |
|
||||||
| `consul.catalog.connect.query-tags.<service>.<tags>` | Increments for each connect-based catalog query for the given service with the given tags. | queries | counter |
|
| `consul.catalog.connect.query-tags.<service>.<tags>` | Increments for each connect-based catalog query for the given service with the given tags. | queries | counter |
|
||||||
| `consul.catalog.connect.not-found.<service>` | Increments for each connect-based catalog query where the given service could not be found. | queries | counter |
|
| `consul.catalog.connect.not-found.<service>` | Increments for each connect-based catalog query where the given service could not be found. | queries | counter |
|
||||||
|
| `consul.mesh.active-root-ca.expiry` | The number of seconds until the root CA expires, updated every hour. | seconds | gauge |
|
||||||
|
|
||||||
## Connect Built-in Proxy Metrics
|
## Connect Built-in Proxy Metrics
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue