k3s/pkg/certmonitor/certmonitor.go

145 lines
5.3 KiB
Go

package certmonitor
import (
"context"
"crypto/x509"
"fmt"
"os"
"path/filepath"
"strings"
"time"
daemonconfig "github.com/k3s-io/k3s/pkg/daemons/config"
"github.com/k3s-io/k3s/pkg/daemons/control/deps"
"github.com/k3s-io/k3s/pkg/util"
"github.com/k3s-io/k3s/pkg/util/services"
"github.com/k3s-io/k3s/pkg/version"
"github.com/prometheus/client_golang/prometheus"
certutil "github.com/rancher/dynamiclistener/cert"
"github.com/rancher/wrangler/pkg/merr"
"github.com/sirupsen/logrus"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/component-base/metrics/legacyregistry"
)
var (
// DefaultRegisterer and DefaultGatherer are the implementations of the
// prometheus Registerer and Gatherer interfaces that all metrics operations
// will use. They are variables so that packages that embed this library can
// replace them at runtime, instead of having to pass around specific
// registries.
DefaultRegisterer = legacyregistry.Registerer()
DefaultGatherer = legacyregistry.DefaultGatherer
// certCheckInterval is the period between certificate expiration checks.
// Check certificates twice an hour. Kubernetes events have a TTL of 1 hour by default,
// so similar events should be aggregated and refreshed by the event recorder as long
// as they are created within the TTL period.
certCheckInterval = time.Minute * 30
// controllerName is the name used in this package's log messages and passed
// to the event recorder built in Setup.
controllerName = version.Program + "-cert-monitor"
// certificateExpirationSeconds is a gauge labeled by certificate subject and
// usages. checkCerts sets it to the number of seconds between the time of the
// check and the certificate's NotAfter time, so the value is negative once a
// certificate has expired.
certificateExpirationSeconds = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: version.Program + "_certificate_expiration_seconds",
Help: "Remaining lifetime on the certificate.",
}, []string{"subject", "usages"})
)
// Setup registers the certificate expiration gauge and launches a background
// goroutine that re-checks certificate validity every certCheckInterval until
// ctx is cancelled. Problems found by a check are published as Warning events
// attached to this node.
//
// Agent certificates are always checked. If the "server" directory exists
// under dataDir, all service certificates and the CA certificates are checked
// as well. An error is returned if the Kubernetes client cannot be built or
// the certificate file lists cannot be resolved; the periodic checks
// themselves never cause Setup to fail.
func Setup(ctx context.Context, nodeConfig *daemonconfig.Node, dataDir string) error {
	logrus.Debugf("Starting %s with monitoring period %s", controllerName, certCheckInterval)
	DefaultRegisterer.MustRegister(certificateExpirationSeconds)

	kubeClient, err := util.GetClientSet(nodeConfig.AgentConfig.KubeConfigKubelet)
	if err != nil {
		return err
	}
	events := util.BuildControllerEventRecorder(kubeClient, controllerName, metav1.NamespaceDefault)

	// Reference the node the same way the kubelet does for its own node events
	// (node name doubles as the UID):
	// https://github.com/kubernetes/kubernetes/blob/612130dd2f4188db839ea5c2dea07a96b0ad8d1c/pkg/kubelet/kubelet.go#L479-L485
	nodeName := nodeConfig.AgentConfig.NodeName
	node := &corev1.ObjectReference{
		Kind:      "Node",
		Name:      nodeName,
		UID:       types.UID(nodeName),
		Namespace: "",
	}

	// A throwaway Control config whose only purpose is to carry the paths of
	// the server certificate files.
	serverConfig := daemonconfig.Control{
		DataDir: filepath.Join(dataDir, "server"),
		Runtime: &daemonconfig.ControlRuntime{},
	}
	deps.CreateRuntimeCertFiles(&serverConfig)

	// Default to agent-only checks; widen to every service plus the CAs when
	// the server data directory is present on this node.
	monitored := services.Agent
	caFiles := map[string][]string{}
	if _, statErr := os.Stat(serverConfig.DataDir); statErr == nil {
		monitored = services.All
		if caFiles, err = services.FilesForServices(serverConfig, services.CA); err != nil {
			return err
		}
	}

	nodeFiles, err := services.FilesForServices(serverConfig, monitored)
	if err != nil {
		return err
	}

	// Warning windows: the renewal window for node certificates, one year for CAs.
	nodeWindow := time.Hour * 24 * daemonconfig.CertificateRenewDays
	caWindow := time.Hour * 24 * 365

	runCheck := func() {
		logrus.Debugf("Running %s certificate expiration check", controllerName)

		nodeErr := checkCerts(nodeFiles, nodeWindow)
		if nodeErr != nil {
			message := fmt.Sprintf("Node certificates require attention - restart %s on this node to trigger automatic rotation: %v", version.Program, nodeErr)
			events.Event(node, corev1.EventTypeWarning, "CertificateExpirationWarning", message)
		}

		caErr := checkCerts(caFiles, caWindow)
		if caErr != nil {
			message := fmt.Sprintf("Certificate authority certificates require attention - check %s documentation and begin planning rotation: %v", version.Program, caErr)
			events.Event(node, corev1.EventTypeWarning, "CACertificateExpirationWarning", message)
		}
	}

	go wait.Until(runCheck, certCheckInterval, ctx.Done())
	return nil
}
// checkCerts examines every certificate found in the files listed in fileMap
// (keyed by service name) and updates the certificateExpirationSeconds gauge
// for each one with the seconds remaining until its NotAfter time.
//
// A problem is recorded for each certificate that is not yet valid, has
// already expired, or will expire within warningPeriod from now. The problems
// are combined with merr.NewErrors and returned; callers in this file treat a
// nil result as "nothing to report".
//
// Files that cannot be read or parsed are skipped: the failure is logged at
// debug level and does not contribute to the returned error.
func checkCerts(fileMap map[string][]string, warningPeriod time.Duration) error {
	errs := merr.Errors{}
	now := time.Now()
	warn := now.Add(warningPeriod)
	// Report the window that is actually being checked. Callers pass different
	// periods (e.g. one year for CA certificates), so the message must not
	// assume the node certificate renewal window.
	warningDays := int(warningPeriod / (24 * time.Hour))

	for service, files := range fileMap {
		for _, file := range files {
			basename := filepath.Base(file)
			certs, err := certutil.CertsFromFile(file)
			if err != nil {
				// Best-effort: a listed file that is missing or unparsable is not
				// itself an expiration problem, so only leave a trace for debugging
				// and carry on with whatever certificates were returned.
				logrus.Debugf("Unable to load certificates from %s/%s for expiration check: %v", service, basename, err)
			}
			for _, cert := range certs {
				usages := []string{}
				if cert.KeyUsage&x509.KeyUsageCertSign != 0 {
					usages = append(usages, "CertSign")
				}
				for _, eku := range cert.ExtKeyUsage {
					switch eku {
					case x509.ExtKeyUsageServerAuth:
						usages = append(usages, "ServerAuth")
					case x509.ExtKeyUsageClientAuth:
						usages = append(usages, "ClientAuth")
					}
				}
				certificateExpirationSeconds.WithLabelValues(cert.Subject.String(), strings.Join(usages, ",")).Set(cert.NotAfter.Sub(now).Seconds())

				// At most one problem is reported per certificate, most severe first.
				switch {
				case now.Before(cert.NotBefore):
					errs = append(errs, fmt.Errorf("%s/%s: certificate %s is not valid before %s", service, basename, cert.Subject, cert.NotBefore.Format(time.RFC3339)))
				case now.After(cert.NotAfter):
					errs = append(errs, fmt.Errorf("%s/%s: certificate %s expired at %s", service, basename, cert.Subject, cert.NotAfter.Format(time.RFC3339)))
				case warn.After(cert.NotAfter):
					errs = append(errs, fmt.Errorf("%s/%s: certificate %s will expire within %d days at %s", service, basename, cert.Subject, warningDays, cert.NotAfter.Format(time.RFC3339)))
				}
			}
		}
	}
	return merr.NewErrors(errs...)
}