You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
consul/agent/auto-config/run.go

200 lines
7.2 KiB

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0
package autoconf
import (
"context"
"fmt"
"time"
"github.com/hashicorp/consul/agent/cache"
"github.com/hashicorp/consul/agent/structs"
)
// handleCacheEvent is used to handle event notifications from the cache for the roots
// or leaf cert watches.
func (ac *AutoConfig) handleCacheEvent(u cache.UpdateEvent) error {
switch u.CorrelationID {
case rootsWatchID:
ac.logger.Debug("roots watch fired - updating CA certificates")
if u.Err != nil {
return fmt.Errorf("root watch returned an error: %w", u.Err)
}
roots, ok := u.Result.(*structs.IndexedCARoots)
if !ok {
return fmt.Errorf("invalid type for roots watch response: %T", u.Result)
}
return ac.updateCARoots(roots)
case leafWatchID:
ac.logger.Debug("leaf certificate watch fired - updating TLS certificate")
if u.Err != nil {
return fmt.Errorf("leaf watch returned an error: %w", u.Err)
}
leaf, ok := u.Result.(*structs.IssuedCert)
if !ok {
return fmt.Errorf("invalid type for agent leaf cert watch response: %T", u.Result)
}
return ac.updateLeafCert(leaf)
}
return nil
}
// handleTokenUpdate is used when a notification about the agent token being updated
// is received and various watches need cancelling/restarting to use the new token.
func (ac *AutoConfig) handleTokenUpdate(ctx context.Context) error {
ac.logger.Debug("Agent token updated - resetting watches")
// TODO (autoencrypt) Prepopulate the cache with the new token with
// the existing cache entry with the old token. The certificate doesn't
// need to change just because the token has. However there isn't a
// good way to make that happen and this behavior is benign enough
// that I am going to push off implementing it.
// the agent token has been updated so we must update our leaf cert watch.
// this cancels the current watches before setting up new ones
ac.cancelWatches()
// recreate the chan for cache updates. This is a precautionary measure to ensure
// that we don't accidentally get notified for the new watches being setup before
// a blocking query in the cache returns and sends data to the old chan. In theory
// the code in agent/cache/watch.go should prevent this where we specifically check
// for context cancellation prior to sending the event. However we could cancel
// it after that check and finish setting up the new watches before getting the old
// events. Both the go routine scheduler and the OS thread scheduler would have to
// be acting up for this to happen. Regardless the way to ensure we don't get events
// for the old watches is to simply replace the chan we are expecting them from.
close(ac.cacheUpdates)
ac.cacheUpdates = make(chan cache.UpdateEvent, 10)
// restart watches - this will be done with the correct token
cancelWatches, err := ac.setupCertificateCacheWatches(ctx)
if err != nil {
return fmt.Errorf("failed to restart watches after agent token update: %w", err)
}
ac.cancelWatches = cancelWatches
return nil
}
// handleFallback is used when the current TLS certificate has expired and the normal
// updating mechanisms have failed to renew it quickly enough. This function will
// use the configured fallback mechanism to retrieve a new cert and start monitoring
// that one.
func (ac *AutoConfig) handleFallback(ctx context.Context) error {
ac.logger.Warn("agent's client certificate has expired")
// Background because the context is mainly useful when the agent is first starting up.
switch {
case ac.config.AutoConfig.Enabled:
resp, err := ac.getInitialConfiguration(ctx)
if err != nil {
return fmt.Errorf("error while retrieving new agent certificates via auto-config: %w", err)
}
return ac.recordInitialConfiguration(resp)
case ac.config.AutoEncryptTLS:
reply, err := ac.autoEncryptInitialCerts(ctx)
if err != nil {
return fmt.Errorf("error while retrieving new agent certificate via auto-encrypt: %w", err)
}
return ac.setInitialTLSCertificates(reply)
default:
return fmt.Errorf("logic error: either auto-encrypt or auto-config must be enabled")
}
}
// run is the private method to be spawn by the Start method for
// executing the main monitoring loop.
func (ac *AutoConfig) run(ctx context.Context, exit chan struct{}) {
// The fallbackTimer is used to notify AFTER the agents
// leaf certificate has expired and where we need
// to fall back to the less secure RPC endpoint just like
// if the agent was starting up new.
//
// Check 10sec (fallback leeway duration) after cert
// expires. The agent cache should be handling the expiration
// and renew it before then.
//
// If there is no cert, use a value which immediately triggers the
// renew, but this case shouldn't happen because at
// this point, auto_encrypt was just being setup
// successfully.
calcFallbackInterval := func() time.Duration {
cert := ac.acConfig.TLSConfigurator.AutoEncryptCert()
if cert == nil {
return -1
}
expiry := cert.NotAfter.Add(ac.acConfig.FallbackLeeway)
return expiry.Sub(time.Now())
}
fallbackTimer := time.NewTimer(calcFallbackInterval())
// cleanup for once we are stopped
defer func() {
// cancel the go routines performing the cache watches
ac.cancelWatches()
// ensure we don't leak the timers go routine
fallbackTimer.Stop()
// stop receiving notifications for token updates
ac.acConfig.Tokens.StopNotify(ac.tokenUpdates)
ac.logger.Debug("auto-config has been stopped")
ac.Lock()
ac.cancel = nil
ac.running = false
// this should be the final cleanup task as its what notifies
// the rest of the world that this go routine has exited.
close(exit)
ac.Unlock()
}()
for {
select {
case <-ctx.Done():
ac.logger.Debug("stopping auto-config")
return
case <-ac.tokenUpdates.Ch:
ac.logger.Debug("handling a token update event")
if err := ac.handleTokenUpdate(ctx); err != nil {
ac.logger.Error("error in handling token update event", "error", err)
}
case u := <-ac.cacheUpdates:
ac.logger.Debug("handling a cache update event", "correlation_id", u.CorrelationID)
if err := ac.handleCacheEvent(u); err != nil {
ac.logger.Error("error in handling cache update event", "error", err)
}
// reset the fallback timer as the certificate may have been updated
fallbackTimer.Stop()
fallbackTimer = time.NewTimer(calcFallbackInterval())
case <-fallbackTimer.C:
// This is a safety net in case the cert doesn't get renewed
// in time. The agent would be stuck in that case because the watches
// never use the AutoEncrypt.Sign endpoint.
// check auto encrypt client cert expiration
cert := ac.acConfig.TLSConfigurator.AutoEncryptCert()
if cert == nil || cert.NotAfter.Before(time.Now()) {
if err := ac.handleFallback(ctx); err != nil {
ac.logger.Error("error when handling a certificate expiry event", "error", err)
fallbackTimer = time.NewTimer(ac.acConfig.FallbackRetry)
} else {
fallbackTimer = time.NewTimer(calcFallbackInterval())
}
} else {
// this shouldn't be possible. We calculate the timer duration to be the certificate
// expiration time + some leeway (10s default). So whenever we get here the certificate
// should be expired. Regardless its probably worth resetting the timer.
fallbackTimer = time.NewTimer(calcFallbackInterval())
}
}
}
}