// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1
package autoconf
import (
"context"
"fmt"
"time"
"github.com/hashicorp/consul/agent/cache"
"github.com/hashicorp/consul/agent/structs"
)
// handleCacheEvent processes a single notification delivered by the cache for
// either the CA roots watch or the agent leaf certificate watch.
//
// The event's CorrelationID selects which watch fired. For a recognized watch
// the event error (if any) is wrapped and returned, the result is checked to be
// of the expected type, and the matching update method is invoked. Events with
// any other CorrelationID are ignored and nil is returned.
func (ac *AutoConfig) handleCacheEvent(u cache.UpdateEvent) error {
	if u.CorrelationID == rootsWatchID {
		ac.logger.Debug("roots watch fired - updating CA certificates")
		if u.Err != nil {
			return fmt.Errorf("root watch returned an error: %w", u.Err)
		}

		if caRoots, isRoots := u.Result.(*structs.IndexedCARoots); isRoots {
			return ac.updateCARoots(caRoots)
		}
		return fmt.Errorf("invalid type for roots watch response: %T", u.Result)
	}

	if u.CorrelationID == leafWatchID {
		ac.logger.Debug("leaf certificate watch fired - updating TLS certificate")
		if u.Err != nil {
			return fmt.Errorf("leaf watch returned an error: %w", u.Err)
		}

		if issued, isCert := u.Result.(*structs.IssuedCert); isCert {
			return ac.updateLeafCert(issued)
		}
		return fmt.Errorf("invalid type for agent leaf cert watch response: %T", u.Result)
	}

	// not one of our watches - nothing to do
	return nil
}
// handleTokenUpdate is used when a notification about the agent token being updated
// is received and various watches need cancelling/restarting to use the new token.
//
// It cancels the current cache watches, swaps in a fresh chan for cache updates and
// then sets the certificate watches up again. An error is returned when the watches
// could not be restarted; in that case ac.cancelWatches is left untouched and still
// refers to the previous cancel function, which has already been invoked.
func (ac *AutoConfig) handleTokenUpdate(ctx context.Context) error {
	ac.logger.Debug("Agent token updated - resetting watches")

	// TODO (autoencrypt) Prepopulate the cache with the new token with
	// the existing cache entry with the old token. The certificate doesn't
	// need to change just because the token has. However there isn't a
	// good way to make that happen and this behavior is benign enough
	// that I am going to push off implementing it.

	// the agent token has been updated so we must update our leaf cert watch.
	// this cancels the current watches before setting up new ones
	ac.cancelWatches()

	// recreate the chan for cache updates. This is a precautionary measure to ensure
	// that we don't accidentally get notified for the new watches being setup before
	// a blocking query in the cache returns and sends data to the old chan. In theory
	// the code in agent/cache/watch.go should prevent this where we specifically check
	// for context cancellation prior to sending the event. However we could cancel
	// it after that check and finish setting up the new watches before getting the old
	// events. Both the go routine scheduler and the OS thread scheduler would have to
	// be acting up for this to happen. Regardless the way to ensure we don't get events
	// for the old watches is to simply replace the chan we are expecting them from.
	//
	// The old chan is intentionally NOT closed. We are the receiving side, and in the
	// very race described above a watch could still attempt to send on the old chan;
	// a send on a closed chan panics. Dropping our reference is sufficient: any late
	// event lands in a chan nobody reads from and is garbage collected with it.
	ac.cacheUpdates = make(chan cache.UpdateEvent, 10)

	// restart watches - this will be done with the correct token
	cancelWatches, err := ac.setupCertificateCacheWatches(ctx)
	if err != nil {
		return fmt.Errorf("failed to restart watches after agent token update: %w", err)
	}
	ac.cancelWatches = cancelWatches

	return nil
}
// handleFallback is invoked once the current TLS certificate has expired and the
// regular renewal mechanisms did not replace it in time. It fetches a fresh
// certificate through whichever bootstrap mechanism is configured and records
// the result so that monitoring continues with the new certificate.
//
// Auto-config is checked first and therefore wins when both auto-config and
// auto-encrypt are enabled. If neither is enabled a logic error is returned.
// The provided ctx is handed to the underlying retrieval call.
func (ac *AutoConfig) handleFallback(ctx context.Context) error {
	ac.logger.Warn("agent's client certificate has expired")

	// auto-config: redo the initial configuration request and record it
	if ac.config.AutoConfig.Enabled {
		initialCfg, err := ac.getInitialConfiguration(ctx)
		if err != nil {
			return fmt.Errorf("error while retrieving new agent certificates via auto-config: %w", err)
		}
		return ac.recordInitialConfiguration(initialCfg)
	}

	// auto-encrypt: request the initial certificates again and install them
	if ac.config.AutoEncryptTLS {
		certs, err := ac.autoEncryptInitialCerts(ctx)
		if err != nil {
			return fmt.Errorf("error while retrieving new agent certificate via auto-encrypt: %w", err)
		}
		return ac.setInitialTLSCertificates(certs)
	}

	return fmt.Errorf("logic error: either auto-encrypt or auto-config must be enabled")
}
// run is the private method spawned by the Start method to execute the main
// monitoring loop. It keeps servicing token updates, cache update events and
// the certificate expiry fallback timer until ctx is done. On the way out it
// cancels the cache watches, stops the timer and token notifications, marks
// the AutoConfig as no longer running and finally closes exit.
func (ac *AutoConfig) run(ctx context.Context, exit chan struct{}) {
	// untilFallback yields the duration to wait before the fallback timer
	// should fire. The fallbackTimer is used to notify AFTER the agent's leaf
	// certificate has expired, when we need to fall back to the less secure
	// RPC endpoint just like if the agent was starting up new.
	//
	// The deadline is the certificate expiry plus the fallback leeway; the
	// agent cache should be handling the expiration and renew the cert
	// before then.
	//
	// Without a certificate a negative duration is returned so the timer
	// fires immediately. That case shouldn't happen because at this point
	// auto_encrypt was just set up successfully.
	untilFallback := func() time.Duration {
		if cert := ac.acConfig.TLSConfigurator.AutoEncryptCert(); cert != nil {
			return time.Until(cert.NotAfter.Add(ac.acConfig.FallbackLeeway))
		}
		return -1
	}
	fallbackTimer := time.NewTimer(untilFallback())

	// cleanup for once we are stopped
	defer func() {
		// cancel the go routines performing the cache watches
		ac.cancelWatches()
		// ensure we don't leak the timers go routine
		fallbackTimer.Stop()
		// stop receiving notifications for token updates
		ac.acConfig.Tokens.StopNotify(ac.tokenUpdates)

		ac.logger.Debug("auto-config has been stopped")

		ac.Lock()
		ac.cancel = nil
		ac.running = false
		// closing exit has to stay the last cleanup task performed under
		// the lock: it is what tells the rest of the world that this go
		// routine has exited.
		close(exit)
		ac.Unlock()
	}()

	for {
		select {
		case <-ctx.Done():
			ac.logger.Debug("stopping auto-config")
			return

		case <-ac.tokenUpdates.Ch:
			ac.logger.Debug("handling a token update event")
			err := ac.handleTokenUpdate(ctx)
			if err != nil {
				ac.logger.Error("error in handling token update event", "error", err)
			}

		case event := <-ac.cacheUpdates:
			ac.logger.Debug("handling a cache update event", "correlation_id", event.CorrelationID)
			err := ac.handleCacheEvent(event)
			if err != nil {
				ac.logger.Error("error in handling cache update event", "error", err)
			}

			// the certificate may have been updated, so re-arm the fallback
			// timer against the (possibly new) expiry
			fallbackTimer.Stop()
			fallbackTimer = time.NewTimer(untilFallback())

		case <-fallbackTimer.C:
			// This is a safety net in case the cert doesn't get renewed
			// in time. The agent would be stuck in that case because the watches
			// never use the AutoEncrypt.Sign endpoint.

			// check auto encrypt client cert expiration
			cert := ac.acConfig.TLSConfigurator.AutoEncryptCert()
			stillValid := cert != nil && !cert.NotAfter.Before(time.Now())
			if !stillValid {
				if err := ac.handleFallback(ctx); err != nil {
					ac.logger.Error("error when handling a certificate expiry event", "error", err)
					// the fallback failed - try again after the retry interval
					fallbackTimer = time.NewTimer(ac.acConfig.FallbackRetry)
					continue
				}
			}

			// Either the fallback succeeded, or the cert was unexpectedly still
			// valid. The latter shouldn't be possible because the timer duration
			// is the certificate expiration time + some leeway (10s default), so
			// whenever the timer fires the certificate should be expired.
			// Regardless, re-arm the timer from the current certificate.
			fallbackTimer = time.NewTimer(untilFallback())
		}
	}
}