Merge pull request #8311 from hashicorp/bugfix/auto-encrypt-token-update

pull/8300/head
Matt Keeler 2020-07-21 13:15:27 -04:00 committed by GitHub
commit 3c09482864
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
15 changed files with 1441 additions and 233 deletions

View File

@ -32,6 +32,7 @@ import (
autoconf "github.com/hashicorp/consul/agent/auto-config"
"github.com/hashicorp/consul/agent/cache"
cachetype "github.com/hashicorp/consul/agent/cache-types"
certmon "github.com/hashicorp/consul/agent/cert-monitor"
"github.com/hashicorp/consul/agent/checks"
"github.com/hashicorp/consul/agent/config"
"github.com/hashicorp/consul/agent/consul"
@ -165,6 +166,8 @@ type notifier interface {
type Agent struct {
autoConf *autoconf.AutoConfig
certMonitor *certmon.CertMonitor
// config is the agent configuration.
config *config.RuntimeConfig
@ -716,17 +719,39 @@ func (a *Agent) Start(ctx context.Context) error {
a.registerCache()
if a.config.AutoEncryptTLS && !a.config.ServerMode {
reply, err := a.setupClientAutoEncrypt(ctx)
reply, err := a.autoEncryptInitialCertificate(ctx)
if err != nil {
return fmt.Errorf("AutoEncrypt failed: %s", err)
}
rootsReq, leafReq, err := a.setupClientAutoEncryptCache(reply)
cmConfig := new(certmon.Config).
WithCache(a.cache).
WithLogger(a.logger.Named(logging.AutoEncrypt)).
WithTLSConfigurator(a.tlsConfigurator).
WithTokens(a.tokens).
WithFallback(a.autoEncryptInitialCertificate).
WithDNSSANs(a.config.AutoEncryptDNSSAN).
WithIPSANs(a.config.AutoEncryptIPSAN).
WithDatacenter(a.config.Datacenter).
WithNodeName(a.config.NodeName)
monitor, err := certmon.New(cmConfig)
if err != nil {
return fmt.Errorf("AutoEncrypt failed: %s", err)
return fmt.Errorf("AutoEncrypt failed to setup certificate monitor: %w", err)
}
if err = a.setupClientAutoEncryptWatching(rootsReq, leafReq); err != nil {
return fmt.Errorf("AutoEncrypt failed: %s", err)
if err := monitor.Update(reply); err != nil {
return fmt.Errorf("AutoEncrypt failed to setup certificate monitor: %w", err)
}
a.certMonitor = monitor
// we don't need to worry about ever calling Stop as we have tied the go routines
// to the agents lifetime by using the StopCh. Also the agent itself doesn't have
// a need of ensuring that the go routine was stopped before performing any action
// so we can ignore the chan in the return.
if _, err := a.certMonitor.Start(&lib.StopChannelContext{StopCh: a.shutdownCh}); err != nil {
return fmt.Errorf("AutoEncrypt failed to start certificate monitor: %w", err)
}
a.logger.Info("automatically upgraded to TLS")
}
@ -829,7 +854,7 @@ func (a *Agent) Start(ctx context.Context) error {
return nil
}
func (a *Agent) setupClientAutoEncrypt(ctx context.Context) (*structs.SignedResponse, error) {
func (a *Agent) autoEncryptInitialCertificate(ctx context.Context) (*structs.SignedResponse, error) {
client := a.delegate.(*consul.Client)
addrs := a.config.StartJoinAddrsLAN
@ -839,165 +864,7 @@ func (a *Agent) setupClientAutoEncrypt(ctx context.Context) (*structs.SignedResp
}
addrs = append(addrs, retryJoinAddrs(disco, retryJoinSerfVariant, "LAN", a.config.RetryJoinLAN, a.logger)...)
reply, priv, err := client.RequestAutoEncryptCerts(ctx, addrs, a.config.ServerPort, a.tokens.AgentToken(), a.config.AutoEncryptDNSSAN, a.config.AutoEncryptIPSAN)
if err != nil {
return nil, err
}
connectCAPems := []string{}
for _, ca := range reply.ConnectCARoots.Roots {
connectCAPems = append(connectCAPems, ca.RootCert)
}
if err := a.tlsConfigurator.UpdateAutoEncrypt(reply.ManualCARoots, connectCAPems, reply.IssuedCert.CertPEM, priv, reply.VerifyServerHostname); err != nil {
return nil, err
}
return reply, nil
}
func (a *Agent) setupClientAutoEncryptCache(reply *structs.SignedResponse) (*structs.DCSpecificRequest, *cachetype.ConnectCALeafRequest, error) {
rootsReq := &structs.DCSpecificRequest{
Datacenter: a.config.Datacenter,
QueryOptions: structs.QueryOptions{Token: a.tokens.AgentToken()},
}
// prepolutate roots cache
rootRes := cache.FetchResult{Value: &reply.ConnectCARoots, Index: reply.ConnectCARoots.QueryMeta.Index}
if err := a.cache.Prepopulate(cachetype.ConnectCARootName, rootRes, a.config.Datacenter, a.tokens.AgentToken(), rootsReq.CacheInfo().Key); err != nil {
return nil, nil, err
}
leafReq := &cachetype.ConnectCALeafRequest{
Datacenter: a.config.Datacenter,
Token: a.tokens.AgentToken(),
Agent: a.config.NodeName,
DNSSAN: a.config.AutoEncryptDNSSAN,
IPSAN: a.config.AutoEncryptIPSAN,
}
// prepolutate leaf cache
certRes := cache.FetchResult{
Value: &reply.IssuedCert,
Index: reply.ConnectCARoots.QueryMeta.Index,
}
for _, ca := range reply.ConnectCARoots.Roots {
if ca.ID == reply.ConnectCARoots.ActiveRootID {
certRes.State = cachetype.ConnectCALeafSuccess(ca.SigningKeyID)
break
}
}
if err := a.cache.Prepopulate(cachetype.ConnectCALeafName, certRes, a.config.Datacenter, a.tokens.AgentToken(), leafReq.Key()); err != nil {
return nil, nil, err
}
return rootsReq, leafReq, nil
}
func (a *Agent) setupClientAutoEncryptWatching(rootsReq *structs.DCSpecificRequest, leafReq *cachetype.ConnectCALeafRequest) error {
// setup watches
ch := make(chan cache.UpdateEvent, 10)
ctx, cancel := context.WithCancel(context.Background())
// Watch for root changes
err := a.cache.Notify(ctx, cachetype.ConnectCARootName, rootsReq, rootsWatchID, ch)
if err != nil {
cancel()
return err
}
// Watch the leaf cert
err = a.cache.Notify(ctx, cachetype.ConnectCALeafName, leafReq, leafWatchID, ch)
if err != nil {
cancel()
return err
}
// Setup actions in case the watches are firing.
go func() {
for {
select {
case <-a.shutdownCh:
cancel()
return
case <-ctx.Done():
return
case u := <-ch:
switch u.CorrelationID {
case rootsWatchID:
roots, ok := u.Result.(*structs.IndexedCARoots)
if !ok {
err := fmt.Errorf("invalid type for roots response: %T", u.Result)
a.logger.Error("watch error for correlation id",
"correlation_id", u.CorrelationID,
"error", err,
)
continue
}
pems := []string{}
for _, root := range roots.Roots {
pems = append(pems, root.RootCert)
}
a.tlsConfigurator.UpdateAutoEncryptCA(pems)
case leafWatchID:
leaf, ok := u.Result.(*structs.IssuedCert)
if !ok {
err := fmt.Errorf("invalid type for leaf response: %T", u.Result)
a.logger.Error("watch error for correlation id",
"correlation_id", u.CorrelationID,
"error", err,
)
continue
}
a.tlsConfigurator.UpdateAutoEncryptCert(leaf.CertPEM, leaf.PrivateKeyPEM)
}
}
}
}()
// Setup safety net in case the auto_encrypt cert doesn't get renewed
// in time. The agent would be stuck in that case because the watches
// never use the AutoEncrypt.Sign endpoint.
go func() {
// Check 10sec after cert expires. The agent cache
// should be handling the expiration and renew before
// it.
// If there is no cert, AutoEncryptCertNotAfter returns
// a value in the past which immediately triggers the
// renew, but this case shouldn't happen because at
// this point, auto_encrypt was just being setup
// successfully.
interval := a.tlsConfigurator.AutoEncryptCertNotAfter().Sub(time.Now().Add(10 * time.Second))
autoLogger := a.logger.Named(logging.AutoEncrypt)
for {
a.logger.Debug("setting up client certificate expiration check on interval", "interval", interval)
select {
case <-a.shutdownCh:
return
case <-time.After(interval):
// check auto encrypt client cert expiration
if a.tlsConfigurator.AutoEncryptCertExpired() {
autoLogger.Debug("client certificate expired.")
// Background because the context is mainly useful when the agent is first starting up.
reply, err := a.setupClientAutoEncrypt(context.Background())
if err != nil {
autoLogger.Error("client certificate expired, failed to renew", "error", err)
// in case of an error, try again in one minute
interval = time.Minute
continue
}
_, _, err = a.setupClientAutoEncryptCache(reply)
if err != nil {
autoLogger.Error("client certificate expired, failed to populate cache", "error", err)
// in case of an error, try again in one minute
interval = time.Minute
continue
}
}
}
}
}()
return nil
return client.RequestAutoEncryptCerts(ctx, addrs, a.config.ServerPort, a.tokens.AgentToken(), a.config.AutoEncryptDNSSAN, a.config.AutoEncryptIPSAN)
}
func (a *Agent) listenAndServeGRPC() error {

View File

@ -5247,6 +5247,8 @@ func TestAgentConnectCALeafCert_good(t *testing.T) {
assert.Equal(fmt.Sprintf("%d", issued.ModifyIndex),
resp.Header().Get("X-Consul-Index"))
index := resp.Header().Get("X-Consul-Index")
// Test caching
{
// Fetch it again
@ -5259,39 +5261,25 @@ func TestAgentConnectCALeafCert_good(t *testing.T) {
require.Equal("HIT", resp.Header().Get("X-Cache"))
}
// Test that caching is updated in the background
// Issue a blocking query to ensure that the cert gets updated appropriately
{
// Set a new CA
ca := connect.TestCAConfigSet(t, a, nil)
retry.Run(t, func(r *retry.R) {
resp := httptest.NewRecorder()
// Try and sign again (note no index/wait arg since cache should update in
// background even if we aren't actively blocking)
obj, err := a.srv.AgentConnectCALeafCert(resp, req)
r.Check(err)
resp := httptest.NewRecorder()
req, _ := http.NewRequest("GET", "/v1/agent/connect/ca/leaf/test?index="+index, nil)
obj, err := a.srv.AgentConnectCALeafCert(resp, req)
require.NoError(err)
issued2 := obj.(*structs.IssuedCert)
require.NotEqual(issued.CertPEM, issued2.CertPEM)
require.NotEqual(issued.PrivateKeyPEM, issued2.PrivateKeyPEM)
issued2 := obj.(*structs.IssuedCert)
if issued.CertPEM == issued2.CertPEM {
r.Fatalf("leaf has not updated")
}
// Verify that the cert is signed by the new CA
requireLeafValidUnderCA(t, issued2, ca)
// Got a new leaf. Sanity check it's a whole new key as well as different
// cert.
if issued.PrivateKeyPEM == issued2.PrivateKeyPEM {
r.Fatalf("new leaf has same private key as before")
}
// Verify that the cert is signed by the new CA
requireLeafValidUnderCA(t, issued2, ca)
// Should be a cache hit! The data should've updated in the cache
// in the background so this should've been fetched directly from
// the cache.
if resp.Header().Get("X-Cache") != "HIT" {
r.Fatalf("should be a cache hit")
}
})
// Should not be a cache hit! The data was updated in response to the blocking
// query being made.
require.Equal("MISS", resp.Header().Get("X-Cache"))
}
}

View File

@ -50,7 +50,7 @@ const caChangeJitterWindow = 30 * time.Second
// ConnectCALeaf supports fetching and generating Connect leaf
// certificates.
type ConnectCALeaf struct {
RegisterOptionsBlockingRefresh
RegisterOptionsBlockingNoRefresh
caIndex uint64 // Current index for CA roots
// rootWatchMu protects access to the rootWatchSubscribers map and

View File

@ -18,7 +18,17 @@ func (r RegisterOptionsBlockingRefresh) RegisterOptions() cache.RegisterOptions
Refresh: true,
SupportsBlocking: true,
RefreshTimer: 0 * time.Second,
RefreshTimeout: 10 * time.Minute,
QueryTimeout: 10 * time.Minute,
}
}
type RegisterOptionsBlockingNoRefresh struct{}
func (r RegisterOptionsBlockingNoRefresh) RegisterOptions() cache.RegisterOptions {
return cache.RegisterOptions{
Refresh: false,
SupportsBlocking: true,
QueryTimeout: 10 * time.Minute,
}
}

14
agent/cache/cache.go vendored
View File

@ -172,7 +172,7 @@ type RegisterOptions struct {
// If this is zero, then data is refreshed immediately when a fetch
// is returned.
//
// Using different values for RefreshTimer and RefreshTimeout, various
// Using different values for RefreshTimer and QueryTimeout, various
// "refresh" mechanisms can be implemented:
//
// * With a high timer duration and a low timeout, a timer-based
@ -184,10 +184,10 @@ type RegisterOptions struct {
//
RefreshTimer time.Duration
// RefreshTimeout is the default value for the maximum query time for a fetch
// QueryTimeout is the default value for the maximum query time for a fetch
// operation. It is set as FetchOptions.Timeout so that cache.Type
// implementations can use it as the MaxQueryTime.
RefreshTimeout time.Duration
QueryTimeout time.Duration
}
// RegisterType registers a cacheable type.
@ -475,7 +475,7 @@ func (c *Cache) fetch(key string, r getOptions, allowNew bool, attempt uint, ign
// keepalives are every 30 seconds so the RPC should fail if the packets are
// being blackholed for more than 30 seconds.
var connectedTimer *time.Timer
if tEntry.Opts.Refresh && entry.Index > 0 && tEntry.Opts.RefreshTimeout > 31*time.Second {
if tEntry.Opts.Refresh && entry.Index > 0 && tEntry.Opts.QueryTimeout > 31*time.Second {
connectedTimer = time.AfterFunc(31*time.Second, func() {
c.entriesLock.Lock()
defer c.entriesLock.Unlock()
@ -491,7 +491,11 @@ func (c *Cache) fetch(key string, r getOptions, allowNew bool, attempt uint, ign
fOpts := FetchOptions{}
if tEntry.Opts.SupportsBlocking {
fOpts.MinIndex = entry.Index
fOpts.Timeout = tEntry.Opts.RefreshTimeout
fOpts.Timeout = tEntry.Opts.QueryTimeout
if fOpts.Timeout == 0 {
fOpts.Timeout = 10 * time.Minute
}
}
if entry.Valid {
fOpts.LastResult = &FetchResult{

View File

@ -464,9 +464,9 @@ func TestCacheGet_periodicRefresh(t *testing.T) {
typ := &MockType{}
typ.On("RegisterOptions").Return(RegisterOptions{
Refresh: true,
RefreshTimer: 100 * time.Millisecond,
RefreshTimeout: 5 * time.Minute,
Refresh: true,
RefreshTimer: 100 * time.Millisecond,
QueryTimeout: 5 * time.Minute,
})
defer typ.AssertExpectations(t)
c := TestCache(t)
@ -504,9 +504,9 @@ func TestCacheGet_periodicRefreshMultiple(t *testing.T) {
typ := &MockType{}
typ.On("RegisterOptions").Return(RegisterOptions{
Refresh: true,
RefreshTimer: 0 * time.Millisecond,
RefreshTimeout: 5 * time.Minute,
Refresh: true,
RefreshTimer: 0 * time.Millisecond,
QueryTimeout: 5 * time.Minute,
})
defer typ.AssertExpectations(t)
c := TestCache(t)
@ -553,9 +553,9 @@ func TestCacheGet_periodicRefreshErrorBackoff(t *testing.T) {
typ := &MockType{}
typ.On("RegisterOptions").Return(RegisterOptions{
Refresh: true,
RefreshTimer: 0,
RefreshTimeout: 5 * time.Minute,
Refresh: true,
RefreshTimer: 0,
QueryTimeout: 5 * time.Minute,
})
defer typ.AssertExpectations(t)
c := TestCache(t)
@ -595,9 +595,9 @@ func TestCacheGet_periodicRefreshBadRPCZeroIndexErrorBackoff(t *testing.T) {
typ := &MockType{}
typ.On("RegisterOptions").Return(RegisterOptions{
Refresh: true,
RefreshTimer: 0,
RefreshTimeout: 5 * time.Minute,
Refresh: true,
RefreshTimer: 0,
QueryTimeout: 5 * time.Minute,
})
defer typ.AssertExpectations(t)
c := TestCache(t)
@ -642,7 +642,7 @@ func TestCacheGet_noIndexSetsOne(t *testing.T) {
SupportsBlocking: true,
Refresh: true,
RefreshTimer: 0,
RefreshTimeout: 5 * time.Minute,
QueryTimeout: 5 * time.Minute,
})
defer typ.AssertExpectations(t)
c := TestCache(t)
@ -702,7 +702,7 @@ func TestCacheGet_fetchTimeout(t *testing.T) {
typ := &MockType{}
timeout := 10 * time.Minute
typ.On("RegisterOptions").Return(RegisterOptions{
RefreshTimeout: timeout,
QueryTimeout: timeout,
SupportsBlocking: true,
})
defer typ.AssertExpectations(t)
@ -954,9 +954,9 @@ func TestCacheGet_refreshAge(t *testing.T) {
typ := &MockType{}
typ.On("RegisterOptions").Return(RegisterOptions{
Refresh: true,
RefreshTimer: 0,
RefreshTimeout: 5 * time.Minute,
Refresh: true,
RefreshTimer: 0,
QueryTimeout: 5 * time.Minute,
})
defer typ.AssertExpectations(t)
c := TestCache(t)

View File

@ -0,0 +1,471 @@
package certmon
import (
"context"
"fmt"
"io/ioutil"
"sync"
"time"
"github.com/hashicorp/consul/agent/cache"
cachetype "github.com/hashicorp/consul/agent/cache-types"
"github.com/hashicorp/consul/agent/connect"
"github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/agent/token"
"github.com/hashicorp/consul/tlsutil"
"github.com/hashicorp/go-hclog"
)
const (
// ID of the roots watch
rootsWatchID = "roots"
// ID of the leaf watch
leafWatchID = "leaf"
)
// Cache is an interface to represent the methods of the
// agent/cache.Cache struct that we care about
type Cache interface {
Notify(ctx context.Context, t string, r cache.Request, correlationID string, ch chan<- cache.UpdateEvent) error
Prepopulate(t string, result cache.FetchResult, dc string, token string, key string) error
}
// CertMonitor will setup the proper watches to ensure that
// the Agent's Connect TLS certificate remains up to date
type CertMonitor struct {
logger hclog.Logger
cache Cache
tlsConfigurator *tlsutil.Configurator
tokens *token.Store
leafReq cachetype.ConnectCALeafRequest
rootsReq structs.DCSpecificRequest
fallback FallbackFunc
fallbackLeeway time.Duration
fallbackRetry time.Duration
l sync.Mutex
running bool
// cancel is used to cancel the entire CertMonitor
// go routine. This is the main field protected
// by the mutex as it being non-nil indicates that
// the go routine has been started and is stoppable.
// note that it doesn't indcate that the go routine
// is currently running.
cancel context.CancelFunc
// cancelWatches is used to cancel the existing
// cache watches. This is mainly only necessary
// when the Agent token changes
cancelWatches context.CancelFunc
// cacheUpdates is the chan used to have the cache
// send us back events
cacheUpdates chan cache.UpdateEvent
// tokenUpdates is the struct used to receive
// events from the token store when the Agent
// token is updated.
tokenUpdates token.Notifier
}
// New creates a new CertMonitor for automatically rotating
// an Agent's Connect Certificate
func New(config *Config) (*CertMonitor, error) {
logger := config.Logger
if logger == nil {
logger = hclog.New(&hclog.LoggerOptions{
Level: 0,
Output: ioutil.Discard,
})
}
if config.FallbackLeeway == 0 {
config.FallbackLeeway = 10 * time.Second
}
if config.FallbackRetry == 0 {
config.FallbackRetry = time.Minute
}
if config.Cache == nil {
return nil, fmt.Errorf("CertMonitor creation requires a Cache")
}
if config.TLSConfigurator == nil {
return nil, fmt.Errorf("CertMonitor creation requires a TLS Configurator")
}
if config.Fallback == nil {
return nil, fmt.Errorf("CertMonitor creation requires specifying a FallbackFunc")
}
if config.Datacenter == "" {
return nil, fmt.Errorf("CertMonitor creation requires specifying the datacenter")
}
if config.NodeName == "" {
return nil, fmt.Errorf("CertMonitor creation requires specifying the agent's node name")
}
if config.Tokens == nil {
return nil, fmt.Errorf("CertMonitor creation requires specifying a token store")
}
return &CertMonitor{
logger: logger,
cache: config.Cache,
tokens: config.Tokens,
tlsConfigurator: config.TLSConfigurator,
fallback: config.Fallback,
fallbackLeeway: config.FallbackLeeway,
fallbackRetry: config.FallbackRetry,
rootsReq: structs.DCSpecificRequest{Datacenter: config.Datacenter},
leafReq: cachetype.ConnectCALeafRequest{
Datacenter: config.Datacenter,
Agent: config.NodeName,
DNSSAN: config.DNSSANs,
IPSAN: config.IPSANs,
},
}, nil
}
// Update is responsible for priming the cache with the certificates
// as well as injecting them into the TLS configurator
func (m *CertMonitor) Update(certs *structs.SignedResponse) error {
if certs == nil {
return nil
}
if err := m.populateCache(certs); err != nil {
return fmt.Errorf("error populating cache with certificates: %w", err)
}
connectCAPems := []string{}
for _, ca := range certs.ConnectCARoots.Roots {
connectCAPems = append(connectCAPems, ca.RootCert)
}
// Note that its expected that the private key be within the IssuedCert in the
// SignedResponse. This isn't how a server would send back the response and requires
// that the recipient of the response who also has access to the private key will
// have filled it in. The Cache definitely does this but auto-encrypt/auto-config
// will need to ensure the original response is setup this way too.
err := m.tlsConfigurator.UpdateAutoEncrypt(
certs.ManualCARoots,
connectCAPems,
certs.IssuedCert.CertPEM,
certs.IssuedCert.PrivateKeyPEM,
certs.VerifyServerHostname)
if err != nil {
return fmt.Errorf("error updating TLS configurator with certificates: %w", err)
}
return nil
}
// populateCache is responsible for inserting the certificates into the cache
func (m *CertMonitor) populateCache(resp *structs.SignedResponse) error {
cert, err := connect.ParseCert(resp.IssuedCert.CertPEM)
if err != nil {
return fmt.Errorf("Failed to parse certificate: %w", err)
}
// prepolutate roots cache
rootRes := cache.FetchResult{Value: &resp.ConnectCARoots, Index: resp.ConnectCARoots.QueryMeta.Index}
// getting the roots doesn't require a token so in order to potentially share the cache with another
if err := m.cache.Prepopulate(cachetype.ConnectCARootName, rootRes, m.rootsReq.Datacenter, "", m.rootsReq.CacheInfo().Key); err != nil {
return err
}
// copy the template and update the token
leafReq := m.leafReq
leafReq.Token = m.tokens.AgentToken()
// prepolutate leaf cache
certRes := cache.FetchResult{
Value: &resp.IssuedCert,
Index: resp.ConnectCARoots.QueryMeta.Index,
State: cachetype.ConnectCALeafSuccess(connect.EncodeSigningKeyID(cert.AuthorityKeyId)),
}
if err := m.cache.Prepopulate(cachetype.ConnectCALeafName, certRes, leafReq.Datacenter, leafReq.Token, leafReq.Key()); err != nil {
return err
}
return nil
}
// Start spawns the go routine to monitor the certificate and ensure it is
// rotated/renewed as necessary. The chan will indicate once the started
// go routine has exited
func (m *CertMonitor) Start(ctx context.Context) (<-chan struct{}, error) {
m.l.Lock()
defer m.l.Unlock()
if m.running || m.cancel != nil {
return nil, fmt.Errorf("the CertMonitor is already running")
}
// create the top level context to control the go
// routine executing the `run` method
ctx, cancel := context.WithCancel(ctx)
// create the channel to get cache update events through
// really we should only ever get 10 updates
m.cacheUpdates = make(chan cache.UpdateEvent, 10)
// setup the cache watches
cancelWatches, err := m.setupCacheWatches(ctx)
if err != nil {
cancel()
return nil, fmt.Errorf("error setting up cache watches: %w", err)
}
// start the token update notifier
m.tokenUpdates = m.tokens.Notify(token.TokenKindAgent)
// store the cancel funcs
m.cancel = cancel
m.cancelWatches = cancelWatches
m.running = true
exit := make(chan struct{})
go m.run(ctx, exit)
return exit, nil
}
// Stop manually stops the go routine spawned by Start and
// returns whether the go routine was still running before
// cancelling.
//
// Note that cancelling the context passed into Start will
// also cause the go routine to stop
func (m *CertMonitor) Stop() bool {
m.l.Lock()
defer m.l.Unlock()
if !m.running {
return false
}
if m.cancel != nil {
m.cancel()
}
return true
}
// IsRunning returns whether the go routine to perform certificate monitoring
// is already running.
func (m *CertMonitor) IsRunning() bool {
m.l.Lock()
defer m.l.Unlock()
return m.running
}
// setupCacheWatches will start both the roots and leaf cert watch with a new child
// context and an up to date ACL token. The watches are started with a new child context
// whose CancelFunc is also returned.
func (m *CertMonitor) setupCacheWatches(ctx context.Context) (context.CancelFunc, error) {
notificationCtx, cancel := context.WithCancel(ctx)
// copy the request
rootsReq := m.rootsReq
err := m.cache.Notify(notificationCtx, cachetype.ConnectCARootName, &rootsReq, rootsWatchID, m.cacheUpdates)
if err != nil {
cancel()
return nil, err
}
// copy the request
leafReq := m.leafReq
leafReq.Token = m.tokens.AgentToken()
err = m.cache.Notify(notificationCtx, cachetype.ConnectCALeafName, &leafReq, leafWatchID, m.cacheUpdates)
if err != nil {
cancel()
return nil, err
}
return cancel, nil
}
// handleCacheEvent is used to handle event notifications from the cache for the roots
// or leaf cert watches.
func (m *CertMonitor) handleCacheEvent(u cache.UpdateEvent) error {
switch u.CorrelationID {
case rootsWatchID:
m.logger.Debug("roots watch fired - updating CA certificates")
if u.Err != nil {
return fmt.Errorf("root watch returned an error: %w", u.Err)
}
roots, ok := u.Result.(*structs.IndexedCARoots)
if !ok {
return fmt.Errorf("invalid type for roots watch response: %T", u.Result)
}
var pems []string
for _, root := range roots.Roots {
pems = append(pems, root.RootCert)
}
if err := m.tlsConfigurator.UpdateAutoEncryptCA(pems); err != nil {
return fmt.Errorf("failed to update Connect CA certificates: %w", err)
}
case leafWatchID:
m.logger.Debug("leaf certificate watch fired - updating TLS certificate")
if u.Err != nil {
return fmt.Errorf("leaf watch returned an error: %w", u.Err)
}
leaf, ok := u.Result.(*structs.IssuedCert)
if !ok {
return fmt.Errorf("invalid type for agent leaf cert watch response: %T", u.Result)
}
if err := m.tlsConfigurator.UpdateAutoEncryptCert(leaf.CertPEM, leaf.PrivateKeyPEM); err != nil {
return fmt.Errorf("failed to update the agent leaf cert: %w", err)
}
}
return nil
}
// handleTokenUpdate is used when a notification about the agent token being updated
// is received and various watches need cancelling/restarting to use the new token.
func (m *CertMonitor) handleTokenUpdate(ctx context.Context) error {
m.logger.Debug("Agent token updated - resetting watches")
// TODO (autoencrypt) Prepopulate the cache with the new token with
// the existing cache entry with the old token. The certificate doesn't
// need to change just because the token has. However there isn't a
// good way to make that happen and this behavior is benign enough
// that I am going to push off implementing it.
// the agent token has been updated so we must update our leaf cert watch.
// this cancels the current watches before setting up new ones
m.cancelWatches()
// recreate the chan for cache updates. This is a precautionary measure to ensure
// that we don't accidentally get notified for the new watches being setup before
// a blocking query in the cache returns and sends data to the old chan. In theory
// the code in agent/cache/watch.go should prevent this where we specifically check
// for context cancellation prior to sending the event. However we could cancel
// it after that check and finish setting up the new watches before getting the old
// events. Both the go routine scheduler and the OS thread scheduler would have to
// be acting up for this to happen. Regardless the way to ensure we don't get events
// for the old watches is to simply replace the chan we are expecting them from.
close(m.cacheUpdates)
m.cacheUpdates = make(chan cache.UpdateEvent, 10)
// restart watches - this will be done with the correct token
cancelWatches, err := m.setupCacheWatches(ctx)
if err != nil {
return fmt.Errorf("failed to restart watches after agent token update: %w", err)
}
m.cancelWatches = cancelWatches
return nil
}
// handleFallback is used when the current TLS certificate has expired and the normal
// updating mechanisms have failed to renew it quickly enough. This function will
// use the configured fallback mechanism to retrieve a new cert and start monitoring
// that one.
func (m *CertMonitor) handleFallback(ctx context.Context) error {
m.logger.Warn("agent's client certificate has expired")
// Background because the context is mainly useful when the agent is first starting up.
reply, err := m.fallback(ctx)
if err != nil {
return fmt.Errorf("error when getting new agent certificate: %w", err)
}
return m.Update(reply)
}
// run is the private method to be spawn by the Start method for
// executing the main monitoring loop.
func (m *CertMonitor) run(ctx context.Context, exit chan struct{}) {
// The fallbackTimer is used to notify AFTER the agents
// leaf certificate has expired and where we need
// to fall back to the less secure RPC endpoint just like
// if the agent was starting up new.
//
// Check 10sec (fallback leeway duration) after cert
// expires. The agent cache should be handling the expiration
// and renew it before then.
//
// If there is no cert, AutoEncryptCertNotAfter returns
// a value in the past which immediately triggers the
// renew, but this case shouldn't happen because at
// this point, auto_encrypt was just being setup
// successfully.
calcFallbackInterval := func() time.Duration {
certExpiry := m.tlsConfigurator.AutoEncryptCertNotAfter()
return certExpiry.Add(m.fallbackLeeway).Sub(time.Now())
}
fallbackTimer := time.NewTimer(calcFallbackInterval())
// cleanup for once we are stopped
defer func() {
// cancel the go routines performing the cache watches
m.cancelWatches()
// ensure we don't leak the timers go routine
fallbackTimer.Stop()
// stop receiving notifications for token updates
m.tokens.StopNotify(m.tokenUpdates)
m.logger.Debug("certificate monitor has been stopped")
m.l.Lock()
m.cancel = nil
m.running = false
m.l.Unlock()
// this should be the final cleanup task as its what notifies
// the rest of the world that this go routine has exited.
close(exit)
}()
for {
select {
case <-ctx.Done():
m.logger.Debug("stopping the certificate monitor")
return
case <-m.tokenUpdates.Ch:
m.logger.Debug("handling a token update event")
if err := m.handleTokenUpdate(ctx); err != nil {
m.logger.Error("error in handling token update event", "error", err)
}
case u := <-m.cacheUpdates:
m.logger.Debug("handling a cache update event", "correlation_id", u.CorrelationID)
if err := m.handleCacheEvent(u); err != nil {
m.logger.Error("error in handling cache update event", "error", err)
}
// reset the fallback timer as the certificate may have been updated
fallbackTimer.Stop()
fallbackTimer = time.NewTimer(calcFallbackInterval())
case <-fallbackTimer.C:
// This is a safety net in case the auto_encrypt cert doesn't get renewed
// in time. The agent would be stuck in that case because the watches
// never use the AutoEncrypt.Sign endpoint.
// check auto encrypt client cert expiration
if m.tlsConfigurator.AutoEncryptCertExpired() {
if err := m.handleFallback(ctx); err != nil {
m.logger.Error("error when handling a certificate expiry event", "error", err)
fallbackTimer = time.NewTimer(m.fallbackRetry)
} else {
fallbackTimer = time.NewTimer(calcFallbackInterval())
}
} else {
// this shouldn't be possible. We calculate the timer duration to be the certificate
// expiration time + some leeway (10s default). So whenever we get here the certificate
// should be expired. Regardless its probably worth resetting the timer.
fallbackTimer = time.NewTimer(calcFallbackInterval())
}
}
}
}

View File

@ -0,0 +1,693 @@
package certmon
import (
"context"
"crypto/tls"
"fmt"
"net"
"sync"
"testing"
"time"
"github.com/hashicorp/consul/agent/cache"
cachetype "github.com/hashicorp/consul/agent/cache-types"
"github.com/hashicorp/consul/agent/connect"
"github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/agent/token"
"github.com/hashicorp/consul/sdk/testutil"
"github.com/hashicorp/consul/sdk/testutil/retry"
"github.com/hashicorp/consul/tlsutil"
"github.com/hashicorp/go-uuid"
"github.com/stretchr/testify/mock"
"github.com/stretchr/testify/require"
)
type mockFallback struct {
mock.Mock
}
func (m *mockFallback) fallback(ctx context.Context) (*structs.SignedResponse, error) {
ret := m.Called()
resp, _ := ret.Get(0).(*structs.SignedResponse)
return resp, ret.Error(1)
}
type mockWatcher struct {
ch chan<- cache.UpdateEvent
done <-chan struct{}
}
type mockCache struct {
mock.Mock
lock sync.Mutex
watchers map[string][]mockWatcher
}
func (m *mockCache) Notify(ctx context.Context, t string, r cache.Request, correlationID string, ch chan<- cache.UpdateEvent) error {
m.lock.Lock()
key := r.CacheInfo().Key
m.watchers[key] = append(m.watchers[key], mockWatcher{ch: ch, done: ctx.Done()})
m.lock.Unlock()
ret := m.Called(t, r, correlationID)
return ret.Error(0)
}
func (m *mockCache) Prepopulate(t string, result cache.FetchResult, dc string, token string, key string) error {
ret := m.Called(t, result, dc, token, key)
return ret.Error(0)
}
func (m *mockCache) sendNotification(ctx context.Context, key string, u cache.UpdateEvent) bool {
m.lock.Lock()
defer m.lock.Unlock()
watchers, ok := m.watchers[key]
if !ok || len(m.watchers) < 1 {
return false
}
var newWatchers []mockWatcher
for _, watcher := range watchers {
select {
case watcher.ch <- u:
newWatchers = append(newWatchers, watcher)
case <-watcher.done:
// do nothing, this watcher will be removed from the list
case <-ctx.Done():
// return doesn't matter here really, the test is being cancelled
return true
}
}
// this removes any already cancelled watches from being sent to
m.watchers[key] = newWatchers
return true
}
func newMockCache(t *testing.T) *mockCache {
mcache := mockCache{watchers: make(map[string][]mockWatcher)}
mcache.Test(t)
return &mcache
}
func waitForChan(timer *time.Timer, ch <-chan struct{}) bool {
select {
case <-timer.C:
return false
case <-ch:
return true
}
}
func waitForChans(timeout time.Duration, chans ...<-chan struct{}) bool {
timer := time.NewTimer(timeout)
defer timer.Stop()
for _, ch := range chans {
if !waitForChan(timer, ch) {
return false
}
}
return true
}
func testTLSConfigurator(t *testing.T) *tlsutil.Configurator {
t.Helper()
logger := testutil.Logger(t)
cfg, err := tlsutil.NewConfigurator(tlsutil.Config{AutoEncryptTLS: true}, logger)
require.NoError(t, err)
return cfg
}
func newLeaf(t *testing.T, ca *structs.CARoot, idx uint64, expiration time.Duration) *structs.IssuedCert {
t.Helper()
pub, priv, err := connect.TestAgentLeaf(t, "node", "foo", ca, expiration)
require.NoError(t, err)
cert, err := connect.ParseCert(pub)
require.NoError(t, err)
spiffeID, err := connect.ParseCertURI(cert.URIs[0])
require.NoError(t, err)
agentID, ok := spiffeID.(*connect.SpiffeIDAgent)
require.True(t, ok, "certificate doesn't have an agent leaf cert URI")
return &structs.IssuedCert{
SerialNumber: cert.SerialNumber.String(),
CertPEM: pub,
PrivateKeyPEM: priv,
ValidAfter: cert.NotBefore,
ValidBefore: cert.NotAfter,
Agent: agentID.Agent,
AgentURI: agentID.URI().String(),
EnterpriseMeta: *structs.DefaultEnterpriseMeta(),
RaftIndex: structs.RaftIndex{
CreateIndex: idx,
ModifyIndex: idx,
},
}
}
type testCertMonitor struct {
monitor *CertMonitor
mcache *mockCache
tls *tlsutil.Configurator
tokens *token.Store
fallback *mockFallback
extraCACerts []string
initialCert *structs.IssuedCert
initialRoots *structs.IndexedCARoots
// these are some variables that the CertMonitor was created with
datacenter string
nodeName string
dns []string
ips []net.IP
verifyServerHostname bool
}
func newTestCertMonitor(t *testing.T) testCertMonitor {
t.Helper()
tlsConfigurator := testTLSConfigurator(t)
tokens := new(token.Store)
id, err := uuid.GenerateUUID()
require.NoError(t, err)
tokens.UpdateAgentToken(id, token.TokenSourceConfig)
ca := connect.TestCA(t, nil)
manualCA := connect.TestCA(t, nil)
// this cert is setup to not expire quickly. this will prevent
// the test from accidentally running the fallback routine
// before we want to force that to happen.
issued := newLeaf(t, ca, 1, 10*time.Minute)
indexedRoots := structs.IndexedCARoots{
ActiveRootID: ca.ID,
TrustDomain: connect.TestClusterID,
Roots: []*structs.CARoot{
ca,
},
QueryMeta: structs.QueryMeta{
Index: 1,
},
}
initialCerts := &structs.SignedResponse{
ConnectCARoots: indexedRoots,
IssuedCert: *issued,
ManualCARoots: []string{manualCA.RootCert},
VerifyServerHostname: true,
}
dnsSANs := []string{"test.dev"}
ipSANs := []net.IP{net.IPv4(198, 18, 0, 1)}
// this chan should be unbuffered so we can detect when the fallback func has been called.
fallback := &mockFallback{}
mcache := newMockCache(t)
rootRes := cache.FetchResult{Value: &indexedRoots, Index: 1}
rootsReq := structs.DCSpecificRequest{Datacenter: "foo"}
mcache.On("Prepopulate", cachetype.ConnectCARootName, rootRes, "foo", "", rootsReq.CacheInfo().Key).Return(nil).Once()
leafReq := cachetype.ConnectCALeafRequest{
Token: tokens.AgentToken(),
Agent: "node",
Datacenter: "foo",
DNSSAN: dnsSANs,
IPSAN: ipSANs,
}
leafRes := cache.FetchResult{
Value: issued,
Index: 1,
State: cachetype.ConnectCALeafSuccess(ca.SigningKeyID),
}
mcache.On("Prepopulate", cachetype.ConnectCALeafName, leafRes, "foo", tokens.AgentToken(), leafReq.Key()).Return(nil).Once()
// we can assert more later but this should always be done.
defer mcache.AssertExpectations(t)
cfg := new(Config).
WithCache(mcache).
WithLogger(testutil.Logger(t)).
WithTLSConfigurator(tlsConfigurator).
WithTokens(tokens).
WithFallback(fallback.fallback).
WithDNSSANs(dnsSANs).
WithIPSANs(ipSANs).
WithDatacenter("foo").
WithNodeName("node").
WithFallbackLeeway(time.Nanosecond).
WithFallbackRetry(time.Millisecond)
monitor, err := New(cfg)
require.NoError(t, err)
require.NotNil(t, monitor)
require.NoError(t, monitor.Update(initialCerts))
return testCertMonitor{
monitor: monitor,
tls: tlsConfigurator,
tokens: tokens,
mcache: mcache,
fallback: fallback,
extraCACerts: []string{manualCA.RootCert},
initialCert: issued,
initialRoots: &indexedRoots,
datacenter: "foo",
nodeName: "node",
dns: dnsSANs,
ips: ipSANs,
verifyServerHostname: true,
}
}
func tlsCertificateFromIssued(t *testing.T, issued *structs.IssuedCert) *tls.Certificate {
t.Helper()
cert, err := tls.X509KeyPair([]byte(issued.CertPEM), []byte(issued.PrivateKeyPEM))
require.NoError(t, err)
return &cert
}
// convenience method to get a TLS Certificate from the intial issued certificate and priv key
func (cm *testCertMonitor) initialTLSCertificate(t *testing.T) *tls.Certificate {
t.Helper()
return tlsCertificateFromIssued(t, cm.initialCert)
}
// just a convenience method to get a list of all the CA pems that we set up regardless
// of manual vs connect.
func (cm *testCertMonitor) initialCACerts() []string {
pems := cm.extraCACerts
for _, root := range cm.initialRoots.Roots {
pems = append(pems, root.RootCert)
}
return pems
}
func (cm *testCertMonitor) assertExpectations(t *testing.T) {
cm.mcache.AssertExpectations(t)
cm.fallback.AssertExpectations(t)
}
func TestCertMonitor_InitialCerts(t *testing.T) {
// this also ensures that the cache was prepopulated properly
cm := newTestCertMonitor(t)
// verify that the certificate was injected into the TLS configurator correctly
require.Equal(t, cm.initialTLSCertificate(t), cm.tls.Cert())
// verify that the CA certs (both Connect and manual ones) were injected correctly
require.ElementsMatch(t, cm.initialCACerts(), cm.tls.CAPems())
// verify that the auto-tls verify server hostname setting was injected correctly
require.Equal(t, cm.verifyServerHostname, cm.tls.VerifyServerHostname())
}
func TestCertMonitor_GoRoutineManagement(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
cm := newTestCertMonitor(t)
// ensure that the monitor is not running
require.False(t, cm.monitor.IsRunning())
// ensure that nothing bad happens and that it reports as stopped
require.False(t, cm.monitor.Stop())
// we will never send notifications so these just ignore everything
cm.mcache.On("Notify", cachetype.ConnectCARootName, &structs.DCSpecificRequest{Datacenter: cm.datacenter}, rootsWatchID).Return(nil).Times(2)
cm.mcache.On("Notify", cachetype.ConnectCALeafName,
&cachetype.ConnectCALeafRequest{
Token: cm.tokens.AgentToken(),
Datacenter: cm.datacenter,
Agent: cm.nodeName,
DNSSAN: cm.dns,
IPSAN: cm.ips,
},
leafWatchID,
).Return(nil).Times(2)
done, err := cm.monitor.Start(ctx)
require.NoError(t, err)
require.True(t, cm.monitor.IsRunning())
_, err = cm.monitor.Start(ctx)
testutil.RequireErrorContains(t, err, "the CertMonitor is already running")
require.True(t, cm.monitor.Stop())
require.True(t, waitForChans(100*time.Millisecond, done), "monitor didn't shut down")
require.False(t, cm.monitor.IsRunning())
done, err = cm.monitor.Start(ctx)
require.NoError(t, err)
// ensure that context cancellation causes us to stop as well
cancel()
require.True(t, waitForChans(100*time.Millisecond, done))
cm.assertExpectations(t)
}
func startedCertMonitor(t *testing.T) (context.Context, testCertMonitor) {
ctx, cancel := context.WithCancel(context.Background())
t.Cleanup(cancel)
cm := newTestCertMonitor(t)
rootsCtx, rootsCancel := context.WithCancel(ctx)
defer rootsCancel()
leafCtx, leafCancel := context.WithCancel(ctx)
defer leafCancel()
// initial roots watch
cm.mcache.On("Notify", cachetype.ConnectCARootName,
&structs.DCSpecificRequest{
Datacenter: cm.datacenter,
},
rootsWatchID).
Return(nil).
Once().
Run(func(_ mock.Arguments) {
rootsCancel()
})
// the initial watch after starting the monitor
cm.mcache.On("Notify", cachetype.ConnectCALeafName,
&cachetype.ConnectCALeafRequest{
Token: cm.tokens.AgentToken(),
Datacenter: cm.datacenter,
Agent: cm.nodeName,
DNSSAN: cm.dns,
IPSAN: cm.ips,
},
leafWatchID).
Return(nil).
Once().
Run(func(_ mock.Arguments) {
leafCancel()
})
done, err := cm.monitor.Start(ctx)
require.NoError(t, err)
// this prevents logs after the test finishes
t.Cleanup(func() {
cm.monitor.Stop()
<-done
})
require.True(t,
waitForChans(100*time.Millisecond, rootsCtx.Done(), leafCtx.Done()),
"not all watches were started within the alotted time")
return ctx, cm
}
// This test ensures that the cache watches are restarted with the updated
// token after receiving a token update
func TestCertMonitor_TokenUpdate(t *testing.T) {
ctx, cm := startedCertMonitor(t)
rootsCtx, rootsCancel := context.WithCancel(ctx)
defer rootsCancel()
leafCtx, leafCancel := context.WithCancel(ctx)
defer leafCancel()
newToken := "8e4fe8db-162d-42d8-81ca-710fb2280ad0"
// we expect a new roots watch because when the leaf cert watch is restarted so is the root cert watch
cm.mcache.On("Notify", cachetype.ConnectCARootName,
&structs.DCSpecificRequest{
Datacenter: cm.datacenter,
},
rootsWatchID).
Return(nil).
Once().
Run(func(_ mock.Arguments) {
rootsCancel()
})
secondWatch := &cachetype.ConnectCALeafRequest{
Token: newToken,
Datacenter: cm.datacenter,
Agent: cm.nodeName,
DNSSAN: cm.dns,
IPSAN: cm.ips,
}
// the new watch after updating the token
cm.mcache.On("Notify", cachetype.ConnectCALeafName, secondWatch, leafWatchID).
Return(nil).
Once().
Run(func(args mock.Arguments) {
leafCancel()
})
cm.tokens.UpdateAgentToken(newToken, token.TokenSourceAPI)
require.True(t,
waitForChans(100*time.Millisecond, rootsCtx.Done(), leafCtx.Done()),
"not all watches were restarted within the alotted time")
cm.assertExpectations(t)
}
func TestCertMonitor_RootsUpdate(t *testing.T) {
ctx, cm := startedCertMonitor(t)
secondCA := connect.TestCA(t, cm.initialRoots.Roots[0])
secondRoots := structs.IndexedCARoots{
ActiveRootID: secondCA.ID,
TrustDomain: connect.TestClusterID,
Roots: []*structs.CARoot{
secondCA,
cm.initialRoots.Roots[0],
},
QueryMeta: structs.QueryMeta{
Index: 99,
},
}
// assert value of the CA certs prior to updating
require.ElementsMatch(t, cm.initialCACerts(), cm.tls.CAPems())
req := structs.DCSpecificRequest{Datacenter: cm.datacenter}
require.True(t, cm.mcache.sendNotification(ctx, req.CacheInfo().Key, cache.UpdateEvent{
CorrelationID: rootsWatchID,
Result: &secondRoots,
Meta: cache.ResultMeta{
Index: secondRoots.Index,
},
}))
expectedCAs := append(cm.extraCACerts, secondCA.RootCert, cm.initialRoots.Roots[0].RootCert)
// this will wait up to 200ms (8 x 25 ms waits between the 9 requests)
retry.RunWith(&retry.Counter{Count: 9, Wait: 25 * time.Millisecond}, t, func(r *retry.R) {
require.ElementsMatch(r, expectedCAs, cm.tls.CAPems())
})
cm.assertExpectations(t)
}
func TestCertMonitor_CertUpdate(t *testing.T) {
ctx, cm := startedCertMonitor(t)
secondCert := newLeaf(t, cm.initialRoots.Roots[0], 100, 10*time.Minute)
// assert value of cert prior to updating the leaf
require.Equal(t, cm.initialTLSCertificate(t), cm.tls.Cert())
key := cm.monitor.leafReq.CacheInfo().Key
// send the new certificate - this notifies only the watchers utilizing
// the new ACL token
require.True(t, cm.mcache.sendNotification(ctx, key, cache.UpdateEvent{
CorrelationID: leafWatchID,
Result: secondCert,
Meta: cache.ResultMeta{
Index: secondCert.ModifyIndex,
},
}))
tlsCert := tlsCertificateFromIssued(t, secondCert)
// this will wait up to 200ms (8 x 25 ms waits between the 9 requests)
retry.RunWith(&retry.Counter{Count: 9, Wait: 25 * time.Millisecond}, t, func(r *retry.R) {
require.Equal(r, tlsCert, cm.tls.Cert())
})
cm.assertExpectations(t)
}
func TestCertMonitor_Fallback(t *testing.T) {
ctx, cm := startedCertMonitor(t)
// at this point everything is operating normally and the monitor is just
// waiting for events. We are going to send a new cert that is basically
// already expired and then allow the fallback routine to kick in.
secondCert := newLeaf(t, cm.initialRoots.Roots[0], 100, time.Nanosecond)
secondCA := connect.TestCA(t, cm.initialRoots.Roots[0])
secondRoots := structs.IndexedCARoots{
ActiveRootID: secondCA.ID,
TrustDomain: connect.TestClusterID,
Roots: []*structs.CARoot{
secondCA,
cm.initialRoots.Roots[0],
},
QueryMeta: structs.QueryMeta{
Index: 101,
},
}
thirdCert := newLeaf(t, secondCA, 102, 10*time.Minute)
// inject a fallback routine error to check that we rerun it quickly
cm.fallback.On("fallback").Return(nil, fmt.Errorf("induced error")).Once()
// expect the fallback routine to be executed and setup the return
cm.fallback.On("fallback").Return(&structs.SignedResponse{
ConnectCARoots: secondRoots,
IssuedCert: *thirdCert,
ManualCARoots: cm.extraCACerts,
VerifyServerHostname: true,
}, nil).Once()
// Add another roots cache prepopulation expectation which should happen
// in response to executing the fallback mechanism
rootRes := cache.FetchResult{Value: &secondRoots, Index: 101}
rootsReq := structs.DCSpecificRequest{Datacenter: cm.datacenter}
cm.mcache.On("Prepopulate", cachetype.ConnectCARootName, rootRes, cm.datacenter, "", rootsReq.CacheInfo().Key).Return(nil).Once()
// add another leaf cert cache prepopulation expectation which should happen
// in response to executing the fallback mechanism
leafReq := cachetype.ConnectCALeafRequest{
Token: cm.tokens.AgentToken(),
Agent: cm.nodeName,
Datacenter: cm.datacenter,
DNSSAN: cm.dns,
IPSAN: cm.ips,
}
leafRes := cache.FetchResult{
Value: thirdCert,
Index: 101,
State: cachetype.ConnectCALeafSuccess(secondCA.SigningKeyID),
}
cm.mcache.On("Prepopulate", cachetype.ConnectCALeafName, leafRes, leafReq.Datacenter, leafReq.Token, leafReq.Key()).Return(nil).Once()
// nothing in the monitor should be looking at this as its only done
// in response to sending token updates, no need to synchronize
key := cm.monitor.leafReq.CacheInfo().Key
// send the new certificate - this notifies only the watchers utilizing
// the new ACL token
require.True(t, cm.mcache.sendNotification(ctx, key, cache.UpdateEvent{
CorrelationID: leafWatchID,
Result: secondCert,
Meta: cache.ResultMeta{
Index: secondCert.ModifyIndex,
},
}))
// if all went well we would have updated the first certificate which was pretty much expired
// causing the fallback handler to be invoked almost immediately. The fallback routine will
// return the response containing the third cert and second CA roots so now we should wait
// a little while and ensure they were applied to the TLS Configurator
tlsCert := tlsCertificateFromIssued(t, thirdCert)
expectedCAs := append(cm.extraCACerts, secondCA.RootCert, cm.initialRoots.Roots[0].RootCert)
// this will wait up to 200ms (8 x 25 ms waits between the 9 requests)
retry.RunWith(&retry.Counter{Count: 9, Wait: 25 * time.Millisecond}, t, func(r *retry.R) {
require.Equal(r, tlsCert, cm.tls.Cert())
require.ElementsMatch(r, expectedCAs, cm.tls.CAPems())
})
cm.assertExpectations(t)
}
func TestCertMonitor_New_Errors(t *testing.T) {
type testCase struct {
cfg Config
err string
}
fallback := func(_ context.Context) (*structs.SignedResponse, error) {
return nil, fmt.Errorf("Unimplemented")
}
tokens := new(token.Store)
cases := map[string]testCase{
"no-cache": {
cfg: Config{
TLSConfigurator: testTLSConfigurator(t),
Fallback: fallback,
Tokens: tokens,
Datacenter: "foo",
NodeName: "bar",
},
err: "CertMonitor creation requires a Cache",
},
"no-tls-configurator": {
cfg: Config{
Cache: cache.New(nil),
Fallback: fallback,
Tokens: tokens,
Datacenter: "foo",
NodeName: "bar",
},
err: "CertMonitor creation requires a TLS Configurator",
},
"no-fallback": {
cfg: Config{
Cache: cache.New(nil),
TLSConfigurator: testTLSConfigurator(t),
Tokens: tokens,
Datacenter: "foo",
NodeName: "bar",
},
err: "CertMonitor creation requires specifying a FallbackFunc",
},
"no-tokens": {
cfg: Config{
Cache: cache.New(nil),
TLSConfigurator: testTLSConfigurator(t),
Fallback: fallback,
Datacenter: "foo",
NodeName: "bar",
},
err: "CertMonitor creation requires specifying a token store",
},
"no-datacenter": {
cfg: Config{
Cache: cache.New(nil),
TLSConfigurator: testTLSConfigurator(t),
Fallback: fallback,
Tokens: tokens,
NodeName: "bar",
},
err: "CertMonitor creation requires specifying the datacenter",
},
"no-node-name": {
cfg: Config{
Cache: cache.New(nil),
TLSConfigurator: testTLSConfigurator(t),
Fallback: fallback,
Tokens: tokens,
Datacenter: "foo",
},
err: "CertMonitor creation requires specifying the agent's node name",
},
}
for name, tcase := range cases {
t.Run(name, func(t *testing.T) {
monitor, err := New(&tcase.cfg)
testutil.RequireErrorContains(t, err, tcase.err)
require.Nil(t, monitor)
})
}
}

View File

@ -0,0 +1,137 @@
package certmon
import (
"context"
"net"
"time"
"github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/agent/token"
"github.com/hashicorp/consul/tlsutil"
"github.com/hashicorp/go-hclog"
)
// FallbackFunc is used when the normal cache watch based Certificate
// updating fails to update the Certificate in time and a different
// method of updating the certificate is required.
type FallbackFunc func(context.Context) (*structs.SignedResponse, error)
type Config struct {
// Logger is the logger to be used while running. If not set
// then no logging will be performed.
Logger hclog.Logger
// TLSConfigurator is where the certificates and roots are set when
// they are updated. This field is required.
TLSConfigurator *tlsutil.Configurator
// Cache is an object implementing our Cache interface. The Cache
// used at runtime must be able to handle Roots and Leaf Cert watches
Cache Cache
// Tokens is the shared token store. It is used to retrieve the current
// agent token as well as getting notifications when that token is updated.
// This field is required.
Tokens *token.Store
// Fallback is a function to run when the normal cache updating of the
// agent's certificates has failed to work for one reason or another.
// This field is required.
Fallback FallbackFunc
// FallbackLeeway is the amount of time after certificate expiration before
// invoking the fallback routine. If not set this will default to 10s.
FallbackLeeway time.Duration
// FallbackRetry is the duration between Fallback invocations when the configured
// fallback routine returns an error. If not set this will default to 1m.
FallbackRetry time.Duration
// DNSSANs is a list of DNS SANs that certificate requests should include. This
// field is optional and no extra DNS SANs will be requested if unset. 'localhost'
// is unconditionally requested by the cache implementation.
DNSSANs []string
// IPSANs is a list of IP SANs to include in the certificate signing request. This
// field is optional and no extra IP SANs will be requested if unset. Both '127.0.0.1'
// and '::1' IP SANs are unconditionally requested by the cache implementation.
IPSANs []net.IP
// Datacenter is the datacenter to request certificates within. This filed is required
Datacenter string
// NodeName is the agent's node name to use when requesting certificates. This field
// is required.
NodeName string
}
// WithCache will cause the created CertMonitor type to use the provided Cache
func (cfg *Config) WithCache(cache Cache) *Config {
cfg.Cache = cache
return cfg
}
// WithLogger will cause the created CertMonitor type to use the provided logger
func (cfg *Config) WithLogger(logger hclog.Logger) *Config {
cfg.Logger = logger
return cfg
}
// WithTLSConfigurator will cause the created CertMonitor type to use the provided configurator
func (cfg *Config) WithTLSConfigurator(tlsConfigurator *tlsutil.Configurator) *Config {
cfg.TLSConfigurator = tlsConfigurator
return cfg
}
// WithTokens will cause the created CertMonitor type to use the provided token store
func (cfg *Config) WithTokens(tokens *token.Store) *Config {
cfg.Tokens = tokens
return cfg
}
// WithFallback configures a fallback function to use if the normal update mechanisms
// fail to renew the certificate in time.
func (cfg *Config) WithFallback(fallback FallbackFunc) *Config {
cfg.Fallback = fallback
return cfg
}
// WithDNSSANs configures the CertMonitor to request these DNS SANs when requesting a new
// certificate
func (cfg *Config) WithDNSSANs(sans []string) *Config {
cfg.DNSSANs = sans
return cfg
}
// WithIPSANs configures the CertMonitor to request these IP SANs when requesting a new
// certificate
func (cfg *Config) WithIPSANs(sans []net.IP) *Config {
cfg.IPSANs = sans
return cfg
}
// WithDatacenter configures the CertMonitor to request Certificates in this DC
func (cfg *Config) WithDatacenter(dc string) *Config {
cfg.Datacenter = dc
return cfg
}
// WithNodeName configures the CertMonitor to request Certificates with this agent name
func (cfg *Config) WithNodeName(name string) *Config {
cfg.NodeName = name
return cfg
}
// WithFallbackLeeway configures how long after a certificate expires before attempting to
// generarte a new certificate using the fallback mechanism. The default is 10s.
func (cfg *Config) WithFallbackLeeway(leeway time.Duration) *Config {
cfg.FallbackLeeway = leeway
return cfg
}
// WithFallbackRetry controls how quickly we will make subsequent invocations of
// the fallback func in the case of it erroring out.
func (cfg *Config) WithFallbackRetry(after time.Duration) *Config {
cfg.FallbackRetry = after
return cfg
}

View File

@ -168,7 +168,21 @@ func TestCAWithKeyType(t testing.T, xc *structs.CARoot, keyType string, keyBits
return testCA(t, xc, keyType, keyBits)
}
func testLeaf(t testing.T, service string, namespace string, root *structs.CARoot, keyType string, keyBits int) (string, string, error) {
// testCertID is an interface to be implemented the various spiffe ID / CertURI types
// It adds an addition CommonName method to the CertURI interface to prevent the need
// for any type switching on the actual CertURI's concrete type in order to figure
// out its common name
type testCertID interface {
CommonName() string
CertURI
}
func testLeafWithID(t testing.T, spiffeId testCertID, root *structs.CARoot, keyType string, keyBits int, expiration time.Duration) (string, string, error) {
if expiration == 0 {
// this is 10 years
expiration = 10 * 365 * 24 * time.Hour
}
// Parse the CA cert and signing key from the root
cert := root.SigningCert
if cert == "" {
@ -183,14 +197,6 @@ func testLeaf(t testing.T, service string, namespace string, root *structs.CARoo
return "", "", fmt.Errorf("error parsing signing key: %s", err)
}
// Build the SPIFFE ID
spiffeId := &SpiffeIDService{
Host: fmt.Sprintf("%s.consul", TestClusterID),
Namespace: namespace,
Datacenter: "dc1",
Service: service,
}
// The serial number for the cert
sn, err := testSerialNumber()
if err != nil {
@ -211,7 +217,7 @@ func testLeaf(t testing.T, service string, namespace string, root *structs.CARoo
// Cert template for generation
template := x509.Certificate{
SerialNumber: sn,
Subject: pkix.Name{CommonName: ServiceCN(service, "default", TestClusterID)},
Subject: pkix.Name{CommonName: spiffeId.CommonName()},
URIs: []*url.URL{spiffeId.URI()},
SignatureAlgorithm: SigAlgoForKeyType(rootKeyType),
BasicConstraintsValid: true,
@ -223,7 +229,7 @@ func testLeaf(t testing.T, service string, namespace string, root *structs.CARoo
x509.ExtKeyUsageClientAuth,
x509.ExtKeyUsageServerAuth,
},
NotAfter: time.Now().AddDate(10, 0, 0),
NotAfter: time.Now().Add(expiration),
NotBefore: time.Now(),
AuthorityKeyId: testKeyID(t, caSigner.Public()),
SubjectKeyId: testKeyID(t, pkSigner.Public()),
@ -244,6 +250,29 @@ func testLeaf(t testing.T, service string, namespace string, root *structs.CARoo
return buf.String(), pkPEM, nil
}
func TestAgentLeaf(t testing.T, node string, datacenter string, root *structs.CARoot, expiration time.Duration) (string, string, error) {
// Build the SPIFFE ID
spiffeId := &SpiffeIDAgent{
Host: fmt.Sprintf("%s.consul", TestClusterID),
Datacenter: datacenter,
Agent: node,
}
return testLeafWithID(t, spiffeId, root, DefaultPrivateKeyType, DefaultPrivateKeyBits, expiration)
}
func testLeaf(t testing.T, service string, namespace string, root *structs.CARoot, keyType string, keyBits int) (string, string, error) {
// Build the SPIFFE ID
spiffeId := &SpiffeIDService{
Host: fmt.Sprintf("%s.consul", TestClusterID),
Namespace: namespace,
Datacenter: "dc1",
Service: service,
}
return testLeafWithID(t, spiffeId, root, keyType, keyBits, 0)
}
// TestLeaf returns a valid leaf certificate and it's private key for the named
// service with the given CA Root.
func TestLeaf(t testing.T, service string, root *structs.CARoot) (string, string) {

View File

@ -27,3 +27,7 @@ func (id *SpiffeIDAgent) URI() *url.URL {
func (id *SpiffeIDAgent) Authorize(_ *structs.Intention) (bool, bool) {
return false, false
}
func (id *SpiffeIDAgent) CommonName() string {
return AgentCN(id.Agent, id.Host)
}

View File

@ -40,3 +40,7 @@ func (id *SpiffeIDService) Authorize(ixn *structs.Intention) (bool, bool) {
// Match, return allow value
return ixn.Action == structs.IntentionActionAllow, true
}
func (id *SpiffeIDService) CommonName() string {
return ServiceCN(id.Service, id.Namespace, id.Host)
}

View File

@ -65,9 +65,9 @@ func (c *Client) autoEncryptCSR(extraDNSSANs []string, extraIPSANs []net.IP) (st
return pkPEM, csr, nil
}
func (c *Client) RequestAutoEncryptCerts(ctx context.Context, servers []string, port int, token string, extraDNSSANs []string, extraIPSANs []net.IP) (*structs.SignedResponse, string, error) {
errFn := func(err error) (*structs.SignedResponse, string, error) {
return nil, "", err
func (c *Client) RequestAutoEncryptCerts(ctx context.Context, servers []string, port int, token string, extraDNSSANs []string, extraIPSANs []net.IP) (*structs.SignedResponse, error) {
errFn := func(err error) (*structs.SignedResponse, error) {
return nil, err
}
// Check if we know about a server already through gossip. Depending on
@ -120,7 +120,8 @@ func (c *Client) RequestAutoEncryptCerts(ctx context.Context, servers []string,
addr := net.TCPAddr{IP: ip, Port: port}
if err = c.connPool.RPC(c.config.Datacenter, c.config.NodeName, &addr, "AutoEncrypt.Sign", &args, &reply); err == nil {
return &reply, pkPEM, nil
reply.IssuedCert.PrivateKeyPEM = pkPEM
return &reply, nil
} else {
c.logger.Warn("AutoEncrypt failed", "error", err)
}

View File

@ -104,7 +104,7 @@ func TestAutoEncrypt_RequestAutoEncryptCerts(t *testing.T) {
doneCh := make(chan struct{})
var err error
go func() {
_, _, err = c1.RequestAutoEncryptCerts(ctx, servers, port, token, nil, nil)
_, err = c1.RequestAutoEncryptCerts(ctx, servers, port, token, nil, nil)
close(doneCh)
}()
select {

View File

@ -1995,7 +1995,7 @@ func (ct *ControllableCacheType) RegisterOptions() cache.RegisterOptions {
return cache.RegisterOptions{
Refresh: ct.blocking,
SupportsBlocking: ct.blocking,
RefreshTimeout: 10 * time.Minute,
QueryTimeout: 10 * time.Minute,
}
}