consul/agent/leafcert/cert.go

package leafcert

import (
	"sync"
	"time"

	"golang.org/x/time/rate"

	"github.com/hashicorp/consul/agent/structs"
	"github.com/hashicorp/consul/lib/ttlcache"
)

// certData tracks all of the metadata about a leaf cert.
type certData struct {
	// lock locks access to all fields
	lock sync.Mutex

	// index is the last raft index associated with an update of the 'value' field
	index uint64

	// value is the last updated cert contents or nil if not populated initially
	value *structs.IssuedCert

	// state is metadata related to cert generation
	state fetchState

	// fetchedAt was the time when 'value' was last updated
	fetchedAt time.Time

	// refreshing indicates if there is an active request attempting to refresh
	// the current leaf cert contents.
	refreshing bool

	// lastFetchErr is the last error encountered when attempting to populate
	// the 'value' field.
	lastFetchErr error

	// expiry contains information about the expiration of this
	// cert. This is a pointer as its shared as a value in the
	// ExpiryHeap as well.
	expiry *ttlcache.Entry

	// refreshRateLimiter limits the rate at which the cert can be regenerated
	refreshRateLimiter *rate.Limiter
}

func (c *certData) MarkRefreshing(v bool) {
	c.lock.Lock()
	defer c.lock.Unlock()
	c.refreshing = v
}

func (c *certData) GetValueAndState() (*structs.IssuedCert, fetchState) {
	c.lock.Lock()
	defer c.lock.Unlock()
	return c.value, c.state
}

func (c *certData) GetError() error {
	c.lock.Lock()
	defer c.lock.Unlock()
	return c.lastFetchErr
}

// NOTE: this function only has one goroutine in it per key at all times
func (c *certData) Update(
	newCert *structs.IssuedCert,
	newState fetchState,
	err error,
) {
	c.lock.Lock()
	defer c.lock.Unlock()

	// Importantly, always reset the Error. Having both Error and a Value that
	// are non-nil is allowed in the cache entry but it indicates that the Error
	// is _newer_ than the last good value. So if the err is nil then we need to
	// reset to replace any _older_ errors and avoid them bubbling up. If the
	// error is non-nil then we need to set it anyway and used to do it in the
	// code below. See https://github.com/hashicorp/consul/issues/4480.
	c.lastFetchErr = err

	c.state = newState
	if newCert != nil {
		c.index = newCert.ModifyIndex
		c.value = newCert
		c.fetchedAt = time.Now()
	}

	if c.index < 1 {
		// Less than one is invalid unless there was an error and in this case
		// there wasn't since a value was returned. If a badly behaved RPC
		// returns 0 when it has no data, we might get into a busy loop here. We
		// set this to minimum of 1 which is safe because no valid user data can
		// ever be written at raft index 1 due to the bootstrap process for
		// raft. This insure that any subsequent background refresh request will
		// always block, but allows the initial request to return immediately
		// even if there is no data.
		c.index = 1
	}
}

// fetchState is some additional metadata we store with each cert in the cache
// to track things like expiry and coordinate paces root rotations. It's
// important this doesn't contain any pointer types since we rely on the struct
// being copied to avoid modifying the actual state in the cache entry during
// Fetch. Pointers themselves are OK, but if we point to another struct that we
// call a method or modify in some way that would directly mutate the cache and
// cause problems. We'd need to deep-clone in that case in Fetch below.
// time.Time technically contains a pointer to the Location but we ignore that
// since all times we get from our wall clock should point to the same Location
// anyway.
type fetchState struct {
	// authorityKeyId is the ID of the CA key (whether root or intermediate) that signed
	// the current cert.  This is just to save parsing the whole cert everytime
	// we have to check if the root changed.
	authorityKeyID string

	// forceExpireAfter is used to coordinate renewing certs after a CA rotation
	// in a staggered way so that we don't overwhelm the servers.
	forceExpireAfter time.Time

	// activeRootRotationStart is set when the root has changed and we need to get
	// a new cert but haven't got one yet. forceExpireAfter will be set to the
	// next scheduled time we should try our CSR, but this is needed to calculate
	// the retry windows if we are rate limited when we try. See comment on
	// const caChangeJitterWindow above for more.
	activeRootRotationStart time.Time

	// consecutiveRateLimitErrs stores how many rate limit errors we've hit. We
	// use this to choose a new window for the next retry. See comment on
	// const caChangeJitterWindow above for more.
	consecutiveRateLimitErrs int
}