You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
consul/agent/leafcert/leafcert_test.go

1134 lines
33 KiB

agent: remove agent cache dependency from service mesh leaf certificate management (#17075) * agent: remove agent cache dependency from service mesh leaf certificate management This extracts the leaf cert management from within the agent cache. This code was produced by the following process: 1. All tests in agent/cache, agent/cache-types, agent/auto-config, agent/consul/servercert were run at each stage. - The tests in agent matching .*Leaf were run at each stage. - The tests in agent/leafcert were run at each stage after they existed. 2. The former leaf cert Fetch implementation was extracted into a new package behind a "fake RPC" endpoint to make it look almost like all other cache type internals. 3. The old cache type was shimmed to use the fake RPC endpoint and generally cleaned up. 4. I selectively duplicated all of Get/Notify/NotifyCallback/Prepopulate from the agent/cache.Cache implementation over into the new package. This was renamed as leafcert.Manager. - Code that was irrelevant to the leaf cert type was deleted (inlining blocking=true, refresh=false) 5. Everything that used the leaf cert cache type (including proxycfg stuff) was shifted to use the leafcert.Manager instead. 6. agent/cache-types tests were moved and gently replumbed to execute as-is against a leafcert.Manager. 7. Inspired by some of the locking changes from derek's branch I split the fat lock into N+1 locks. 8. The waiter chan struct{} was eventually replaced with a singleflight.Group around cache updates, which was likely the biggest net structural change. 9. The awkward two layers of logic produced as a byproduct of marrying the agent cache management code with the leaf cert type code was slowly coalesced and flattened to remove confusion. 10. The .*Leaf tests from the agent package were copied and made to work directly against a leafcert.Manager to increase direct coverage.
I have done a best effort attempt to port the previous leaf-cert cache type's tests over in spirit, as well as to take the e2e-ish tests in the agent package with Leaf in the test name and copy those into the agent/leafcert package to get more direct coverage, rather than coverage tangled up in the agent logic. There is no net-new test coverage, just coverage that was pushed around from elsewhere.
1 year ago
package leafcert
import (
"context"
"crypto/tls"
"crypto/x509"
"encoding/pem"
"fmt"
"sync"
"sync/atomic"
"testing"
"time"
"github.com/stretchr/testify/require"
"github.com/hashicorp/consul/acl"
"github.com/hashicorp/consul/agent/cache"
"github.com/hashicorp/consul/agent/connect"
"github.com/hashicorp/consul/agent/consul"
"github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/sdk/testutil"
"github.com/hashicorp/consul/sdk/testutil/retry"
)
// TestManager_changingRoots tests that after an initial signing, new CA roots
// (new ID) wake up a blocked leaf request with a cert signed by the new root,
// and that a follow-up request at the renewed index blocks again.
func TestManager_changingRoots(t *testing.T) {
	if testing.Short() {
		t.Skip("too slow for testing.Short")
	}
	t.Parallel()

	m, signer := testManager(t, nil)
	caRoot := signer.UpdateCA(t, nil)

	// We'll reuse the fetch options and request
	req := &ConnectCALeafRequest{
		Datacenter: "dc1", Service: "web",
		MinQueryIndex: 0, MaxQueryTime: 10 * time.Second,
	}

	// First fetch should return immediately
	getCh := testAsyncGet(t, m, req)
	var idx uint64
	select {
	case <-time.After(100 * time.Millisecond):
		t.Fatal("shouldn't block waiting for fetch")
	case result := <-getCh:
		require.NoError(t, result.Err)
		require.NotNil(t, result.Value)
		requireLeafValidUnderCA(t, result.Value, caRoot)
		require.True(t, result.Index > 0)
		idx = result.Index
	}

	// Second fetch should block with set index
	req.MinQueryIndex = idx
	getCh = testAsyncGet(t, m, req)
	select {
	case result := <-getCh:
		t.Fatalf("should not return: %#v", result)
	case <-time.After(100 * time.Millisecond):
	}

	// Let's send in new roots, which should trigger the sign req. We need to take
	// care to set the new root as active
	caRoot2 := signer.UpdateCA(t, nil)
	select {
	case <-time.After(100 * time.Millisecond):
		t.Fatal("shouldn't block waiting for fetch")
	case result := <-getCh:
		require.NoError(t, result.Err)
		require.NotNil(t, result.Value)
		require.True(t, result.Index > idx)
		requireLeafValidUnderCA(t, result.Value, caRoot2)
		// Remember the renewed cert's index so the next request waits for a
		// change beyond it, rather than beyond the index we already moved past.
		idx = result.Index
	}

	// Third fetch should block: it asks for something newer than the cert we
	// just received and nothing else has changed.
	req.MinQueryIndex = idx
	getCh = testAsyncGet(t, m, req)
	select {
	case result := <-getCh:
		t.Fatalf("should not return: %#v", result)
	case <-time.After(100 * time.Millisecond):
	}
}
// TestManager_changingRootsJitterBetweenCalls tests that if the root change
// jitter is longer than the time left on the timeout, we return normally but
// then still renew the cert on a subsequent call.
func TestManager_changingRootsJitterBetweenCalls(t *testing.T) {
	t.Parallel()

	const TestOverrideCAChangeInitialDelay = 100 * time.Millisecond

	m, signer := testManager(t, func(cfg *Config) {
		// Override the root-change delay so we will timeout first. We can't set it to
		// a crazy high value otherwise we'll have to wait that long in the test to
		// see if it actually happens on subsequent calls. We instead reduce the
		// request timeout to be much shorter than this.
		cfg.TestOverrideCAChangeInitialDelay = TestOverrideCAChangeInitialDelay
	})
	caRoot := signer.UpdateCA(t, nil)

	// We'll reuse the fetch options and request. Timeout must be much shorter
	// than the initial root delay. 35ms means that a few calls have to block
	// for their full timeout before the cert is renewed. We pick a timeout that
	// is not an exact divisor of the 100ms delay above to reduce the chance
	// that timing works out in a way that makes it hard to tell a timeout from
	// an early return due to a cert renewal.
	req := &ConnectCALeafRequest{
		Datacenter: "dc1", Service: "web",
		MinQueryIndex: 0, MaxQueryTime: 35 * time.Millisecond,
	}

	// First fetch should return immediately
	getCh := testAsyncGet(t, m, req)
	var (
		idx    uint64
		issued *structs.IssuedCert
	)
	select {
	case <-time.After(100 * time.Millisecond):
		t.Fatal("shouldn't block waiting for fetch")
	case result := <-getCh:
		require.NoError(t, result.Err)
		require.NotNil(t, result.Value)
		require.True(t, result.Index > 0)
		requireLeafValidUnderCA(t, result.Value, caRoot)
		idx = result.Index
		issued = result.Value
	}

	// Let's send in new roots, which should eventually trigger the sign req. We
	// need to take care to set the new root as active. Note that this is
	// implicitly testing that root updates that happen in between leaf blocking
	// queries are still noticed too. At this point no leaf blocking query is
	// running so the root watch should be stopped. By pushing this update, the
	// next blocking query will _immediately_ see the new root which means it
	// needs to correctly notice that it is not the same one that generated the
	// current cert and start the rotation. This is good, just not obvious that
	// the behavior is actually well tested here when it is.
	caRoot2 := signer.UpdateCA(t, nil)
	earliestRootDelivery := time.Now()

	// Some number of fetches should time out after 35ms each, and after 100ms
	// has elapsed in total we should see the new cert. Since this is all very
	// timing dependent, we don't hard code exact numbers here and instead loop
	// for plenty of time and do as many calls as it takes and just assert on the
	// time taken and that the call either blocks and returns the cached cert, or
	// returns the new one.
	req.MinQueryIndex = idx
	var shouldExpireAfter time.Time
	i := 1
	rootsDelivered := false
	// Keep issuing blocking calls until one of them returns the renewed cert.
	for !rootsDelivered {
		start := time.Now()
		getCh = testAsyncGet(t, m, req)
		select {
		case result := <-getCh:
			require.NoError(t, result.Err)
			timeTaken := time.Since(start)

			// There are two options, either it blocked waiting for the delay after
			// the rotation or it returned the new CA cert before the timeout was
			// done. To be more robust against timing, we take the value as the
			// decider for which case it is, and assert timing matches our expected
			// bounds rather than vice versa.
			if result.Index > idx {
				// Got a new cert
				require.NotEqual(t, issued, result.Value)
				require.NotNil(t, result.Value)
				requireLeafValidUnderCA(t, result.Value, caRoot2)
				// Should not have been delivered before the delay
				require.True(t, time.Since(earliestRootDelivery) > TestOverrideCAChangeInitialDelay)
				// All good. We are done!
				rootsDelivered = true
			} else {
				// Should be the cached cert
				require.Equal(t, issued, result.Value)
				require.Equal(t, idx, result.Index)
				requireLeafValidUnderCA(t, result.Value, caRoot)
				// Sanity check we blocked for the whole timeout
				require.Truef(t, timeTaken > req.MaxQueryTime,
					"should block for at least %s, returned after %s",
					req.MaxQueryTime, timeTaken)
				// Sanity check that the forceExpireAfter state was set correctly.
				// This must assign to the outer variable (not redeclare it) so the
				// deadline check after the select sees the real value.
				shouldExpireAfter = testObserveLeafCert(m, req, func(cd *certData) time.Time {
					return cd.state.forceExpireAfter
				})
				require.True(t, shouldExpireAfter.After(time.Now()))
				require.True(t, shouldExpireAfter.Before(time.Now().Add(TestOverrideCAChangeInitialDelay)))
			}
		case <-time.After(50 * time.Millisecond):
			t.Fatalf("request %d blocked too long", i)
		}
		i++

		// Sanity check that we've not gone way beyond the deadline without a
		// new cert. We give some leeway to make it less brittle. Once the new
		// cert has arrived there is nothing left to wait for.
		if !rootsDelivered {
			require.Falsef(t, time.Now().After(shouldExpireAfter.Add(100*time.Millisecond)),
				"waited extra 100ms and delayed CA rotate renew didn't happen")
		}
	}
}
// testObserveLeafCert fetches the cert data entry for req's key from m and
// returns the result of calling cb on it while that entry's lock is held.
func testObserveLeafCert[T any](m *Manager, req *ConnectCALeafRequest, cb func(*certData) T) T {
	entry := m.getCertData(req.Key())

	entry.lock.Lock()
	defer entry.lock.Unlock()

	observed := cb(entry)
	return observed
}
// TestManager_changingRootsBetweenBlockingCalls tests that if the root changes
// in between blocking calls we still pick it up.
func TestManager_changingRootsBetweenBlockingCalls(t *testing.T) {
	t.Parallel()

	m, signer := testManager(t, nil)
	caRoot := signer.UpdateCA(t, nil)

	// We'll reuse the fetch options and request. Short timeout important since we
	// wait the full timeout before changing roots.
	req := &ConnectCALeafRequest{
		Datacenter: "dc1", Service: "web",
		MinQueryIndex: 0, MaxQueryTime: 35 * time.Millisecond,
	}

	// First fetch should return immediately
	getCh := testAsyncGet(t, m, req)
	var (
		idx    uint64
		issued *structs.IssuedCert
	)
	select {
	case <-time.After(100 * time.Millisecond):
		t.Fatal("shouldn't block waiting for fetch")
	case result := <-getCh:
		require.NoError(t, result.Err)
		require.NotNil(t, result.Value)
		requireLeafValidUnderCA(t, result.Value, caRoot)
		require.True(t, result.Index > 0)
		idx = result.Index
		issued = result.Value
	}

	// Next fetch should block for the full timeout. It has to carry the index
	// we just observed: the request is only "blocking" when it asks for
	// something newer than the cached cert.
	req.MinQueryIndex = idx
	start := time.Now()
	getCh = testAsyncGet(t, m, req)
	select {
	case <-time.After(100 * time.Millisecond):
		t.Fatal("shouldn't block for too long waiting for fetch")
	case result := <-getCh:
		require.NoError(t, result.Err)
		require.Equal(t, issued, result.Value)
		// Still the initial cached result
		require.Equal(t, idx, result.Index)
		// Sanity check that it waited
		require.True(t, time.Since(start) > req.MaxQueryTime)
	}

	// No active requests, simulate root change now
	caRoot2 := signer.UpdateCA(t, nil)
	earliestRootDelivery := time.Now()

	// We should get the new cert immediately on next fetch (since testManager
	// overrides the root change jitter to 1 microsecond, no delay is expected).
	getCh = testAsyncGet(t, m, req)
	select {
	case <-time.After(100 * time.Millisecond):
		t.Fatal("shouldn't block too long waiting for fetch")
	case result := <-getCh:
		require.NoError(t, result.Err)
		require.NotEqual(t, issued, result.Value)
		requireLeafValidUnderCA(t, result.Value, caRoot2)
		require.True(t, result.Index > idx)
		// Sanity check that we didn't wait too long
		require.True(t, time.Since(earliestRootDelivery) < req.MaxQueryTime)
	}
}
// TestManager_CSRRateLimiting drives a leaf request through a scripted series
// of rate-limit errors from the signer and checks what each call returns and
// roughly how long it took relative to the root rotation.
func TestManager_CSRRateLimiting(t *testing.T) {
	if testing.Short() {
		t.Skip("too slow for testing.Short")
	}
	t.Parallel()

	m, signer := testManager(t, func(cfg *Config) {
		// A 100ms jitter window keeps the test quick while leaving enough
		// slack that scheduling hiccups are unlikely to break it.
		cfg.TestOverrideCAChangeInitialDelay = 100 * time.Millisecond
	})
	signer.UpdateCA(t, nil)

	signer.SetSignCallErrors(
		// Call 1: rate limited. The cache is empty here, so a nil Value has to
		// be returned while state is kept for the retry.
		consul.ErrRateLimited, // inc
		// Call 2: succeeds.
		nil,
		// Calls 3 and 4: rate limited again.
		consul.ErrRateLimited, // inc
		consul.ErrRateLimited, // inc
		// Everything after that succeeds.
	)

	// NOTE(review): MinQueryIndex is never raised on this request even though
	// later phases expect to wait out jitter windows - confirm against testGet.
	req := &ConnectCALeafRequest{
		Datacenter:     "dc1",
		Service:        "web",
		EnterpriseMeta: *acl.DefaultEnterpriseMeta(),
	}

	// Phase 1: the rate limit error is surfaced directly; backing off is the
	// caller's job.
	resCh := testAsyncGet(t, m, req)
	select {
	case <-time.After(200 * time.Millisecond):
		t.Fatal("shouldn't block longer than one jitter window for success")
	case got := <-resCh:
		require.Error(t, got.Err)
		require.Equal(t, consul.ErrRateLimited.Error(), got.Err.Error())
	}

	// Phase 2: the retry yields a cert straight away.
	var (
		cachedIdx  uint64
		cachedCert *structs.IssuedCert
	)
	resCh = testAsyncGet(t, m, req)
	select {
	case <-time.After(100 * time.Millisecond):
		t.Fatal("shouldn't block waiting for fetch")
	case got := <-resCh:
		require.NoError(t, got.Err)
		require.NotNil(t, got.Value)
		require.True(t, got.Index > 0)
		cachedIdx = got.Index
		cachedCert = got.Value
	}

	// Rotate the roots (the new root becomes active), which should lead to
	// another signing attempt.
	signer.UpdateCA(t, nil)
	rotatedAt := time.Now()

	// Only the very first sign call has errored so far.
	require.Equal(t, uint64(1), signer.GetSignCallErrorCount())

	// Phase 3: once the jitter has elapsed a CSR is attempted and rate
	// limited; the cached cert comes back without an error because a retry
	// is pending.
	resCh = testAsyncGet(t, m, req)
	select {
	case <-time.After(200 * time.Millisecond):
		t.Fatal("shouldn't block too long waiting for fetch")
	case got := <-resCh:
		// With the test override the jitter is always the configured maximum
		// (100ms), so at least one full window must have passed.
		require.True(t, time.Since(rotatedAt) > 100*time.Millisecond)
		require.Equal(t, uint64(2), signer.GetSignCallErrorCount())
		require.NoError(t, got.Err)
		require.Equal(t, cachedCert, got.Value)
		// Index is unchanged: this is still the originally cached result.
		require.Equal(t, cachedIdx, got.Index)
	}

	// Phase 4: the rotation state carries over to the next call, which waits
	// another window and is rate limited once more.
	resCh = testAsyncGet(t, m, req)
	select {
	case <-time.After(200 * time.Millisecond):
		t.Fatal("shouldn't block too long waiting for fetch")
	case got := <-resCh:
		// At least two jitter windows since the rotation by now.
		require.True(t, time.Since(rotatedAt) > 200*time.Millisecond)
		require.Equal(t, uint64(3), signer.GetSignCallErrorCount())
		require.NoError(t, got.Err)
		require.Equal(t, cachedCert, got.Value)
		// Index is unchanged: this is still the originally cached result.
		require.Equal(t, cachedIdx, got.Index)
	}

	// Phase 5: after two rate-limited attempts spanning separate requests, the
	// next one waits out the remaining backoff and finally gets a new cert.
	resCh = testAsyncGet(t, m, req)
	select {
	case <-time.After(200 * time.Millisecond):
		t.Fatal("shouldn't block too long waiting for fetch")
	case got := <-resCh:
		// At least three jitter windows since the rotation by now.
		require.True(t, time.Since(rotatedAt) > 300*time.Millisecond)
		require.Equal(t, uint64(3), signer.GetSignCallErrorCount())
		require.NoError(t, got.Err)
		require.NotEqual(t, cachedCert, got.Value)
		// The renewed cert must carry a newer index than the cached one.
		require.True(t, got.Index > cachedIdx)
	}
}
// TestManager_watchRootsDedupingMultipleCallers runs multiple concurrent callers
// watching different leaf certs and tries to ensure that the background root
// watch activity behaves correctly.
//
// Client goroutines never call t.Fatal/require themselves (FailNow is only
// valid on the goroutine running the test); they report failures over the
// setup/test done channels and the main goroutine fails the test.
func TestManager_watchRootsDedupingMultipleCallers(t *testing.T) {
	if testing.Short() {
		t.Skip("too slow for testing.Short")
	}
	t.Parallel()

	m, signer := testManager(t, nil)
	_ = signer.UpdateCA(t, nil)

	// n is the number of clients we'll run
	n := 3

	// setup/testDoneCh are used for coordinating clients such that each has
	// initial cert delivered and is blocking before the root changes. It's not a
	// wait group since we want to be able to timeout the main test goroutine if
	// one of the clients gets stuck. Instead it's a buffered chan.
	setupDoneCh := make(chan error, n)
	testDoneCh := make(chan error, n)

	// rootsUpdate is used to coordinate clients so they know when they should
	// expect to see leaf renewed after root change.
	rootsUpdatedCh := make(chan struct{})

	// Create a function that models a single client. It should go through the
	// steps of getting an initial cert and then watching for changes until root
	// updates.
	client := func(i int) {
		// We'll reuse the fetch options and request
		req := &ConnectCALeafRequest{
			Datacenter: "dc1", Service: fmt.Sprintf("web-%d", i),
			MinQueryIndex: 0, MaxQueryTime: 10 * time.Second,
		}

		// First fetch should return immediately
		getCh := testAsyncGet(t, m, req)
		var idx uint64
		select {
		case <-time.After(100 * time.Millisecond):
			setupDoneCh <- fmt.Errorf("shouldn't block waiting for fetch")
			return
		case result := <-getCh:
			if result.Err != nil {
				setupDoneCh <- fmt.Errorf("client %d: initial fetch failed: %w", i, result.Err)
				return
			}
			idx = result.Index
		}

		// Second fetch should block with set index
		req.MinQueryIndex = idx
		getCh = testAsyncGet(t, m, req)
		select {
		case result := <-getCh:
			setupDoneCh <- fmt.Errorf("should not return: %#v", result)
			return
		case <-time.After(100 * time.Millisecond):
		}

		// We're done with setup and the blocking call is still blocking in
		// background.
		setupDoneCh <- nil

		// Wait until all others are also done and roots change in case there are
		// stragglers delaying the root update.
		select {
		case <-rootsUpdatedCh:
		case <-time.After(200 * time.Millisecond):
			testDoneCh <- fmt.Errorf("waited too long for root update")
			return
		}

		// Now we should see root update within a short period
		select {
		case <-time.After(100 * time.Millisecond):
			testDoneCh <- fmt.Errorf("shouldn't block waiting for fetch")
			return
		case result := <-getCh:
			if result.Err != nil {
				testDoneCh <- fmt.Errorf("client %d: fetch after root update failed: %w", i, result.Err)
				return
			}
			if result.Value == nil {
				testDoneCh <- fmt.Errorf("client %d: fetch after root update returned no cert", i)
				return
			}
			if req.MinQueryIndex == result.Value.CreateIndex {
				testDoneCh <- fmt.Errorf("index must be different")
				return
			}
		}
		testDoneCh <- nil
	}

	// Sanity check the roots watcher is not running yet
	assertRootsWatchCounts(t, m, 0, 0)

	for i := 0; i < n; i++ {
		go client(i)
	}

	timeoutCh := time.After(200 * time.Millisecond)
	for i := 0; i < n; i++ {
		select {
		case <-timeoutCh:
			t.Fatal("timed out waiting for clients")
		case err := <-setupDoneCh:
			if err != nil {
				t.Fatal(err)
			}
		}
	}

	// Should be 3 clients running now, so the roots watcher should have started
	// once and not stopped.
	assertRootsWatchCounts(t, m, 1, 0)

	// Now we deliver the root update
	_ = signer.UpdateCA(t, nil)
	// And notify clients
	close(rootsUpdatedCh)

	timeoutCh = time.After(200 * time.Millisecond)
	for i := 0; i < n; i++ {
		select {
		case <-timeoutCh:
			t.Fatalf("timed out waiting for %d of %d clients to renew after root change", n-i, n)
		case err := <-testDoneCh:
			if err != nil {
				t.Fatal(err)
			}
		}
	}

	// All active requests have returned the new cert so the rootsWatcher should
	// have stopped. This is timing dependent though so retry a few times
	retry.RunWith(retry.ThreeTimes(), t, func(r *retry.R) {
		assertRootsWatchCounts(r, m, 1, 1)
	})
}
// assertRootsWatchCounts requires that the manager's root watcher test
// counters equal wantStarts and wantStops. Both counters are read atomically
// before either is asserted.
func assertRootsWatchCounts(t require.TestingT, m *Manager, wantStarts, wantStops int) {
	if realT, isT := t.(*testing.T); isT {
		realT.Helper()
	}

	gotStarts := int(atomic.LoadUint32(&m.rootWatcher.testStartCount))
	gotStops := int(atomic.LoadUint32(&m.rootWatcher.testStopCount))

	require.Equal(t, wantStarts, gotStarts)
	require.Equal(t, wantStops, gotStops)
}
// TestManager_expiringLeaf checks that once an already-expired leaf has been
// issued, the next request re-signs instead of blocking, and that a request at
// the fresh index then blocks.
func TestManager_expiringLeaf(t *testing.T) {
	if testing.Short() {
		t.Skip("too slow for testing.Short")
	}
	t.Parallel()

	m, signer := testManager(t, nil)
	caRoot := signer.UpdateCA(t, nil)

	// Make the first sign call hand back an expired cert so that is what the
	// manager starts out holding.
	signer.SetSignCallErrors(
		ReplyWithExpiredCert,
	)

	// One request object is reused across all phases.
	req := &ConnectCALeafRequest{
		Datacenter: "dc1", Service: "web",
		MinQueryIndex: 0, MaxQueryTime: 10 * time.Second,
	}

	var (
		lastIdx     uint64
		expiredCert *structs.IssuedCert
	)

	// Phase 1: nothing cached yet, so this returns promptly.
	resCh := testAsyncGet(t, m, req)
	select {
	case <-time.After(100 * time.Millisecond):
		t.Fatal("shouldn't block waiting for fetch")
	case got := <-resCh:
		require.NoError(t, got.Err)
		require.NotNil(t, got.Value)
		require.True(t, got.Index > 0)
		lastIdx = got.Index
		expiredCert = got.Value
	}

	// Phase 2: the CA roots are unchanged, but the cert we hold is expired, so
	// this must also return promptly with a different cert.
	resCh = testAsyncGet(t, m, req)
	select {
	case <-time.After(100 * time.Millisecond):
		t.Fatal("shouldn't block waiting for fetch")
	case got := <-resCh:
		require.NoError(t, got.Err)
		require.NotEqual(t, expiredCert, got.Value)
		require.True(t, got.Index > lastIdx)
		requireLeafValidUnderCA(t, got.Value, caRoot)
		lastIdx = got.Index
	}

	// Phase 3: the cert is no longer expiring and the roots did not change, so
	// a request at the latest index has nothing to return yet.
	req.MinQueryIndex = lastIdx
	resCh = testAsyncGet(t, m, req)
	select {
	case got := <-resCh:
		t.Fatalf("should not return: %#v", got)
	case <-time.After(100 * time.Millisecond):
	}
}
// TestManager_DNSSANForService checks that DNS SANs set on the leaf request
// end up as DNSNames in the CSR that is handed to the signer.
func TestManager_DNSSANForService(t *testing.T) {
	t.Parallel()

	m, signer := testManager(t, nil)
	_ = signer.UpdateCA(t, nil)

	req := &ConnectCALeafRequest{
		Datacenter: "dc1",
		Service:    "web",
		DNSSAN:     []string{"test.example.com"},
	}

	_, _, err := m.Get(context.Background(), req)
	require.NoError(t, err)

	caReq := signer.GetCapture(0)
	require.NotNil(t, caReq)

	// pem.Decode returns a nil block when no PEM data is found; fail cleanly
	// instead of panicking on the dereference below.
	pemBlock, _ := pem.Decode([]byte(caReq.CSR))
	require.NotNil(t, pemBlock, "captured CSR is not valid PEM")

	csr, err := x509.ParseCertificateRequest(pemBlock.Bytes)
	require.NoError(t, err)
	// Expected value first, so a failure message labels the sides correctly.
	require.Equal(t, []string{"test.example.com"}, csr.DNSNames)
}
// TestManager_workflow_good walks a single service's leaf cert through an
// initial issue, a repeat read, a blocking query woken by a CA rotation, and a
// non-blocking read after a further rotation.
func TestManager_workflow_good(t *testing.T) {
	if testing.Short() {
		t.Skip("too slow for testing.Short")
	}

	ctx, cancel := context.WithCancel(context.Background())
	t.Cleanup(cancel)

	const TestOverrideCAChangeInitialDelay = 1 * time.Nanosecond

	m, signer := testManager(t, func(cfg *Config) {
		cfg.TestOverrideCAChangeInitialDelay = TestOverrideCAChangeInitialDelay
	})
	ca1 := signer.UpdateCA(t, nil)

	// newReq builds a fresh request for the "test" service; minIndex of zero
	// leaves MinQueryIndex unset.
	newReq := func(minIndex uint64) *ConnectCALeafRequest {
		return &ConnectCALeafRequest{
			Datacenter:     "dc1",
			Service:        "test",
			EnterpriseMeta: *acl.DefaultEnterpriseMeta(),
			MinQueryIndex:  minIndex,
		}
	}

	// Initial issue.
	issued, meta, err := m.Get(ctx, newReq(0))
	require.NoError(t, err)
	require.False(t, meta.Hit)
	require.NotNil(t, issued)

	// The cert must chain to the first CA.
	requireLeafValidUnderCA(t, issued, ca1)

	// The blocking index must be set and agree with the result metadata.
	require.True(t, issued.ModifyIndex > 0)
	require.Equal(t, issued.ModifyIndex, meta.Index)
	index := meta.Index

	// Read it again.
	testutil.RunStep(t, "test you get a cache hit on another read", func(t *testing.T) {
		again, _, err := m.Get(ctx, newReq(0))
		require.NoError(t, err)
		require.NotNil(t, again)
		require.Equal(t, issued, again)
	})

	type reply struct {
		cert *structs.IssuedCert
		meta cache.ResultMeta
		err  error
	}

	// Start a blocking query at the current index in the background.
	replyCh := make(chan *reply, 1)
	go func() {
		cert, certMeta, err := m.Get(ctx, newReq(index))
		replyCh <- &reply{cert, certMeta, err}
	}()

	// Rotate to a new CA.
	ca2 := signer.UpdateCA(t, nil)

	// The background blocking query should come back with a renewed cert.
	testutil.RunStep(t, "test blocking queries update leaf cert", func(t *testing.T) {
		var got *reply
		select {
		case got = <-replyCh:
		case <-time.After(500 * time.Millisecond):
			t.Fatal("blocking query did not wake up during rotation")
		}

		require.NoError(t, got.err)
		require.NotNil(t, got.cert)

		require.NotEqual(t, issued.CertPEM, got.cert.CertPEM)
		require.NotEqual(t, issued.PrivateKeyPEM, got.cert.PrivateKeyPEM)

		// The renewed cert must chain to the second CA.
		requireLeafValidUnderCA(t, got.cert, ca2)

		// Not a cache hit: the data changed in response to this very query.
		require.False(t, got.meta.Hit)
	})

	testutil.RunStep(t, "test non-blocking queries update leaf cert", func(t *testing.T) {
		underCA2, _, err := m.Get(ctx, newReq(0))
		require.NoError(t, err)
		require.NotNil(t, underCA2)

		// Still signed by the second CA at this point.
		requireLeafValidUnderCA(t, underCA2, ca2)

		// Rotate again, then poll with non-blocking queries until the leaf
		// follows.
		ca3 := signer.UpdateCA(t, nil)

		retry.Run(t, func(r *retry.R) {
			renewed, renewedMeta, err := m.Get(ctx, newReq(0))
			require.NoError(r, err)
			require.NotNil(r, renewed)

			requireLeafValidUnderCA(r, renewed, ca3)

			// Must not be reported as a cache hit.
			require.False(r, renewedMeta.Hit)

			require.NotEqual(r, underCA2.CertPEM, renewed.CertPEM)
			require.NotEqual(r, underCA2.PrivateKeyPEM, renewed.PrivateKeyPEM)

			// The renewed cert must chain to the third CA.
			requireLeafValidUnderCA(r, renewed, ca3)
		})
	})
}
// TestManager_workflow_goodNotLocal requests a leaf cert for a service and
// checks caching, blocking, and update behavior.
//
// It started life as the client agent test
// agent.TestAgentConnectCALeafCert_goodNotLocal and was cloned here for extra
// coverage; the "not local" part of the name carries no meaning at this layer
// because there is no notion of the catalog here.
func TestManager_workflow_goodNotLocal(t *testing.T) {
	if testing.Short() {
		t.Skip("too slow for testing.Short")
	}

	ctx, cancel := context.WithCancel(context.Background())
	t.Cleanup(cancel)

	const TestOverrideCAChangeInitialDelay = 1 * time.Nanosecond

	m, signer := testManager(t, func(cfg *Config) {
		cfg.TestOverrideCAChangeInitialDelay = TestOverrideCAChangeInitialDelay
	})
	ca1 := signer.UpdateCA(t, nil)

	// newReq builds a fresh, index-less request for the "test" service.
	newReq := func() *ConnectCALeafRequest {
		return &ConnectCALeafRequest{
			Datacenter:     "dc1",
			Service:        "test",
			EnterpriseMeta: *acl.DefaultEnterpriseMeta(),
		}
	}

	// Initial issue.
	issued, meta, err := m.Get(ctx, newReq())
	require.NoError(t, err)
	require.False(t, meta.Hit)
	require.NotNil(t, issued)

	// The cert must chain to the first CA.
	requireLeafValidUnderCA(t, issued, ca1)

	// The blocking index must be set and agree with the result metadata.
	require.True(t, issued.ModifyIndex > 0)
	require.Equal(t, issued.ModifyIndex, meta.Index)

	// Read it again.
	testutil.RunStep(t, "test you get a cache hit on another read", func(t *testing.T) {
		again, _, err := m.Get(ctx, newReq())
		require.NoError(t, err)
		require.NotNil(t, again)
		require.Equal(t, issued, again)
	})

	// Blocking behavior - see https://github.com/hashicorp/consul/issues/4462
	testutil.RunStep(t, "test blocking issue 4462", func(t *testing.T) {
		// Same service, but now at the current index with a short wait.
		req := newReq()
		req.MinQueryIndex = issued.ModifyIndex
		req.MaxQueryTime = 125 * time.Millisecond

		respCh := make(chan *structs.IssuedCert)
		errCh := make(chan error, 1)
		go func() {
			cert, _, err := m.Get(ctx, req)
			if err != nil {
				errCh <- err
				return
			}
			respCh <- cert
		}()

		select {
		case <-time.After(500 * time.Millisecond):
			require.FailNow(t, "Shouldn't block for this long - not respecting wait parameter in the query")
		case err := <-errCh:
			require.NoError(t, err)
		case <-respCh:
		}
	})

	testutil.RunStep(t, "test that caching is updated in the background", func(t *testing.T) {
		// Rotate to a new CA.
		ca := signer.UpdateCA(t, nil)

		retry.Run(t, func(r *retry.R) {
			// Sign again without any index/wait argument: the leaf is expected
			// to follow the rotation even with no blocking query outstanding.
			renewed, _, err := m.Get(ctx, newReq())
			require.NoError(r, err)

			if issued.CertPEM == renewed.CertPEM {
				r.Fatalf("leaf has not updated")
			}

			// A new leaf should come with a brand new key, not just a new cert.
			if issued.PrivateKeyPEM == renewed.PrivateKeyPEM {
				r.Fatalf("new leaf has same private key as before")
			}

			// The renewed cert must chain to the new CA.
			requireLeafValidUnderCA(r, renewed, ca)

			require.NotEqual(r, issued, renewed)
		})
	})
}
// TestManager_workflow_nonBlockingQuery_after_blockingQuery_shouldNotBlock
// checks that a non-blocking query issued while a blocking query for the same
// leaf is in flight returns the already-issued cert.
//
// see: https://github.com/hashicorp/consul/issues/12048
func TestManager_workflow_nonBlockingQuery_after_blockingQuery_shouldNotBlock(t *testing.T) {
	if testing.Short() {
		t.Skip("too slow for testing.Short")
	}
	t.Parallel()

	ctx, cancel := context.WithCancel(context.Background())
	t.Cleanup(cancel)

	m, signer := testManager(t, nil)
	_ = signer.UpdateCA(t, nil)

	// newReq builds a fresh request for the "test" service; minIndex of zero
	// leaves MinQueryIndex unset.
	newReq := func(minIndex uint64) *ConnectCALeafRequest {
		return &ConnectCALeafRequest{
			Datacenter:     "dc1",
			Service:        "test",
			EnterpriseMeta: *acl.DefaultEnterpriseMeta(),
			MinQueryIndex:  minIndex,
		}
	}

	var (
		firstSerial string
		firstIndex  uint64
		firstCert   *structs.IssuedCert
	)

	testutil.RunStep(t, "do initial non-blocking query", func(t *testing.T) {
		cert, meta, err := m.Get(ctx, newReq(0))
		require.NoError(t, err)

		firstSerial = cert.SerialNumber

		require.False(t, meta.Hit, "for the leaf cert cache type these are always MISS")
		firstIndex = meta.Index
		firstCert = cert
	})

	// Kick off the blocking query in the background; its result is not needed.
	go func() {
		_, _, _ = m.Get(ctx, newReq(firstIndex))
	}()

	// Give the blocking query above a moment to actually be in flight before
	// moving on.
	time.Sleep(50 * time.Millisecond)

	// The initial non-blocking query already populated the leaf cert entry, and
	// entries are not expected to be pruned between these two steps, so this
	// should be a HIT that completes well within the default inner leaf cert
	// blocking query timeout (10m).
	testutil.RunStep(t, "do a non-blocking query that should not block", func(t *testing.T) {
		cert, meta, err := m.Get(ctx, newReq(0))
		require.NoError(t, err)
		require.True(t, meta.Hit)

		// A genuinely cached result keeps the same serial number.
		require.Equal(t, firstSerial, cert.SerialNumber)
		require.Equal(t, firstCert, cert)
	})
}
// requireLeafValidUnderCA requires that issued parses as a leaf cert chain that
// verifies against ca's root certificate, and that issued's private key
// matches its certificate.
func requireLeafValidUnderCA(t require.TestingT, issued *structs.IssuedCert, ca *structs.CARoot) {
	require.NotNil(t, issued)
	require.NotNil(t, ca)

	leaf, intermediates, err := connect.ParseLeafCerts(issued.CertPEM)
	require.NoError(t, err)

	rootPool := x509.NewCertPool()
	require.True(t, rootPool.AppendCertsFromPEM([]byte(ca.RootCert)))

	verifyOpts := x509.VerifyOptions{
		Roots:         rootPool,
		Intermediates: intermediates,
	}
	_, err = leaf.Verify(verifyOpts)
	require.NoError(t, err)

	// tls.X509KeyPair fails unless the private key matches the certificate, so
	// it doubles as the key check.
	certPEM, keyPEM := []byte(issued.CertPEM), []byte(issued.PrivateKeyPEM)
	_, err = tls.X509KeyPair(certPEM, keyPEM)
	require.NoError(t, err)
}
// testManager builds a *Manager wired to a test signer, which acts as both the
// cert signer and (via its RootsReader) the source of CA roots. mut, when
// non-nil, may adjust the Config before the Manager is created. The Manager is
// stopped automatically when the test finishes.
func testManager(t *testing.T, mut func(*Config)) (*Manager, *testSigner) {
	signer := newTestSigner(t, nil, nil)

	deps := Deps{
		Logger:      testutil.Logger(t),
		RootsReader: signer.RootsReader,
		CertSigner:  signer,
	}

	// Shrink the root-change spread so tests don't wait up to 20 seconds to see
	// a root change take effect. Individual tests can override it through mut.
	// It is deliberately non-zero (zero would select the default) while being
	// effectively the same as no delay.
	deps.Config.TestOverrideCAChangeInitialDelay = 1 * time.Microsecond

	if mut != nil {
		mut(&deps.Config)
	}

	manager := NewManager(deps)
	t.Cleanup(manager.Stop)

	return manager, signer
}
// testRootsReader is an in-memory holder of CA roots for tests. Set stores a
// new value and wakes goroutines started by Notify; Get returns the stored
// value.
type testRootsReader struct {
// mu guards index, roots, and watcher.
mu sync.Mutex
// index is sent as Meta.Index in the events produced by Notify.
index uint64
// roots is the value most recently passed to Set (nil until then).
roots *structs.IndexedCARoots
// watcher is closed and replaced on every Set; Notify goroutines wait on it.
watcher chan struct{}
}
// newTestRootsReader creates a testRootsReader with a fresh watcher channel and
// registers a cleanup on t that closes whichever watcher channel is current
// when the test ends.
func newTestRootsReader(t *testing.T) *testRootsReader {
	reader := new(testRootsReader)
	reader.watcher = make(chan struct{})

	t.Cleanup(func() {
		// Grab the current channel under the lock, close it outside the lock.
		reader.mu.Lock()
		current := reader.watcher
		reader.mu.Unlock()

		close(current)
	})

	return reader
}
// Compile-time assertion that *testRootsReader satisfies RootsReader.
var _ RootsReader = (*testRootsReader)(nil)
// Set stores roots as the current value, records its index (1 when roots is
// nil), installs a fresh watcher channel, and then closes the previous watcher
// channel so that goroutines waiting on it wake up.
func (r *testRootsReader) Set(roots *structs.IndexedCARoots) {
	newIndex := uint64(1)
	if roots != nil {
		newIndex = roots.Index
	}

	r.mu.Lock()
	previous := r.watcher
	r.watcher = make(chan struct{})
	r.roots = roots
	r.index = newIndex
	r.mu.Unlock()

	// Closed after releasing the lock; woken goroutines take the lock themselves.
	close(previous)
}
// Get returns the roots most recently stored by Set. The error is always nil.
func (r *testRootsReader) Get() (*structs.IndexedCARoots, error) {
	r.mu.Lock()
	current := r.roots
	r.mu.Unlock()

	return current, nil
}
// Notify starts a goroutine that waits for the watcher channel current at call
// time to be closed (which Set and the test cleanup do) and then sends a single
// cache.UpdateEvent carrying the then-current roots and index on ch.
//
// The goroutine exits without sending if ctx is done first, and the event is
// built under the lock but sent outside it, so a receiver that has gone away
// can neither leak the goroutine nor leave r.mu held. Notify itself always
// returns nil.
func (r *testRootsReader) Notify(ctx context.Context, correlationID string, ch chan<- cache.UpdateEvent) error {
	r.mu.Lock()
	watcher := r.watcher
	r.mu.Unlock()

	go func() {
		select {
		case <-watcher:
		case <-ctx.Done():
			return
		}

		// Snapshot the state under the lock; never block on ch while holding it.
		r.mu.Lock()
		event := cache.UpdateEvent{
			CorrelationID: correlationID,
			Result:        r.roots,
			Meta:          cache.ResultMeta{Index: r.index},
			Err:           nil,
		}
		r.mu.Unlock()

		select {
		case ch <- event:
		case <-ctx.Done():
		}
	}()
	return nil
}
// testGetResult is the value delivered by testAsyncGet: either Err is set, or
// Index and Value hold what the testGet call returned.
type testGetResult struct {
Index uint64
Value *structs.IssuedCert
Err error
}
// testAsyncGet runs m.testGet(req) in a new goroutine and returns a channel on
// which exactly one testGetResult is delivered: Err alone on failure, otherwise
// Index and Value.
//
// This is useful for testing timing and concurrency with testGet calls. The
// channel has room for the single result, so the goroutine can finish even when
// the test has stopped listening (as in the "should block" checks).
func testAsyncGet(t *testing.T, m *Manager, req *ConnectCALeafRequest) <-chan testGetResult {
	ch := make(chan testGetResult, 1)
	go func() {
		index, cert, err := m.testGet(req)
		if err != nil {
			ch <- testGetResult{Err: err}
			return
		}
		ch <- testGetResult{Index: index, Value: cert}
	}()
	return ch
}