consul/test-integ/peering_commontopo/ac6_failovers_test.go

467 lines
12 KiB
Go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1
package peering
import (
"fmt"
"testing"
"github.com/hashicorp/consul/testing/deployer/topology"
"github.com/stretchr/testify/require"
"github.com/hashicorp/consul/api"
"github.com/hashicorp/consul/test/integration/consul-container/libs/utils"
)
// ac6FailoversSuite describes one failover scenario: a client whose upstream
// is a "near" server, with a "far" server configured as the failover target.
//
// With every input left false, the scenario is:
//   - a "near" server in the accepter cluster (DC1), partition default, namespace default
//   - a "far" server in the dialer cluster (DC2), partition default, namespace default
//   - a client in the accepter cluster (DC1), partition default, namespace default, with:
//   - upstream near server (DC1)
//   - failover to far server (DC2)
//
// TODO: technically if NearInDial && !FarInAcc (i.e., near == far), then we're not doing peering at all,
// and could do this test in a single DC
type ac6FailoversSuite struct {
	// ---- inputs ----

	// NearInDial, when true, puts the client (and its default upstream server)
	// in the dialer peer; otherwise they are placed in the accepter.
	NearInDial bool
	// NearInPartAlt / NearInNSAlt, when true, put the client (and its default
	// upstream server) in the nondefault partition / namespace; otherwise in
	// the default one.
	NearInPartAlt, NearInNSAlt bool
	// FarInAcc, when true, puts the far server in the accepter peer; otherwise
	// it is placed in the dialer.
	FarInAcc bool
	// FarInPartAlt / FarInNSAlt, when true, put the far server in the
	// nondefault partition / namespace (ENT-only); otherwise failover goes to
	// the default one.
	FarInPartAlt, FarInNSAlt bool

	// ---- launch outputs, filled in by setup and queried during test ----

	// clientSID identifies the client service.
	clientSID topology.ServiceID
	// near = same DC as client; far = other DC
	nearServerSID topology.ServiceID
	// nearServerNode is the node that gets removed to trigger failover.
	nearServerNode topology.NodeID
	farServerSID   topology.ServiceID
	farServerNode  topology.NodeID
}
// TestAC6Failovers runs the failover scenario for every combination of
// NearInDial, NearInPartAlt and FarInPartAlt, with FarInAcc derived so that
// the near and far servers always land in different DCs. Each combination is
// a parallel subtest with its own topology.
//
// Note: this test cannot share topo
func TestAC6Failovers(t *testing.T) {
	// One bit of the case index per permuted input, so there are 2**nParams cases.
	const (
		nParams = 3
		n       = int(1) << nParams
	)

	for i := 0; i < n; i++ {
		// Declared inside the loop body so each parallel subtest closes over
		// its own copy.
		var s ac6FailoversSuite
		s.NearInDial = i&(1<<0) != 0
		s.NearInPartAlt = i&(1<<1) != 0
		s.FarInPartAlt = i&(1<<2) != 0
		// ensure the servers are always in separate DCs
		s.FarInAcc = s.NearInDial

		caseName := fmt.Sprintf("%02d_%s", i, s.testName())
		t.Run(caseName, func(t *testing.T) {
			t.Parallel()
			ct := NewCommonTopo(t)
			s.setup(t, ct)
			ct.Launch(t)
			s.test(t, ct)
		})
	}
}
// TestNET5029Failovers runs a fixed table of failover scenarios, one parallel
// subtest per table entry, each with its own topology. Case "1.b" is
// currently skipped.
func TestNET5029Failovers(t *testing.T) {
	// TODO: *.{a,b} are not actually peering tests, and should technically be moved elsewhere
	suites := map[string]ac6FailoversSuite{
		"1.a": {FarInAcc: true, FarInPartAlt: true},
		"1.b": {FarInAcc: true, FarInNSAlt: true},
		"1.c": {FarInNSAlt: true},
		"1.d": {FarInPartAlt: true},
		"2.a": {FarInAcc: true, NearInPartAlt: true},
		"2.b": {FarInAcc: true, NearInNSAlt: true},
		"2.c": {NearInDial: true, NearInNSAlt: true, FarInAcc: true},
		"2.d": {NearInDial: true, NearInPartAlt: true, FarInAcc: true},
	}

	for key, suite := range suites {
		// Per-iteration copies: the subtest body continues running after
		// t.Parallel() returns control to this loop.
		name, s := key, suite
		t.Run(fmt.Sprintf("%s_%s", name, s.testName()), func(t *testing.T) {
			if name == "1.b" {
				t.Skip("TODO: fails with 503/504")
			}
			t.Parallel()
			ct := NewCommonTopo(t)
			s.setup(t, ct)
			ct.Launch(t)
			s.test(t, ct)
		})
	}
}
// TestAC6Failovers_AllPermutations enumerates every combination of the six
// scenario inputs (2**6 cases). It is skipped unconditionally; the loop below
// is kept so the full matrix can be enabled by removing the Skip.
func TestAC6Failovers_AllPermutations(t *testing.T) {
	t.Skip("Too many permutations")

	// One bit of the case index per input, so there are 2**nParams cases.
	const (
		nParams = 6
		n       = int(1) << nParams
	)

	for i := 0; i < n; i++ {
		// Declared inside the loop body so each parallel subtest closes over
		// its own copy.
		var s ac6FailoversSuite
		s.NearInDial = i&(1<<0) != 0
		s.FarInAcc = i&(1<<1) != 0
		s.NearInPartAlt = i&(1<<2) != 0
		s.FarInPartAlt = i&(1<<3) != 0
		s.NearInNSAlt = i&(1<<4) != 0
		s.FarInNSAlt = i&(1<<5) != 0

		caseName := fmt.Sprintf("%02d_%s", i, s.testName())
		t.Run(caseName, func(t *testing.T) {
			t.Parallel()
			ct := NewCommonTopo(t)
			s.setup(t, ct)
			ct.Launch(t)
			s.test(t, ct)
		})
	}
}
// testName renders the scenario inputs as
// "<peer>.<partition>.<namespace>-><peer>.<partition>.<namespace>", with the
// near side on the left of the arrow and the far side on the right. The peer
// is "acc" or "dial"; partition and namespace are each "default" or "alt".
func (s *ac6FailoversSuite) testName() (ret string) {
	// pick returns yes when cond holds and no otherwise.
	pick := func(cond bool, yes, no string) string {
		if cond {
			return yes
		}
		return no
	}

	near := pick(s.NearInDial, "dial", "acc") +
		"." + pick(s.NearInPartAlt, "alt", "default") +
		"." + pick(s.NearInNSAlt, "alt", "default")

	// Note the flag polarity is reversed for the far side: true means accepter.
	far := pick(s.FarInAcc, "acc", "dial") +
		"." + pick(s.FarInPartAlt, "alt", "default") +
		"." + pick(s.FarInNSAlt, "alt", "default")

	return near + "->" + far
}
// setup adds the services and config entries for this scenario to ct and
// records the resulting service and node IDs on s for use by test.
//
// It registers:
//   - the near server and the client on nearClu, and the far server on farClu
//   - an export of the far server to the near side (to a peer when the
//     clusters differ, to a partition when only the partitions differ)
//   - service-defaults with protocol "http" for both servers and the client
//   - a service-resolver on the near server that fails over to the far server
//   - intentions allowing the client to reach the near and far servers
//
// The test is skipped on non-Enterprise builds when either side uses the
// nondefault partition.
func (s *ac6FailoversSuite) setup(t *testing.T, ct *commonTopo) {
	if !utils.IsEnterprise() && (s.NearInPartAlt || s.FarInPartAlt) {
		t.Skip("ENT required for nondefault partitions")
	}
	// NOTE(review): the suite's field comments describe nondefault namespaces
	// as ENT-only too, but NearInNSAlt/FarInNSAlt are not gated here — confirm
	// whether that is intentional.

	nearClu := ct.DC1
	farClu := ct.DC2
	if s.NearInDial {
		nearClu = ct.DC2
	}
	if s.FarInAcc {
		farClu = ct.DC1
	}

	// - server in clientPartition/DC (main target)
	//
	// The service ID carries the literal partition name, exactly like the far
	// server below. ConfigEntryPartition is applied only at the points where
	// the ID is written into a config entry, and the comparisons against
	// "default" and against the far partition further down depend on the
	// un-converted value.
	nearServerSID := topology.ServiceID{
		Name:      "ac6-server",
		Partition: "default",
		Namespace: "default",
	}
	if s.NearInPartAlt {
		nearServerSID.Partition = "part1"
	}
	if s.NearInNSAlt {
		nearServerSID.Namespace = "ns1"
	}
	nearServer := NewFortioServiceWithDefaults(
		nearClu.Datacenter,
		nearServerSID,
		nil,
	)
	nearServerNode := ct.AddServiceNode(nearClu, serviceExt{Service: nearServer})

	nearClu.InitialConfigEntries = append(nearClu.InitialConfigEntries,
		&api.ServiceConfigEntry{
			Kind:      api.ServiceDefaults,
			Name:      nearServerSID.Name,
			Partition: ConfigEntryPartition(nearServerSID.Partition),
			Namespace: nearServerSID.Namespace,
			Protocol:  "http",
		},
	)

	// - server in otherPartition/otherDC
	farServerSID := topology.ServiceID{
		Name:      nearServerSID.Name,
		Partition: "default",
		Namespace: "default",
	}
	if s.FarInPartAlt {
		farServerSID.Partition = "part1"
	}
	if s.FarInNSAlt {
		farServerSID.Namespace = "ns1"
	}
	farServer := NewFortioServiceWithDefaults(
		farClu.Datacenter,
		farServerSID,
		nil,
	)
	farServerNode := ct.AddServiceNode(farClu, serviceExt{Service: farServer})

	// Make the far server reachable from the near side: across clusters it is
	// exported to the near side's peer; within one cluster it only needs an
	// export when the two partitions differ.
	if nearClu != farClu {
		ct.ExportService(farClu, farServerSID.Partition,
			api.ExportedService{
				Name:      farServerSID.Name,
				Namespace: farServerSID.Namespace,
				Consumers: []api.ServiceConsumer{
					{
						Peer: LocalPeerName(nearClu, nearServerSID.Partition),
					},
				},
			},
		)
	} else if farServerSID.Partition != nearServerSID.Partition {
		ct.ExportService(farClu, farServerSID.Partition,
			api.ExportedService{
				Name:      farServerSID.Name,
				Namespace: farServerSID.Namespace,
				Consumers: []api.ServiceConsumer{
					{
						// this must not be "", or else it is basically ignored altogether
						// TODO: bug? if this whole struct is empty, that should be an error
						Partition: topology.PartitionOrDefault(nearServerSID.Partition),
					},
				},
			},
		)
	}

	// Failover target: addressed by peer across clusters, by partition within
	// a single cluster.
	var targets []api.ServiceResolverFailoverTarget
	if nearClu != farClu {
		targets = []api.ServiceResolverFailoverTarget{
			{
				Service:   farServerSID.Name,
				Peer:      LocalPeerName(farClu, farServerSID.Partition),
				Namespace: farServerSID.Namespace,
			},
		}
	} else {
		part := ConfigEntryPartition(farServerSID.Partition)
		// weird exception here where target partition set to "" means "inherit from parent"
		// TODO: bug? docs say "" -> default:
		// https://developer.hashicorp.com/consul/docs/connect/config-entries/service-resolver#failover-targets-partition
		if farServerSID.Partition == "default" && nearServerSID.Partition != "default" {
			part = "default"
		}
		targets = []api.ServiceResolverFailoverTarget{
			{
				Service:   farServerSID.Name,
				Partition: part,
				Namespace: farServerSID.Namespace,
			},
		}
	}

	nearClu.InitialConfigEntries = append(nearClu.InitialConfigEntries,
		&api.ServiceConfigEntry{
			Kind:      api.ServiceDefaults,
			Name:      farServerSID.Name,
			Partition: ConfigEntryPartition(farServerSID.Partition),
			Namespace: farServerSID.Namespace,
			Protocol:  "http",
		},
		&api.ServiceResolverConfigEntry{
			Kind:      api.ServiceResolver,
			Name:      nearServerSID.Name,
			Partition: ConfigEntryPartition(nearServerSID.Partition),
			Namespace: nearServerSID.Namespace,
			Failover: map[string]api.ServiceResolverFailover{
				"*": {
					Targets: targets,
				},
			},
		},
	)

	// The client lives alongside the near server (same cluster, partition and
	// namespace).
	clientSID := topology.ServiceID{
		Name:      "ac6-client",
		Partition: nearServerSID.Partition,
		Namespace: nearServerSID.Namespace,
	}
	client := NewFortioServiceWithDefaults(
		nearClu.Datacenter,
		clientSID,
		func(svc *topology.Service) {
			// Upstream per partition
			svc.Upstreams = []*topology.Upstream{
				{
					ID: topology.ServiceID{
						Name:      nearServerSID.Name,
						Partition: nearServerSID.Partition,
						Namespace: nearServerSID.Namespace,
					},
					LocalPort: 5000,
					// exposed so we can hit it directly
					// TODO: we shouldn't do this; it's not realistic
					LocalAddress: "0.0.0.0",
				},
			}
		},
	)
	ct.AddServiceNode(nearClu, serviceExt{Service: client})
	nearClu.InitialConfigEntries = append(nearClu.InitialConfigEntries,
		&api.ServiceConfigEntry{
			Kind:      api.ServiceDefaults,
			Name:      clientSID.Name,
			Partition: ConfigEntryPartition(clientSID.Partition),
			Namespace: clientSID.Namespace,
			Protocol:  "http",
		},
	)

	// intentions
	nearClu.InitialConfigEntries = append(nearClu.InitialConfigEntries,
		&api.ServiceIntentionsConfigEntry{
			Kind:      api.ServiceIntentions,
			Name:      nearServerSID.Name,
			Partition: ConfigEntryPartition(nearServerSID.Partition),
			Namespace: nearServerSID.Namespace,
			Sources: []*api.SourceIntention{{
				Name:      clientSID.Name,
				Namespace: clientSID.Namespace,
				// in this field, "" -> destination partition, so no ConfigEntryPartition :eyeroll:
				// https://developer.hashicorp.com/consul/docs/connect/config-entries/service-intentions#sources-partition
				Partition: topology.PartitionOrDefault(clientSID.Partition),
				Action:    api.IntentionActionAllow,
			}},
		},
	)

	// The far-side intention names the client by peer across clusters, and by
	// partition when both servers share a cluster.
	farSource := api.SourceIntention{
		Name:      clientSID.Name,
		Namespace: clientSID.Namespace,
		Peer:      LocalPeerName(nearClu, clientSID.Partition),
		Action:    api.IntentionActionAllow,
	}
	if nearClu == farClu {
		farSource.Peer = ""
		// in this field, "" -> destination partition, so no ConfigEntryPartition :eyeroll:
		// https://developer.hashicorp.com/consul/docs/connect/config-entries/service-intentions#sources-partition
		farSource.Partition = topology.PartitionOrDefault(clientSID.Partition)
	}
	farClu.InitialConfigEntries = append(farClu.InitialConfigEntries,
		&api.ServiceIntentionsConfigEntry{
			Kind:      api.ServiceIntentions,
			Name:      farServerSID.Name,
			Partition: ConfigEntryPartition(farServerSID.Partition),
			Namespace: farServerSID.Namespace,
			Sources:   []*api.SourceIntention{&farSource},
		},
	)

	// Record the launch outputs for test.
	s.clientSID = clientSID
	s.nearServerSID = nearServerSID
	s.farServerSID = farServerSID
	s.nearServerNode = nearServerNode.ID()
	s.farServerNode = farServerNode.ID()
}
// test exercises the launched scenario in two phases: it first asserts that
// the client's single upstream is answered by the near server, then disables
// the near server's node, relaunches with phase "failover", and asserts that
// the same upstream is now answered by the far server.
func (s *ac6FailoversSuite) test(t *testing.T, ct *commonTopo) {
	// NOTE: *not parallel* because we mutate resources that are shared
	// between test cases (disable/enable nodes)

	// Pick the cluster names first, then look the clusters up in the launched topology.
	nearDC, farDC := "dc1", "dc2"
	if s.NearInDial {
		nearDC = "dc2"
	}
	if s.FarInAcc {
		farDC = "dc1"
	}
	nearClu := ct.Sprawl.Topology().Clusters[nearDC]
	farClu := ct.Sprawl.Topology().Clusters[farDC]

	clients := nearClu.ServicesByID(s.clientSID)
	require.Len(t, clients, 1, "expected exactly one client in datacenter")
	client := clients[0]
	require.Len(t, client.Upstreams, 1, "expected one upstream for client")
	upstream := client.Upstreams[0]

	fmt.Println("### preconditions")
	// this is the server in the same DC and partitions as client
	// (Normalize is called on a local copy, not on the suite's field.)
	nearSID := s.nearServerSID
	nearSID.Normalize()
	ct.Assert.FortioFetch2FortioName(t, client, upstream, nearClu.Name, nearSID)

	upstreamOpts := utils.CompatQueryOpts(&api.QueryOptions{
		Partition: upstream.ID.Partition,
		Namespace: upstream.ID.Namespace,
	})
	ct.Assert.CatalogServiceExists(t, nearClu.Name, upstream.ID.Name, upstreamOpts)

	// Stop here if any precondition assertion recorded a failure.
	if t.Failed() {
		t.Fatal("failed preconditions")
	}

	fmt.Println("### failover")
	cfg := ct.Sprawl.Config()
	DisableNode(t, cfg, nearClu.Name, s.nearServerNode)
	require.NoError(t, ct.Sprawl.RelaunchWithPhase(cfg, "failover"))

	// Clusters for imported services rely on outlier detection for
	// failovers, NOT eds_health_status. This means that killing the
	// node above does not actually make the envoy cluster UNHEALTHY
	// so we do not assert for it.
	farSID := s.farServerSID
	farSID.Normalize()
	ct.Assert.FortioFetch2FortioName(t, client, upstream, farClu.Name, farSID)
}