// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1

package peering

import (
	"fmt"
	"testing"

	"github.com/stretchr/testify/require"

	"github.com/hashicorp/consul/api"
	"github.com/hashicorp/consul/test/integration/consul-container/libs/utils"
	"github.com/hashicorp/consul/testing/deployer/topology"
)
// note: unlike other *Suite structs that are per-peering direction,
// this one is special and does all directions itself, because the
// setup is not exactly symmetrical.
type ac6FailoversSuite struct {
	ac6 map[nodeKey]ac6FailoversContext
}

type ac6FailoversContext struct {
	clientSID topology.ServiceID
	serverSID topology.ServiceID

	// serverNode is used to remove the node and trigger failover.
	serverNode topology.NodeID
}
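
// nodeKey identifies the (datacenter, partition) pair whose failover
// context is stored in ac6FailoversSuite.ac6.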
type nodeKey struct {
	dc        string
	partition string
}

// Note: this test cannot share a topology with other tests because it
// disables and re-enables server nodes to trigger failovers.
func TestAC6Failovers(t *testing.T) {
	ct := NewCommonTopo(t)
	s := &ac6FailoversSuite{}
	s.setup(t, ct)
	ct.Launch(t)
	s.test(t, ct)
}
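
// setup registers the AC6 services, config entries, and exports in all three
// datacenters; the topology is launched afterwards by TestAC6Failovers.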
func (s *ac6FailoversSuite) setup(t *testing.T, ct *commonTopo) {
	// TODO: update setups to loop through a cluster's partitions+namespaces internally
	s.setupAC6Failovers(ct, ct.DC1, ct.DC2)
	s.setupAC6Failovers(ct, ct.DC2, ct.DC1)
	s.setupAC6FailoversDC3(ct, ct.DC3, ct.DC1, ct.DC2)
}

// dc1 is peered with dc2 and dc3.
// dc1 has an ac6-client in the "default" and "part1" partitions (only default in OSS).
// ac6-client has a single upstream, ac6-failover-svc, in its respective partition^.
//
// ac6-failover-svc has the following failovers:
//   - peer-dc2-default
//   - peer-dc2-part1 (not in OSS)
//   - peer-dc3-default
//
// This setup is mirrored from dc2->dc1 as well
// (both dcs have dc3 as the last failover target).
//
// ^NOTE: There are no cross-partition upstreams because MeshGatewayMode = local,
// and failover information gets stripped out by the mesh gateways, so we
// can't test cross-partition failovers.
func (s *ac6FailoversSuite) setupAC6Failovers(ct *commonTopo, clu, peerClu *topology.Cluster) {
	for _, part := range clu.Partitions {
		partition := part.Name

		// There is a peering per partition in the peered cluster.
		var peers []string
		for _, peerPart := range peerClu.Partitions {
			peers = append(peers, LocalPeerName(peerClu, peerPart.Name))
		}

		// Make an HTTP server with various failover targets.
		serverSID := topology.ServiceID{
			Name:      "ac6-failover-svc",
			Partition: partition,
		}
		server := NewFortioServiceWithDefaults(
			clu.Datacenter,
			serverSID,
			nil,
		)
		// Export to all known peers.
		ct.ExportService(clu, partition,
			api.ExportedService{
				Name: server.ID.Name,
				Consumers: func() []api.ServiceConsumer {
					var consumers []api.ServiceConsumer
					for _, peer := range peers {
						consumers = append(consumers, api.ServiceConsumer{
							Peer: peer,
						})
					}
					return consumers
				}(),
			},
		)
		serverNode := ct.AddServiceNode(clu, serviceExt{Service: server})

		clu.InitialConfigEntries = append(clu.InitialConfigEntries,
			&api.ServiceConfigEntry{
				Kind:      api.ServiceDefaults,
				Name:      server.ID.Name,
				Partition: ConfigEntryPartition(partition),
				Protocol:  "http",
			},
			&api.ServiceResolverConfigEntry{
				Kind:      api.ServiceResolver,
				Name:      server.ID.Name,
				Partition: ConfigEntryPartition(partition),
				Failover: map[string]api.ServiceResolverFailover{
					"*": {
						Targets: func() []api.ServiceResolverFailoverTarget {
							// Make a failover target for every partition in the peer cluster.
							var targets []api.ServiceResolverFailoverTarget
							for _, peer := range peers {
								targets = append(targets, api.ServiceResolverFailoverTarget{
									Peer: peer,
								})
							}
							// Just hard-code the default partition for dc3, since the
							// exhaustive testing is done against dc2.
							targets = append(targets, api.ServiceResolverFailoverTarget{
								Peer: "peer-dc3-default",
							})
							return targets
						}(),
					},
				},
			},
		)

		// Make a client which will dial the server.
		clientSID := topology.ServiceID{
			Name:      "ac6-client",
			Partition: partition,
		}
		client := NewFortioServiceWithDefaults(
			clu.Datacenter,
			clientSID,
			func(s *topology.Service) {
				// Upstream per partition.
				s.Upstreams = []*topology.Upstream{
					{
						ID: topology.ServiceID{
							Name:      server.ID.Name,
							Partition: part.Name,
						},
						LocalPort: 5000,
						// Exposed so we can hit it directly.
						// TODO: we shouldn't do this; it's not realistic.
						LocalAddress: "0.0.0.0",
					},
				}
			},
		)
		ct.ExportService(clu, partition,
			api.ExportedService{
				Name: client.ID.Name,
				Consumers: func() []api.ServiceConsumer {
					var consumers []api.ServiceConsumer
					// Export to each peer.
					for _, peer := range peers {
						consumers = append(consumers, api.ServiceConsumer{
							Peer: peer,
						})
					}
					return consumers
				}(),
			},
		)
		ct.AddServiceNode(clu, serviceExt{Service: client})

		clu.InitialConfigEntries = append(clu.InitialConfigEntries,
			&api.ServiceConfigEntry{
				Kind:      api.ServiceDefaults,
				Name:      client.ID.Name,
				Partition: ConfigEntryPartition(partition),
				Protocol:  "http",
			},
		)

		// Add an intention allowing the local and peered clients to call the server.
		clu.InitialConfigEntries = append(clu.InitialConfigEntries,
			&api.ServiceIntentionsConfigEntry{
				Kind:      api.ServiceIntentions,
				Name:      server.ID.Name,
				Partition: ConfigEntryPartition(partition),
				// SourceIntention for the local client and each peered client.
				Sources: func() []*api.SourceIntention {
					ixns := []*api.SourceIntention{
						{
							Name:      client.ID.Name,
							Partition: ConfigEntryPartition(part.Name),
							Action:    api.IntentionActionAllow,
						},
					}
					for _, peer := range peers {
						ixns = append(ixns, &api.SourceIntention{
							Name:   client.ID.Name,
							Peer:   peer,
							Action: api.IntentionActionAllow,
						})
					}
					return ixns
				}(),
			},
		)

		if s.ac6 == nil {
			s.ac6 = map[nodeKey]ac6FailoversContext{}
		}
		s.ac6[nodeKey{clu.Datacenter, partition}] = ac6FailoversContext{
			clientSID:  clientSID,
			serverSID:  serverSID,
			serverNode: serverNode.ID(),
		}
	}
}
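
// setupAC6FailoversDC3 registers dc3's half of the topology: a single
// ac6-failover-svc in the default partition, exported to every peer partition
// in dc1 and dc2 and callable by their ac6-clients via intentions. dc3 is only
// the last-resort failover target and runs no client of its own.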
func (s *ac6FailoversSuite) setupAC6FailoversDC3(ct *commonTopo, clu, peer1, peer2 *topology.Cluster) {
	var peers []string
	for _, part := range peer1.Partitions {
		peers = append(peers, LocalPeerName(peer1, part.Name))
	}
	for _, part := range peer2.Partitions {
		peers = append(peers, LocalPeerName(peer2, part.Name))
	}

	partition := "default"

	// Make an HTTP server.
	server := NewFortioServiceWithDefaults(
		clu.Datacenter,
		topology.ServiceID{
			Name:      "ac6-failover-svc",
			Partition: partition,
		},
		nil,
	)

	ct.AddServiceNode(clu, serviceExt{
		Service: server,
		Config: &api.ServiceConfigEntry{
			Kind:      api.ServiceDefaults,
			Name:      server.ID.Name,
			Partition: ConfigEntryPartition(partition),
			Protocol:  "http",
		},
		Intentions: &api.ServiceIntentionsConfigEntry{
			Kind:      api.ServiceIntentions,
			Name:      server.ID.Name,
			Partition: ConfigEntryPartition(partition),
			Sources: func() []*api.SourceIntention {
				var ixns []*api.SourceIntention
				for _, peer := range peers {
					ixns = append(ixns, &api.SourceIntention{
						Name:   "ac6-client",
						Peer:   peer,
						Action: api.IntentionActionAllow,
					})
				}
				return ixns
			}(),
		},
		Exports: func() []api.ServiceConsumer {
			var consumers []api.ServiceConsumer
			for _, peer := range peers {
				consumers = append(consumers, api.ServiceConsumer{
					Peer: peer,
				})
			}
			return consumers
		}(),
	})
}
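
// test drives each client through its failover chain. For every
// (datacenter, partition) under test it first verifies that the local
// ac6-failover-svc answers, then disables the local server node and expects
// traffic to fail over to the peer's default partition, then (Enterprise
// only) to the peer's part1 partition, and finally to dc3 once every other
// target is down. Disabled nodes are re-enabled in a cleanup step.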
func (s *ac6FailoversSuite) test(t *testing.T, ct *commonTopo) {
	dc1 := ct.Sprawl.Topology().Clusters["dc1"]
	dc2 := ct.Sprawl.Topology().Clusters["dc2"]

	type testcase struct {
		name      string
		cluster   *topology.Cluster
		peer      *topology.Cluster
		partition string
	}
	tcs := []testcase{
		{
			name:      "dc1 default partition failovers",
			cluster:   dc1,
			peer:      dc2, // dc3 is hardcoded
			partition: "default",
		},
		{
			name:      "dc1 part1 partition failovers",
			cluster:   dc1,
			peer:      dc2, // dc3 is hardcoded
			partition: "part1",
		},
		{
			name:      "dc2 default partition failovers",
			cluster:   dc2,
			peer:      dc1, // dc3 is hardcoded
			partition: "default",
		},
		{
			name:      "dc2 part1 partition failovers",
			cluster:   dc2,
			peer:      dc1, // dc3 is hardcoded
			partition: "part1",
		},
	}
	for _, tc := range tcs {
		t.Run(tc.name, func(t *testing.T) {
			// NOTE: *not parallel* because we mutate resources that are shared
			// between test cases (disable/enable nodes).
			if !utils.IsEnterprise() && tc.partition != "default" {
				t.Skip("skipping: non-default partitions require Consul Enterprise")
			}
			partition := tc.partition
			clu := tc.cluster
			peerClu := tc.peer

			svcs := clu.ServicesByID(s.ac6[nodeKey{clu.Datacenter, partition}].clientSID)
			require.Len(t, svcs, 1, "expected exactly one client in datacenter")

			serverSID := s.ac6[nodeKey{clu.Datacenter, partition}].serverSID
			serverSID.Normalize()

			client := svcs[0]
			require.Len(t, client.Upstreams, 1, "expected one upstream for client")

			u := client.Upstreams[0]
			ct.Assert.CatalogServiceExists(t, clu.Name, u.ID.Name, utils.CompatQueryOpts(&api.QueryOptions{
				Partition: u.ID.Partition,
			}))

			t.Cleanup(func() {
				cfg := ct.Sprawl.Config()
				for _, part := range clu.Partitions {
					EnableNode(t, cfg, clu.Name, s.ac6[nodeKey{clu.Datacenter, part.Name}].serverNode)
				}
				for _, part := range peerClu.Partitions {
					EnableNode(t, cfg, peerClu.Name, s.ac6[nodeKey{peerClu.Datacenter, part.Name}].serverNode)
				}
				require.NoError(t, ct.Sprawl.Relaunch(cfg))
			})

			fmt.Println("### preconditions")

			// TODO: deduce this number instead of hard-coding it.
			nFailoverTargets := 4
			// In OSS, we don't have failover targets for non-default partitions.
			if !utils.IsEnterprise() {
				nFailoverTargets = 3
			}
			for i := 0; i < nFailoverTargets; i++ {
				ct.Assert.UpstreamEndpointStatus(t, client, fmt.Sprintf("failover-target~%d~%s", i, clusterPrefix(u, clu.Datacenter)), "HEALTHY", 1)
			}

			ct.Assert.FortioFetch2FortioName(t, client, u, clu.Name, serverSID)

			if t.Failed() {
				t.Fatalf("failed preconditions")
			}

			fmt.Println("### Failover to peer target")
			cfg := ct.Sprawl.Config()
			DisableNode(t, cfg, clu.Name, s.ac6[nodeKey{clu.Datacenter, partition}].serverNode)
			require.NoError(t, ct.Sprawl.Relaunch(cfg))
			// Clusters for imported services rely on outlier detection for
			// failovers, NOT eds_health_status. This means that killing the
			// node above does not actually make the Envoy cluster UNHEALTHY,
			// so we do not assert for it.
			expectUID := topology.ServiceID{
				Name:      u.ID.Name,
				Partition: "default",
			}
			expectUID.Normalize()
			ct.Assert.FortioFetch2FortioName(t, client, u, peerClu.Name, expectUID)

			if utils.IsEnterprise() {
				fmt.Println("### Failover to peer target in non-default partition")
				cfg = ct.Sprawl.Config()
				DisableNode(t, cfg, clu.Name, s.ac6[nodeKey{clu.Datacenter, partition}].serverNode)
				DisableNode(t, cfg, peerClu.Name, s.ac6[nodeKey{peerClu.Datacenter, "default"}].serverNode)
				require.NoError(t, ct.Sprawl.Relaunch(cfg))
				// This retries until outlier detection deems the cluster
				// unhealthy and traffic fails over to the peer's part1.
				expectUID = topology.ServiceID{
					Name:      u.ID.Name,
					Partition: "part1",
				}
				expectUID.Normalize()
				ct.Assert.FortioFetch2FortioName(t, client, u, peerClu.Name, expectUID)
			}

			fmt.Println("### Failover to dc3 peer target")
			cfg = ct.Sprawl.Config()
			DisableNode(t, cfg, clu.Name, s.ac6[nodeKey{clu.Datacenter, partition}].serverNode)
			// Disable all partitions for the peer.
			for _, part := range peerClu.Partitions {
				DisableNode(t, cfg, peerClu.Name, s.ac6[nodeKey{peerClu.Datacenter, part.Name}].serverNode)
			}
			require.NoError(t, ct.Sprawl.Relaunch(cfg))
			// This retries until outlier detection deems the cluster
			// unhealthy and traffic fails over to dc3.
			expectUID = topology.ServiceID{
				Name:      u.ID.Name,
				Partition: "default",
			}
			expectUID.Normalize()
			ct.Assert.FortioFetch2FortioName(t, client, u, "dc3", expectUID)
		})
	}
}
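
// clusterPrefix returns the Envoy cluster-name prefix for the upstream in the
// given datacenter: default-partition targets use the
// "<name>.<namespace>.<dc>.internal" form, while non-default partitions use
// the "<name>.<namespace>.<partition>.<dc>.internal-v1" form.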
func clusterPrefix(u *topology.Upstream, dc string) string {
	u.ID.Normalize()
	switch u.ID.Partition {
	case "default":
		return fmt.Sprintf("%s.%s.%s.internal", u.ID.Name, u.ID.Namespace, dc)
	default:
		return fmt.Sprintf("%s.%s.%s.%s.internal-v1", u.ID.Name, u.ID.Namespace, u.ID.Partition, dc)
	}
}