From 94d28cd6cf96d0f56f8cd5f3f0b7b9777eb26640 Mon Sep 17 00:00:00 2001
From: machine424 <ayoubmrini424@gmail.com>
Date: Fri, 31 May 2024 16:40:09 +0200
Subject: [PATCH] chore(notifier): add a reproducer for 
 https://github.com/prometheus/prometheus/issues/13676

to show "targets groups update" starvation when the notifications queue is full and an Alertmanager
is down.

The existing `TestHangingNotifier` that was added in https://github.com/prometheus/prometheus/pull/10948 doesn't really reflect the reality as the SD changes are manually fed into `syncCh` in a continuous way, whereas in reality, updates are only resent every `updatert`.

The test added here sets up an SD manager and links it to the notifier. The SD changes will be triggered by that manager as it's done in reality.

Signed-off-by: machine424 <ayoubmrini424@gmail.com>

Co-authored-by: Ethan Hunter <ehunter@hudson-trading.com>
---
 discovery/manager.go      |  10 +++
 notifier/notifier_test.go | 149 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 159 insertions(+)

diff --git a/discovery/manager.go b/discovery/manager.go
index f14071af3..897d7d151 100644
--- a/discovery/manager.go
+++ b/discovery/manager.go
@@ -120,6 +120,16 @@ func Name(n string) func(*Manager) {
 	}
 }
 
+// Updatert sets the updatert of the manager.
+// Used to speed up tests.
+func Updatert(u time.Duration) func(*Manager) {
+	return func(m *Manager) {
+		m.mtx.Lock()
+		defer m.mtx.Unlock()
+		m.updatert = u
+	}
+}
+
 // HTTPClientOptions sets the list of HTTP client options to expose to
 // Discoverers. It is up to Discoverers to choose to use the options provided.
 func HTTPClientOptions(opts ...config.HTTPClientOption) func(*Manager) {
diff --git a/notifier/notifier_test.go b/notifier/notifier_test.go
index d2e72ca33..5c82decbe 100644
--- a/notifier/notifier_test.go
+++ b/notifier/notifier_test.go
@@ -26,13 +26,17 @@ import (
 	"testing"
 	"time"
 
+	"github.com/go-kit/log"
 	"github.com/prometheus/alertmanager/api/v2/models"
+	"github.com/prometheus/client_golang/prometheus"
 	config_util "github.com/prometheus/common/config"
 	"github.com/prometheus/common/model"
 	"github.com/stretchr/testify/require"
 	"go.uber.org/atomic"
 	"gopkg.in/yaml.v2"
 
+	"github.com/prometheus/prometheus/discovery"
+
 	"github.com/prometheus/prometheus/config"
 	"github.com/prometheus/prometheus/discovery/targetgroup"
 	"github.com/prometheus/prometheus/model/labels"
@@ -811,3 +815,148 @@ func TestHangingNotifier(t *testing.T) {
 		})
 	}
 }
+
+// TODO: renameit and even replace TestHangingNotifier with it.
+// TestHangingNotifierXXX ensures that the notifier takes into account SD changes even when there are
+// queued alerts. This test reproduces the issue described in https://github.com/prometheus/prometheus/issues/13676.
+func TestHangingNotifierXXX(t *testing.T) {
+	const (
+		batches     = 100
+		alertsCount = maxBatchSize * batches
+	)
+
+	var (
+		sendTimeout = 10 * time.Millisecond
+		sdUpdatert  = sendTimeout / 2
+
+		done = make(chan struct{})
+	)
+
+	defer func() {
+		close(done)
+	}()
+
+	// Set up a faulty Alertmanager.
+	var faultyCalled atomic.Bool
+	faultyServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		faultyCalled.Store(true)
+		select {
+		case <-done:
+		case <-time.After(time.Hour):
+		}
+	}))
+	faultyURL, err := url.Parse(faultyServer.URL)
+	require.NoError(t, err)
+
+	// Set up a functional Alertmanager.
+	var functionalCalled atomic.Bool
+	functionalServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		functionalCalled.Store(true)
+	}))
+	functionalURL, err := url.Parse(functionalServer.URL)
+	require.NoError(t, err)
+
+	// Initialize the discovery manager
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	reg := prometheus.NewRegistry()
+	sdMetrics, err := discovery.RegisterSDMetrics(reg, discovery.NewRefreshMetrics(reg))
+	require.NoError(t, err)
+	sdManager := discovery.NewManager(
+		ctx,
+		log.NewNopLogger(),
+		reg,
+		sdMetrics,
+		discovery.Name("sd-manager"),
+		discovery.Updatert(sdUpdatert),
+	)
+	go sdManager.Run()
+
+	// Set up the notifier with both faulty and functional Alertmanagers.
+	notifier := NewManager(
+		&Options{
+			QueueCapacity: alertsCount,
+		},
+		nil,
+	)
+	notifier.alertmanagers = make(map[string]*alertmanagerSet)
+	amCfg := config.DefaultAlertmanagerConfig
+	amCfg.Timeout = model.Duration(sendTimeout)
+	notifier.alertmanagers["config-0"] = &alertmanagerSet{
+		ams: []alertmanager{
+			alertmanagerMock{
+				urlf: func() string { return faultyURL.String() },
+			},
+			alertmanagerMock{
+				urlf: func() string { return functionalURL.String() },
+			},
+		},
+		cfg:     &amCfg,
+		metrics: notifier.metrics,
+	}
+	go notifier.Run(sdManager.SyncCh())
+	defer notifier.Stop()
+
+	require.Len(t, notifier.Alertmanagers(), 2)
+
+	// Enqueue the alerts.
+	var alerts []*Alert
+	for i := range make([]struct{}, alertsCount) {
+		alerts = append(alerts, &Alert{
+			Labels: labels.FromStrings("alertname", strconv.Itoa(i)),
+		})
+	}
+	notifier.Send(alerts...)
+
+	// Wait for the Alertmanagers to start receiving alerts.
+	// 10*sdUpdatert is used as an arbitrary timeout here.
+	timeout := time.After(10 * sdUpdatert)
+loop1:
+	for {
+		select {
+		case <-timeout:
+			t.Fatalf("Timeout waiting for the alertmanagers to be reached for the first time.")
+		default:
+			if faultyCalled.Load() && functionalCalled.Load() {
+				break loop1
+			}
+		}
+	}
+
+	// Request to remove the faulty Alertmanager.
+	c := map[string]discovery.Configs{
+		"config-0": {
+			discovery.StaticConfig{
+				&targetgroup.Group{
+					Targets: []model.LabelSet{
+						{
+							model.AddressLabel: model.LabelValue(functionalURL.Host),
+						},
+					},
+				},
+			},
+		},
+	}
+	require.NoError(t, sdManager.ApplyConfig(c))
+
+	// The notifier should not wait until the alerts queue is empty to apply the discovery changes
+	// A faulty Alertmanager could cause each alert sending cycle to take up to AlertmanagerConfig.Timeout
+	// The queue may never be emptied, as the arrival rate could be larger than the departure rate
+	// It could even overflow and alerts could be dropped.
+	timeout = time.After(batches * sendTimeout)
+loop2:
+	for {
+		select {
+		case <-timeout:
+			t.Fatalf("Timeout, the faulty alertmanager not removed on time.")
+		default:
+			// The faulty alertmanager was dropped.
+			if len(notifier.Alertmanagers()) == 1 {
+				// Prevent from TOCTOU.
+				require.Positive(t, notifier.queueLen())
+				break loop2
+			}
+			require.Positive(t, notifier.queueLen(), "The faulty alertmanager wasn't dropped before the alerts queue was emptied.")
+		}
+	}
+}