From 1af81dc5c9f67227aaf7d404b0ffe05355790513 Mon Sep 17 00:00:00 2001 From: Goutham Veeramachaneni Date: Mon, 3 Jan 2022 11:13:48 +0100 Subject: [PATCH 1/2] Update sent timestamp when write irrecoverably fails. We have an alert that fires when prometheus_remote_storage_highest_timestamp_in_seconds - prometheus_remote_storage_queue_highest_sent_timestamp_seconds becomes too high. But we have an agent that fires this when the remote "rate-limits" the user. This is because prometheus_remote_storage_queue_highest_sent_timestamp_seconds doesn't get updated when the remote sends a 429. I think we should update the metrics, and the change I made makes sense. Because if the requests fails because of connectivity issues, etc. we will never exit the `sendWriteRequestWithBackoff` function. It only exits the function when there is a non-recoverable error, like a bad status code, and in that case, I think the metric needs to be updated. Signed-off-by: Goutham Veeramachaneni --- storage/remote/queue_manager.go | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/storage/remote/queue_manager.go b/storage/remote/queue_manager.go index 49de2c8a3..05ead4712 100644 --- a/storage/remote/queue_manager.go +++ b/storage/remote/queue_manager.go @@ -1311,12 +1311,11 @@ func (s *shards) sendSamplesWithBackoff(ctx context.Context, samples []prompb.Ti } err = sendWriteRequestWithBackoff(ctx, s.qm.cfg, s.qm.logger, attemptStore, onRetry) - if err != nil { - return err - } + s.qm.metrics.sentBytesTotal.Add(float64(reqSize)) s.qm.metrics.highestSentTimestamp.Set(float64(highest / 1000)) - return nil + + return err } func sendWriteRequestWithBackoff(ctx context.Context, cfg config.QueueConfig, l log.Logger, attempt func(int) error, onRetry func()) error { From 6696b7a5f09a228728732d736c8474c3cc16dae0 Mon Sep 17 00:00:00 2001 From: Goutham Veeramachaneni Date: Tue, 4 Jan 2022 10:46:52 +0100 Subject: [PATCH 2/2] Don't update metrics on context cancellation Signed-off-by: Goutham Veeramachaneni --- storage/remote/queue_manager.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/storage/remote/queue_manager.go b/storage/remote/queue_manager.go index 05ead4712..e4a2b5399 100644 --- a/storage/remote/queue_manager.go +++ b/storage/remote/queue_manager.go @@ -15,6 +15,7 @@ package remote import ( "context" + "errors" "math" "strconv" "sync" @@ -1311,6 +1312,11 @@ func (s *shards) sendSamplesWithBackoff(ctx context.Context, samples []prompb.Ti } err = sendWriteRequestWithBackoff(ctx, s.qm.cfg, s.qm.logger, attemptStore, onRetry) + if errors.Is(err, context.Canceled) { + // When there is resharding, we cancel the context for this queue, which means the data is not sent. + // So we exit early to not update the metrics. + return err + } s.qm.metrics.sentBytesTotal.Add(float64(reqSize)) s.qm.metrics.highestSentTimestamp.Set(float64(highest / 1000))