notifier: optionally drain queued notifications before shutting down (#14290)
* Add draining of queued notifications to `notifier.Manager`
Signed-off-by: Charles Korn <charles.korn@grafana.com>
* Update docs
Signed-off-by: Charles Korn <charles.korn@grafana.com>
* Address PR feedback
Signed-off-by: Charles Korn <charles.korn@grafana.com>
* Add more logging
Signed-off-by: Charles Korn <charles.korn@grafana.com>
* Address offline feedback: remove timeout
Signed-off-by: Charles Korn <charles.korn@grafana.com>
* Ensure stopping takes priority over further processing, make tests more robust
Signed-off-by: Charles Korn <charles.korn@grafana.com>
* Make channel unbuffered
Signed-off-by: Charles Korn <charles.korn@grafana.com>
* Update docs
Signed-off-by: Charles Korn <charles.korn@grafana.com>
* Fix race in test
Signed-off-by: Charles Korn <charles.korn@grafana.com>
* Remove unnecessary context
Signed-off-by: Charles Korn <charles.korn@grafana.com>
* Make Stop safe to call multiple times
Signed-off-by: Charles Korn <charles.korn@grafana.com>
---------
Signed-off-by: Charles Korn <charles.korn@grafana.com>
serverOnlyFlag(a,"alertmanager.drain-notification-queue-on-shutdown","Send any outstanding Alertmanager notifications when shutting down. If false, any outstanding Alertmanager notifications will be dropped when shutting down.").
| <codeclass="text-nowrap">--rules.alert.resend-delay</code> | Minimum amount of time to wait before resending an alert to Alertmanager. Use with server mode only. | `1m` |
| <codeclass="text-nowrap">--rules.alert.resend-delay</code> | Minimum amount of time to wait before resending an alert to Alertmanager. Use with server mode only. | `1m` |
| <codeclass="text-nowrap">--rules.max-concurrent-evals</code> | Global concurrency limit for independent rules that can run concurrently. When set, "query.max-concurrency" may need to be adjusted accordingly. Use with server mode only. | `4` |
| <codeclass="text-nowrap">--rules.max-concurrent-evals</code> | Global concurrency limit for independent rules that can run concurrently. When set, "query.max-concurrency" may need to be adjusted accordingly. Use with server mode only. | `4` |
| <codeclass="text-nowrap">--alertmanager.notification-queue-capacity</code> | The capacity of the queue for pending Alertmanager notifications. Use with server mode only. | `10000` |
| <codeclass="text-nowrap">--alertmanager.notification-queue-capacity</code> | The capacity of the queue for pending Alertmanager notifications. Use with server mode only. | `10000` |
| <codeclass="text-nowrap">--alertmanager.drain-notification-queue-on-shutdown</code> | Send any outstanding Alertmanager notifications when shutting down. If false, any outstanding Alertmanager notifications will be dropped when shutting down. Use with server mode only. | `true` |
| <codeclass="text-nowrap">--query.lookback-delta</code> | The maximum lookback duration for retrieving metrics during expression evaluations and federation. Use with server mode only. | `5m` |
| <codeclass="text-nowrap">--query.lookback-delta</code> | The maximum lookback duration for retrieving metrics during expression evaluations and federation. Use with server mode only. | `5m` |
| <codeclass="text-nowrap">--query.timeout</code> | Maximum time a query may take before being aborted. Use with server mode only. | `2m` |
| <codeclass="text-nowrap">--query.timeout</code> | Maximum time a query may take before being aborted. Use with server mode only. | `2m` |
| <codeclass="text-nowrap">--query.max-concurrency</code> | Maximum number of queries executed concurrently. Use with server mode only. | `20` |
| <codeclass="text-nowrap">--query.max-concurrency</code> | Maximum number of queries executed concurrently. Use with server mode only. | `20` |
// If we've been asked to stop, that takes priority over processing any further target group updates.
select{
select{
case<-n.ctx.Done():
case<-n.stopRequested:
return
default:
select{
case<-n.stopRequested:
return
return
casets:=<-tsets:
casets:=<-tsets:
n.reload(ts)
n.reload(ts)
}
}
}
}
}
}
}
func(n*Manager)sendOneBatch(){
alerts:=n.nextBatch()
if!n.sendAll(alerts...){
n.metrics.dropped.Add(float64(len(alerts)))
}
}
func(n*Manager)drainQueue(){
if!n.opts.DrainOnShutdown{
ifn.queueLen()>0{
level.Warn(n.logger).Log("msg","Draining remaining notifications on shutdown is disabled, and some notifications have been dropped","count",n.queueLen())
n.metrics.dropped.Add(float64(n.queueLen()))
}
return
}
level.Info(n.logger).Log("msg","Draining any remaining notifications...")