Fix updating rule manager never finishing (#7138)

Rather than sending a value to the done channel on a group to indicate
whether or not to add stale markers to a closing rule group, use an
explicit boolean. This allows more functions than just run() to read
from the done channel and fixes an issue where Eval() could consume the
value sent on it during an update, causing run() to never return.

Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com>
pull/7139/head
Chris Marchbanks 2020-04-18 06:32:18 -06:00 committed by GitHub
parent ca23cd064e
commit a7b449320d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 11 additions and 17 deletions

View File

@ -232,7 +232,8 @@ type Group struct {
shouldRestore bool
done chan bool
markStale bool
done chan struct{}
terminated chan struct{}
managerDone chan struct{}
@ -270,7 +271,7 @@ func NewGroup(o GroupOptions) *Group {
shouldRestore: o.ShouldRestore,
opts: o.Opts,
seriesInPreviousEval: make([]map[string]labels.Labels, len(o.Rules)),
done: make(chan bool),
done: make(chan struct{}),
managerDone: o.done,
terminated: make(chan struct{}),
logger: log.With(o.Opts.Logger, "group", o.Name),
@ -326,8 +327,8 @@ func (g *Group) run(ctx context.Context) {
tick := time.NewTicker(g.interval)
defer tick.Stop()
makeStale := func(s bool) {
if !s {
defer func() {
if !g.markStale {
return
}
go func(now time.Time) {
@ -347,7 +348,7 @@ func (g *Group) run(ctx context.Context) {
g.cleanupStaleSeries(now)
}
}(time.Now())
}
}()
iter()
if g.shouldRestore {
@ -356,8 +357,7 @@ func (g *Group) run(ctx context.Context) {
// we might not have enough data scraped, and recording rules would not
// have updated the latest values, on which some alerts might depend.
select {
case stale := <-g.done:
makeStale(stale)
case <-g.done:
return
case <-tick.C:
missed := (time.Since(evalTimestamp) / g.interval) - 1
@ -375,13 +375,11 @@ func (g *Group) run(ctx context.Context) {
for {
select {
case stale := <-g.done:
makeStale(stale)
case <-g.done:
return
default:
select {
case stale := <-g.done:
makeStale(stale)
case <-g.done:
return
case <-tick.C:
missed := (time.Since(evalTimestamp) / g.interval) - 1
@ -396,11 +394,6 @@ func (g *Group) run(ctx context.Context) {
}
}
func (g *Group) stopAndMakeStale() {
g.done <- true
<-g.terminated
}
func (g *Group) stop() {
close(g.done)
<-g.terminated
@ -943,7 +936,8 @@ func (m *Manager) Update(interval time.Duration, files []string, externalLabels
wg.Add(len(m.groups))
for n, oldg := range m.groups {
go func(n string, g *Group) {
g.stopAndMakeStale()
g.markStale = true
g.stop()
if m := g.metrics; m != nil {
m.groupInterval.DeleteLabelValues(n)
m.groupLastEvalTime.DeleteLabelValues(n)