SD: Add target creation failure counter and change failure handling (#8786)

* Added metric and changed failure/drop strategy

Signed-off-by: Levi Harrison <git@leviharrison.dev>
pull/8833/head
Levi Harrison 2021-05-28 17:50:59 -04:00 committed by GitHub
parent ae086c73cb
commit 2826fbeeb7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 48 additions and 7 deletions

View File

@ -275,6 +275,20 @@
description: 'Prometheus %(prometheusName)s has dropped {{ printf "%%.0f" $value }} targets because some samples exceeded the configured label_limit, label_name_length_limit or label_value_length_limit.' % $._config,
},
},
{
alert: 'PrometheusTargetSyncFailure',
expr: |||
increase(prometheus_target_sync_failed_total{%(prometheusSelector)s}[30m]) > 0
||| % $._config,
'for': '5m',
labels: {
severity: 'critical',
},
annotations: {
summary: 'Prometheus has failed to sync targets.',
description: '{{ printf "%%.0f" $value }} targets in Prometheus %(prometheusName)s have failed to sync because invalid configuration was supplied.' % $._config,
},
},
] + if $._config.prometheusHAGroupLabels == '' then self.rulesWithoutHA else self.rulesWithHA,
rulesWithoutHA:: [
{

View File

@ -176,6 +176,13 @@ var (
Help: "Total number of times scrape pools hit the label limits, during sync or config reload.",
},
)
targetSyncFailed = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "prometheus_target_sync_failed_total",
Help: "Total number of target sync failures.",
},
[]string{"scrape_job"},
)
)
func init() {
@ -199,6 +206,7 @@ func init() {
targetMetadataCache,
targetScrapeExemplarOutOfOrder,
targetScrapePoolExceededLabelLimits,
targetSyncFailed,
)
}
@ -346,6 +354,7 @@ func (sp *scrapePool) stop() {
targetScrapePoolTargetLimit.DeleteLabelValues(sp.config.JobName)
targetScrapePoolTargetsAdded.DeleteLabelValues(sp.config.JobName)
targetSyncIntervalLength.DeleteLabelValues(sp.config.JobName)
targetSyncFailed.DeleteLabelValues(sp.config.JobName)
}
}
@ -445,11 +454,11 @@ func (sp *scrapePool) Sync(tgs []*targetgroup.Group) {
var all []*Target
sp.droppedTargets = []*Target{}
for _, tg := range tgs {
targets, err := targetsFromGroup(tg, sp.config)
if err != nil {
level.Error(sp.logger).Log("msg", "creating targets failed", "err", err)
continue
targets, failures := targetsFromGroup(tg, sp.config)
for _, err := range failures {
level.Error(sp.logger).Log("msg", "Creating target failed", "err", err)
}
targetSyncFailed.WithLabelValues(sp.config.JobName).Add(float64(len(failures)))
for _, t := range targets {
if t.Labels().Len() > 0 {
all = append(all, t)

View File

@ -414,8 +414,9 @@ func populateLabels(lset labels.Labels, cfg *config.ScrapeConfig) (res, orig lab
}
// targetsFromGroup builds targets based on the given TargetGroup and config.
func targetsFromGroup(tg *targetgroup.Group, cfg *config.ScrapeConfig) ([]*Target, error) {
func targetsFromGroup(tg *targetgroup.Group, cfg *config.ScrapeConfig) ([]*Target, []error) {
targets := make([]*Target, 0, len(tg.Targets))
failures := []error{}
for i, tlset := range tg.Targets {
lbls := make([]labels.Label, 0, len(tlset)+len(tg.Labels))
@ -433,11 +434,11 @@ func targetsFromGroup(tg *targetgroup.Group, cfg *config.ScrapeConfig) ([]*Targe
lbls, origLabels, err := populateLabels(lset, cfg)
if err != nil {
return nil, errors.Wrapf(err, "instance %d in group %s", i, tg)
failures = append(failures, errors.Wrapf(err, "instance %d in group %s", i, tg))
}
if lbls != nil || origLabels != nil {
targets = append(targets, NewTarget(lbls, origLabels, cfg.Params))
}
}
return targets, nil
return targets, failures
}

View File

@ -29,6 +29,8 @@ import (
"github.com/prometheus/common/model"
"github.com/stretchr/testify/require"
"github.com/prometheus/prometheus/config"
"github.com/prometheus/prometheus/discovery/targetgroup"
"github.com/prometheus/prometheus/pkg/labels"
)
@ -365,3 +367,18 @@ func TestNewClientWithBadTLSConfig(t *testing.T) {
t.Fatalf("Expected error, got nil.")
}
}
func TestTargetsFromGroup(t *testing.T) {
expectedError := "instance 0 in group : no address"
targets, failures := targetsFromGroup(&targetgroup.Group{Targets: []model.LabelSet{{}, {model.AddressLabel: "localhost:9090"}}}, &config.ScrapeConfig{})
if len(targets) != 1 {
t.Fatalf("Expected 1 target, got %v", len(targets))
}
if len(failures) != 1 {
t.Fatalf("Expected 1 failure, got %v", len(failures))
}
if failures[0].Error() != expectedError {
t.Fatalf("Expected error %s, got %s", expectedError, failures[0])
}
}