mirror of https://github.com/prometheus/prometheus
SD: Add target creation failure counter and change failure handling (#8786)
* Added metric and changed failure/drop strategy

Signed-off-by: Levi Harrison <git@leviharrison.dev>

pull/8833/head
parent
ae086c73cb
commit
2826fbeeb7
|
@ -275,6 +275,20 @@
|
|||
description: 'Prometheus %(prometheusName)s has dropped {{ printf "%%.0f" $value }} targets because some samples exceeded the configured label_limit, label_name_length_limit or label_value_length_limit.' % $._config,
|
||||
},
|
||||
},
|
||||
// Alert rule that fires when prometheus_target_sync_failed_total has
// increased over the last 30 minutes for the selected Prometheus, and the
// condition has held for 5 minutes. Raised with severity 'critical'.
{
|
||||
alert: 'PrometheusTargetSyncFailure',
|
||||
expr: |||
|
||||
increase(prometheus_target_sync_failed_total{%(prometheusSelector)s}[30m]) > 0
|
||||
||| % $._config,
|
||||
'for': '5m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'Prometheus has failed to sync targets.',
|
||||
description: '{{ printf "%%.0f" $value }} targets in Prometheus %(prometheusName)s have failed to sync because invalid configuration was supplied.' % $._config,
|
||||
},
|
||||
},
|
||||
] + if $._config.prometheusHAGroupLabels == '' then self.rulesWithoutHA else self.rulesWithHA,
|
||||
rulesWithoutHA:: [
|
||||
{
|
||||
|
|
|
@ -176,6 +176,13 @@ var (
|
|||
Help: "Total number of times scrape pools hit the label limits, during sync or config reload.",
|
||||
},
|
||||
)
|
||||
// targetSyncFailed counts target sync failures, partitioned by the
// "scrape_job" label. It is exposed as prometheus_target_sync_failed_total.
targetSyncFailed = prometheus.NewCounterVec(
|
||||
prometheus.CounterOpts{
|
||||
Name: "prometheus_target_sync_failed_total",
|
||||
Help: "Total number of target sync failures.",
|
||||
},
|
||||
[]string{"scrape_job"},
|
||||
)
|
||||
)
|
||||
|
||||
func init() {
|
||||
|
@ -199,6 +206,7 @@ func init() {
|
|||
targetMetadataCache,
|
||||
targetScrapeExemplarOutOfOrder,
|
||||
targetScrapePoolExceededLabelLimits,
|
||||
targetSyncFailed,
|
||||
)
|
||||
}
|
||||
|
||||
|
@ -346,6 +354,7 @@ func (sp *scrapePool) stop() {
|
|||
targetScrapePoolTargetLimit.DeleteLabelValues(sp.config.JobName)
|
||||
targetScrapePoolTargetsAdded.DeleteLabelValues(sp.config.JobName)
|
||||
targetSyncIntervalLength.DeleteLabelValues(sp.config.JobName)
|
||||
targetSyncFailed.DeleteLabelValues(sp.config.JobName)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -445,11 +454,11 @@ func (sp *scrapePool) Sync(tgs []*targetgroup.Group) {
|
|||
var all []*Target
|
||||
sp.droppedTargets = []*Target{}
|
||||
for _, tg := range tgs {
|
||||
targets, err := targetsFromGroup(tg, sp.config)
|
||||
if err != nil {
|
||||
level.Error(sp.logger).Log("msg", "creating targets failed", "err", err)
|
||||
continue
|
||||
targets, failures := targetsFromGroup(tg, sp.config)
|
||||
for _, err := range failures {
|
||||
level.Error(sp.logger).Log("msg", "Creating target failed", "err", err)
|
||||
}
|
||||
targetSyncFailed.WithLabelValues(sp.config.JobName).Add(float64(len(failures)))
|
||||
for _, t := range targets {
|
||||
if t.Labels().Len() > 0 {
|
||||
all = append(all, t)
|
||||
|
|
|
@ -414,8 +414,9 @@ func populateLabels(lset labels.Labels, cfg *config.ScrapeConfig) (res, orig lab
|
|||
}
|
||||
|
||||
// targetsFromGroup builds targets based on the given TargetGroup and config.
|
||||
func targetsFromGroup(tg *targetgroup.Group, cfg *config.ScrapeConfig) ([]*Target, error) {
|
||||
func targetsFromGroup(tg *targetgroup.Group, cfg *config.ScrapeConfig) ([]*Target, []error) {
|
||||
targets := make([]*Target, 0, len(tg.Targets))
|
||||
failures := []error{}
|
||||
|
||||
for i, tlset := range tg.Targets {
|
||||
lbls := make([]labels.Label, 0, len(tlset)+len(tg.Labels))
|
||||
|
@ -433,11 +434,11 @@ func targetsFromGroup(tg *targetgroup.Group, cfg *config.ScrapeConfig) ([]*Targe
|
|||
|
||||
lbls, origLabels, err := populateLabels(lset, cfg)
|
||||
if err != nil {
|
||||
return nil, errors.Wrapf(err, "instance %d in group %s", i, tg)
|
||||
failures = append(failures, errors.Wrapf(err, "instance %d in group %s", i, tg))
|
||||
}
|
||||
if lbls != nil || origLabels != nil {
|
||||
targets = append(targets, NewTarget(lbls, origLabels, cfg.Params))
|
||||
}
|
||||
}
|
||||
return targets, nil
|
||||
return targets, failures
|
||||
}
|
||||
|
|
|
@ -29,6 +29,8 @@ import (
|
|||
"github.com/prometheus/common/model"
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/prometheus/prometheus/config"
|
||||
"github.com/prometheus/prometheus/discovery/targetgroup"
|
||||
"github.com/prometheus/prometheus/pkg/labels"
|
||||
)
|
||||
|
||||
|
@ -365,3 +367,18 @@ func TestNewClientWithBadTLSConfig(t *testing.T) {
|
|||
t.Fatalf("Expected error, got nil.")
|
||||
}
|
||||
}
|
||||
|
||||
// TestTargetsFromGroup checks that targetsFromGroup keeps going after a
// per-target failure: given one empty label set and one label set with an
// address, it must return exactly one target and exactly one failure, and
// the failure message must identify the failing instance.
func TestTargetsFromGroup(t *testing.T) {
|
||||
expectedError := "instance 0 in group : no address"
|
||||
|
||||
// The first label set is empty (no address label); the second carries
// model.AddressLabel set to "localhost:9090".
targets, failures := targetsFromGroup(&targetgroup.Group{Targets: []model.LabelSet{{}, {model.AddressLabel: "localhost:9090"}}}, &config.ScrapeConfig{})
|
||||
if len(targets) != 1 {
|
||||
t.Fatalf("Expected 1 target, got %v", len(targets))
|
||||
}
|
||||
if len(failures) != 1 {
|
||||
t.Fatalf("Expected 1 failure, got %v", len(failures))
|
||||
}
|
||||
if failures[0].Error() != expectedError {
|
||||
t.Fatalf("Expected error %s, got %s", expectedError, failures[0])
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue