Added a failure counter to the HTTP service discovery (#10372)

* Added a failure counter to the HTTP service discovery

Signed-off-by: David N Perkins <David.N.Perkins@ibm.com>
pull/10413/head
David N Perkins 2022-03-08 08:10:45 -05:00 committed by GitHub
parent 025528a5d6
commit 097b359b41
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 49 additions and 3 deletions

View File

@ -28,6 +28,7 @@ import (
"github.com/go-kit/log"
"github.com/grafana/regexp"
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/common/config"
"github.com/prometheus/common/model"
"github.com/prometheus/common/version"
@ -45,10 +46,17 @@ var (
}
userAgent = fmt.Sprintf("Prometheus/%s", version.Version)
matchContentType = regexp.MustCompile(`^(?i:application\/json(;\s*charset=("utf-8"|utf-8))?)$`)
failuresCount = prometheus.NewCounter(
prometheus.CounterOpts{
Name: "prometheus_sd_http_failures_total",
Help: "Number of HTTP service discovery refresh failures.",
})
)
// init runs at package load: it hands an empty SDConfig to
// discovery.RegisterConfig and registers the failuresCount counter through
// prometheus.MustRegister so the refresh-failure metric is exposed.
func init() {
discovery.RegisterConfig(&SDConfig{})
prometheus.MustRegister(failuresCount)
}
// SDConfig is the configuration for HTTP based discovery.
@ -145,6 +153,7 @@ func (d *Discovery) refresh(ctx context.Context) ([]*targetgroup.Group, error) {
resp, err := d.client.Do(req.WithContext(ctx))
if err != nil {
failuresCount.Inc()
return nil, err
}
defer func() {
@ -153,26 +162,31 @@ func (d *Discovery) refresh(ctx context.Context) ([]*targetgroup.Group, error) {
}()
if resp.StatusCode != http.StatusOK {
failuresCount.Inc()
return nil, errors.Errorf("server returned HTTP status %s", resp.Status)
}
if !matchContentType.MatchString(strings.TrimSpace(resp.Header.Get("Content-Type"))) {
failuresCount.Inc()
return nil, errors.Errorf("unsupported content type %q", resp.Header.Get("Content-Type"))
}
b, err := ioutil.ReadAll(resp.Body)
if err != nil {
failuresCount.Inc()
return nil, err
}
var targetGroups []*targetgroup.Group
if err := json.Unmarshal(b, &targetGroups); err != nil {
failuresCount.Inc()
return nil, err
}
for i, tg := range targetGroups {
if tg == nil {
failuresCount.Inc()
err = errors.New("nil target group item found")
return nil, err
}

View File

@ -22,6 +22,8 @@ import (
"time"
"github.com/go-kit/log"
"github.com/prometheus/client_golang/prometheus"
dto "github.com/prometheus/client_model/go"
"github.com/prometheus/common/config"
"github.com/prometheus/common/model"
"github.com/stretchr/testify/require"
@ -61,6 +63,7 @@ func TestHTTPValidRefresh(t *testing.T) {
},
}
require.Equal(t, tgs, expectedTargets)
require.Equal(t, 0.0, getFailureCount())
}
func TestHTTPInvalidCode(t *testing.T) {
@ -82,6 +85,7 @@ func TestHTTPInvalidCode(t *testing.T) {
ctx := context.Background()
_, err = d.refresh(ctx)
require.EqualError(t, err, "server returned HTTP status 400 Bad Request")
require.Equal(t, 1.0, getFailureCount())
}
func TestHTTPInvalidFormat(t *testing.T) {
@ -103,6 +107,32 @@ func TestHTTPInvalidFormat(t *testing.T) {
ctx := context.Background()
_, err = d.refresh(ctx)
require.EqualError(t, err, `unsupported content type "text/plain; charset=utf-8"`)
require.Equal(t, 1.0, getFailureCount())
}
// lastFailureCount holds the cumulative failuresCount value seen by the most
// recent getFailureCount call, so later calls can report only the difference.
var lastFailureCount float64

// getFailureCount returns the number of failures recorded by failuresCount
// since the previous call to getFailureCount (or since process start on the
// first call). failuresCount is a package-level counter shared by every test,
// so the cumulative value is turned into a per-call delta.
//
// It panics if the collected metric cannot be serialized; this is a test
// helper without access to *testing.T, and continuing would only report a
// misleading count.
func getFailureCount() float64 {
	failureChan := make(chan prometheus.Metric)

	// Collect blocks on an unbuffered channel, so it runs in its own
	// goroutine; closing the channel ends the range loop below.
	go func() {
		failuresCount.Collect(failureChan)
		close(failureChan)
	}()

	var counter dto.Metric
	for metric := range failureChan {
		if err := metric.Write(&counter); err != nil {
			panic(err)
		}
	}

	// account for failures in prior tests
	total := counter.GetCounter().GetValue()
	count := total - lastFailureCount
	lastFailureCount = total
	return count
}
func TestContentTypeRegex(t *testing.T) {

View File

@ -1448,8 +1448,9 @@ Example response body:
]
```
The endpoint is queried periodically at the specified
refresh interval.
The endpoint is queried periodically at the specified refresh interval.
The `prometheus_sd_http_failures_total` counter metric tracks the number of
refresh failures.
Each target has a meta label `__meta_url` during the
[relabeling phase](#relabel_config). Its value is set to the

View File

@ -40,7 +40,8 @@ an empty list `[]`. Target lists are unordered.
Prometheus caches target lists. If an error occurs while fetching an updated
targets list, Prometheus keeps using the current targets list. The targets list
is not saved across restart.
is not saved across restarts. The `prometheus_sd_http_failures_total` counter
metric tracks the number of refresh failures.
The whole list of targets must be returned on every scrape. There is no support
for incremental updates. A Prometheus instance does not send its hostname and it