From ed0c68224ee7a76d93230c1b38af369d330ee007 Mon Sep 17 00:00:00 2001 From: David N Perkins Date: Thu, 31 Mar 2022 08:50:17 -0400 Subject: [PATCH] Azure SD Failure metric and 404 Handling (#10476) * For Azure sd, added failure counter and skipping of 404's from Nic lookup Signed-off-by: David N Perkins --- discovery/azure/azure.go | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/discovery/azure/azure.go b/discovery/azure/azure.go index 4940daf1b..e7ab63a21 100644 --- a/discovery/azure/azure.go +++ b/discovery/azure/azure.go @@ -30,6 +30,7 @@ import ( "github.com/go-kit/log" "github.com/go-kit/log/level" "github.com/pkg/errors" + "github.com/prometheus/client_golang/prometheus" config_util "github.com/prometheus/common/config" "github.com/prometheus/common/model" "github.com/prometheus/common/version" @@ -70,10 +71,17 @@ var ( AuthenticationMethod: authMethodOAuth, HTTPClientConfig: config_util.DefaultHTTPClientConfig, } + + failuresCount = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "prometheus_sd_azure_failures_total", + Help: "Number of Azure service discovery refresh failures.", + }) ) func init() { discovery.RegisterConfig(&SDConfig{}) + prometheus.MustRegister(failuresCount) } // SDConfig is the configuration for Azure based service discovery. @@ -279,11 +287,13 @@ func (d *Discovery) refresh(ctx context.Context) ([]*targetgroup.Group, error) { client, err := createAzureClient(*d.cfg) if err != nil { + failuresCount.Inc() return nil, errors.Wrap(err, "could not create Azure client") } machines, err := client.getVMs(ctx, d.cfg.ResourceGroup) if err != nil { + failuresCount.Inc() return nil, errors.Wrap(err, "could not get virtual machines") } @@ -292,12 +302,14 @@ func (d *Discovery) refresh(ctx context.Context) ([]*targetgroup.Group, error) { // Load the vms managed by scale sets. scaleSets, err := client.getScaleSets(ctx, d.cfg.ResourceGroup) if err != nil { + failuresCount.Inc() return nil, errors.Wrap(err, "could not get virtual machine scale sets") } for _, scaleSet := range scaleSets { scaleSetVms, err := client.getScaleSetVMs(ctx, scaleSet) if err != nil { + failuresCount.Inc() return nil, errors.Wrap(err, "could not get virtual machine scale set vms") } machines = append(machines, scaleSetVms...) @@ -313,8 +325,8 @@ func (d *Discovery) refresh(ctx context.Context) ([]*targetgroup.Group, error) { var wg sync.WaitGroup wg.Add(len(machines)) ch := make(chan target, len(machines)) - for i, vm := range machines { - go func(i int, vm virtualMachine) { + for _, vm := range machines { + go func(vm virtualMachine) { defer wg.Done() r, err := newAzureResourceFromID(vm.ID, d.logger) if err != nil { @@ -346,8 +358,11 @@ func (d *Discovery) refresh(ctx context.Context) ([]*targetgroup.Group, error) { for _, nicID := range vm.NetworkInterfaces { networkInterface, err := client.getNetworkInterfaceByID(ctx, nicID) if err != nil { - level.Error(d.logger).Log("msg", "Unable to get network interface", "name", nicID, "err", err) - ch <- target{labelSet: nil, err: err} + if errors.Is(err, errorNotFound) { + level.Warn(d.logger).Log("msg", "Network interface does not exist", "name", nicID, "err", err) + } else { + ch <- target{labelSet: nil, err: err} + } // Get out of this routine because we cannot continue without a network interface. return } @@ -387,7 +402,7 @@ func (d *Discovery) refresh(ctx context.Context) ([]*targetgroup.Group, error) { } } } - }(i, vm) + }(vm) } wg.Wait() @@ -396,6 +411,7 @@ func (d *Discovery) refresh(ctx context.Context) ([]*targetgroup.Group, error) { var tg targetgroup.Group for tgt := range ch { if tgt.err != nil { + failuresCount.Inc() return nil, errors.Wrap(tgt.err, "unable to complete Azure service discovery") } if tgt.labelSet != nil { @@ -561,6 +577,11 @@ func mapFromVMScaleSetVM(vm compute.VirtualMachineScaleSetVM, scaleSetName strin } } +var errorNotFound = errors.New("network interface does not exist") + +// getNetworkInterfaceByID gets the network interface. +// If a 404 is returned from the Azure API, `errorNotFound` is returned. +// On all other errors, an autorest.DetailedError is returned. func (client *azureClient) getNetworkInterfaceByID(ctx context.Context, networkInterfaceID string) (*network.Interface, error) { result := network.Interface{} queryParameters := map[string]interface{}{ @@ -585,6 +606,9 @@ func (client *azureClient) getNetworkInterfaceByID(ctx context.Context, networkI result, err = client.nic.GetResponder(resp) if err != nil { + if resp.StatusCode == http.StatusNotFound { + return nil, errorNotFound + } return nil, autorest.NewErrorWithError(err, "network.InterfacesClient", "Get", resp, "Failure responding to request") }