From 3d2fabb013ed1cd280e1711557382d64f536f327 Mon Sep 17 00:00:00 2001 From: Brad Davidson Date: Thu, 5 Dec 2024 01:37:08 +0000 Subject: [PATCH] Add loadbalancer metrics Signed-off-by: Brad Davidson --- pkg/agent/loadbalancer/loadbalancer.go | 13 ++++++++++- pkg/agent/loadbalancer/metrics.go | 30 ++++++++++++++++++++++++++ pkg/agent/loadbalancer/servers.go | 9 +++++++- pkg/metrics/metrics.go | 3 +++ 4 files changed, 53 insertions(+), 2 deletions(-) create mode 100644 pkg/agent/loadbalancer/metrics.go diff --git a/pkg/agent/loadbalancer/loadbalancer.go b/pkg/agent/loadbalancer/loadbalancer.go index 2f6d33fbf4..09727db189 100644 --- a/pkg/agent/loadbalancer/loadbalancer.go +++ b/pkg/agent/loadbalancer/loadbalancer.go @@ -8,6 +8,7 @@ import ( "os" "path/filepath" "strings" + "time" "github.com/inetaf/tcpproxy" "github.com/k3s-io/k3s/pkg/version" @@ -95,8 +96,18 @@ func New(ctx context.Context, dataDir, serviceName, defaultServerURL string, lbS } lb.proxy.AddRoute(serviceName, &tcpproxy.DialProxy{ Addr: serviceName, - DialContext: lb.servers.dialContext, OnDialError: onDialError, + DialContext: func(ctx context.Context, network, address string) (net.Conn, error) { + start := time.Now() + status := "success" + conn, err := lb.servers.dialContext(ctx, network, address) + latency := time.Since(start) + if err != nil { + status = "error" + } + loadbalancerDials.WithLabelValues(serviceName, status).Observe(latency.Seconds()) + return conn, err + }, }) if err := lb.updateConfig(); err != nil { diff --git a/pkg/agent/loadbalancer/metrics.go b/pkg/agent/loadbalancer/metrics.go new file mode 100644 index 0000000000..11f27486ed --- /dev/null +++ b/pkg/agent/loadbalancer/metrics.go @@ -0,0 +1,30 @@ +package loadbalancer + +import ( + "github.com/k3s-io/k3s/pkg/version" + "github.com/prometheus/client_golang/prometheus" + "k8s.io/component-base/metrics" +) + +var ( + loadbalancerConnections = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: version.Program + "_loadbalancer_server_connections", + Help: "Count of current connections to loadbalancer server", + }, []string{"name", "server"}) + + loadbalancerState = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: version.Program + "_loadbalancer_server_health", + Help: "Current health value of loadbalancer server", + }, []string{"name", "server"}) + + loadbalancerDials = prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Name: version.Program + "_loadbalancer_dial_duration_seconds", + Help: "Time taken to dial a connection to a backend server", + Buckets: metrics.ExponentialBuckets(0.001, 2, 15), + }, []string{"name", "status"}) +) + +// MustRegister registers loadbalancer metrics +func MustRegister(registerer prometheus.Registerer) { + registerer.MustRegister(loadbalancerConnections, loadbalancerState, loadbalancerDials) +} diff --git a/pkg/agent/loadbalancer/servers.go b/pkg/agent/loadbalancer/servers.go index 7cdf8466ed..13334ea881 100644 --- a/pkg/agent/loadbalancer/servers.go +++ b/pkg/agent/loadbalancer/servers.go @@ -79,6 +79,9 @@ func (sl *serverList) setAddresses(serviceName string, addresses []string) bool // set state to invalid to prevent server from making additional connections s.state = stateInvalid closeAllFuncs = append(closeAllFuncs, s.closeAll) + // remove metrics + loadbalancerState.DeleteLabelValues(serviceName, s.address) + loadbalancerConnections.DeleteLabelValues(serviceName, s.address) return true } return false @@ -459,7 +462,7 @@ func (sc *serverConn) Close() error { return sc.Conn.Close() } -// runHealthChecks periodically health-checks all servers. +// runHealthChecks periodically health-checks all servers and updates metrics func (sl *serverList) runHealthChecks(ctx context.Context, serviceName string) { wait.Until(func() { for _, s := range sl.getServers() { @@ -469,6 +472,10 @@ func (sl *serverList) runHealthChecks(ctx context.Context, serviceName string) { case HealthCheckResultFailed: sl.recordFailure(s, reasonHealthCheck) } + if s.state != stateInvalid { + loadbalancerState.WithLabelValues(serviceName, s.address).Set(float64(s.state)) + loadbalancerConnections.WithLabelValues(serviceName, s.address).Set(float64(len(s.connections))) + } } }, time.Second, ctx.Done()) logrus.Debugf("Stopped health checking for load balancer %s", serviceName) diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index a769e6a384..eccb4abb0b 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -6,6 +6,7 @@ import ( "github.com/gorilla/mux" "github.com/k3s-io/k3s/pkg/agent/https" + "github.com/k3s-io/k3s/pkg/agent/loadbalancer" "github.com/k3s-io/k3s/pkg/daemons/config" "github.com/prometheus/client_golang/prometheus/promhttp" lassometrics "github.com/rancher/lasso/pkg/metrics" @@ -32,6 +33,8 @@ var DefaultMetrics = &Config{ func init() { // ensure that lasso exposes metrics through the same registry used by Kubernetes components lassometrics.MustRegister(DefaultRegisterer) + // same for loadbalancer metrics + loadbalancer.MustRegister(DefaultRegisterer) } // Config holds fields for the metrics listener