mirror of https://github.com/k3s-io/k3s
Merge pull request #41080 from shyamjvs/etcd-version-monitor
Automatic merge from submit-queue Added a basic monitor for providing etcd version related info Fixes #41071 This tool scrapes metrics partly from etcd's /version and /metrics endpoints and partly using etcdctl and exposes them as prometheus metrics at `http://localhost:9101/metrics` endpoint on the master. Here is a summary of the metrics it exposes (self-explanatory from the code): - etcdVersionFetchCount = prometheus.NewCounterVec( prometheus.CounterOpts{ Namespace: "etcd", Name: "version_info_fetch_count", Help: "Number of times etcd's version info was fetched, labeled by etcd's server binary and cluster version", }, []string{"serverversion", "clusterversion"}) - etcdGRPCRequestsTotal = prometheus.NewCounterVec( prometheus.CounterOpts{ Namespace: namespace, Name: "grpc_requests_total", Help: "Counter of received grpc requests, labeled by grpc method and grpc service names", }, []string{"grpc_method", "grpc_service"}) For further info on how to run this as a binary/docker-container/kubernetes-pod and checking the metrics, have a look at the README.md file. cc @fgrzadkowski @wojtek-t @pioszpull/6/head
commit
6d5b2ef49e
|
@ -0,0 +1,20 @@
|
|||
# Copyright 2017 The Kubernetes Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# The monitor is a statically linked Go binary (built with CGO_ENABLED=0), so
# an empty base image is sufficient.
FROM scratch
# Use the key="value" form; the whitespace-separated "LABEL key value" form is
# the legacy syntax.
LABEL maintainer="Shyam JVS <shyamjvs@google.com>"

COPY etcd-version-monitor /etcd-version-monitor

# Port on which the monitor serves its prometheus metrics by default.
EXPOSE 9101
|
|
@ -0,0 +1,43 @@
|
|||
# Copyright 2017 The Kubernetes Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Build the etcd-version-monitor image
|
||||
#
|
||||
# Usage:
|
||||
# [GOLANG_VERSION=1.7.4] [REGISTRY=gcr.io/google-containers] [TAG=test] make (build|push)
|
||||
# TODO(shyamjvs): Support architectures other than amd64 if needed.
|
||||
ARCH:=amd64
GOLANG_VERSION?=1.7.4
REGISTRY?=gcr.io/google-containers
TAG?=0.1.0
IMAGE:=$(REGISTRY)/etcd-version-monitor:$(TAG)
# $(CURDIR) is make's built-in absolute working directory. The previous
# "$(pwd)" expanded an (unset) make variable named "pwd" and was always empty.
CURRENT_DIR:=$(CURDIR)
# Scratch directory used as the docker build context; removed at the end of
# the build target.
TEMP_DIR:=$(shell mktemp -d)

# Compile the monitor inside a golang container and package it into $(IMAGE).
build:
	# Copy the necessary files for building the image to TEMP_DIR.
	cp etcd-version-monitor.go Dockerfile $(TEMP_DIR)

	# Compile etcd-version-monitor.
	docker run -it -v $(CURRENT_DIR)/../../../:/go/src/k8s.io/kubernetes -v $(TEMP_DIR):/build -e GOARCH=$(ARCH) golang:$(GOLANG_VERSION) \
		/bin/bash -c "CGO_ENABLED=0 go build -o /build/etcd-version-monitor k8s.io/kubernetes/cluster/images/etcd-version-monitor"

	docker build -t $(IMAGE) $(TEMP_DIR)

	# Remove the scratch build context so repeated builds do not leak temp dirs.
	rm -rf $(TEMP_DIR)

# Push the freshly built image to $(REGISTRY).
push: build
	gcloud docker -- push $(IMAGE)

all: build

# None of these targets produce a file of the same name.
.PHONY: all build push
|
|
@ -0,0 +1,25 @@
|
|||
# etcd-version-monitor
|
||||
|
||||
This is a tool for exporting metrics related to etcd version, like etcd
|
||||
server's binary version, cluster version, and counts of different kinds of
|
||||
gRPC calls (which is a characteristic of v3), etc. These metrics are in
|
||||
prometheus format and can be scraped by a prometheus server.
|
||||
The metrics are exposed at the http://localhost:9101/metrics endpoint.
|
||||
|
||||
**RUNNING THE TOOL**
|
||||
|
||||
To run this tool as a docker container:
|
||||
- make build
|
||||
- docker run --net=host -i -t gcr.io/google-containers/etcd-version-monitor:0.1.0 /etcd-version-monitor --logtostderr
|
||||
|
||||
To run this as a pod on the kubernetes cluster:
|
||||
- Place the 'etcd-version-monitor.yaml' in the manifests directory of
|
||||
kubelet on the master machine.
|
||||
|
||||
*Note*: This tool has to run on the same machine as etcd, as communication
|
||||
with etcd is over localhost.
|
||||
|
||||
**VERIFYING THE TOOL**
|
||||
|
||||
- Go to [http://localhost:9101/metrics](http://localhost:9101/metrics) in order to view the exported metrics.
|
||||
- The metrics prefixed with "etcd_" are the ones of interest to us.
|
|
@ -0,0 +1,233 @@
|
|||
/*
|
||||
Copyright 2017 The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
goflag "flag"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"time"
|
||||
|
||||
"github.com/golang/glog"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/common/expfmt"
|
||||
"github.com/spf13/pflag"
|
||||
)
|
||||
|
||||
// Initialize the prometheus instrumentation and client related flags.
|
||||
var (
|
||||
listenAddress string
|
||||
metricsPath string
|
||||
etcdVersionScrapeURI string
|
||||
etcdMetricsScrapeURI string
|
||||
scrapeTimeout time.Duration
|
||||
)
|
||||
|
||||
func registerFlags(fs *pflag.FlagSet) {
|
||||
fs.StringVar(&listenAddress, "listen-address", "localhost:9101", "Address to listen on for serving prometheus metrics")
|
||||
fs.StringVar(&metricsPath, "metrics-path", "/metrics", "Path under which prometheus metrics are to be served")
|
||||
fs.StringVar(&etcdVersionScrapeURI, "etcd-version-scrape-uri", "http://localhost:2379/version", "URI to scrape etcd version info")
|
||||
fs.StringVar(&etcdMetricsScrapeURI, "etcd-metrics-scrape-uri", "http://localhost:2379/metrics", "URI to scrape etcd metrics")
|
||||
fs.DurationVar(&scrapeTimeout, "scrape-timeout", 15*time.Second, "Timeout for trying to get stats from etcd")
|
||||
}
|
||||
|
||||
const (
|
||||
namespace = "etcd" // For prefixing prometheus metrics
|
||||
)
|
||||
|
||||
// Initialize prometheus metrics to be exported.
|
||||
var (
|
||||
etcdVersion = prometheus.NewGaugeVec(
|
||||
prometheus.GaugeOpts{
|
||||
Namespace: namespace,
|
||||
Name: "version_info",
|
||||
Help: "Etcd server's binary version",
|
||||
},
|
||||
[]string{"binary_version"})
|
||||
etcdGRPCRequestsTotal = prometheus.NewCounterVec(
|
||||
prometheus.CounterOpts{
|
||||
Namespace: namespace,
|
||||
Name: "grpc_requests_total",
|
||||
Help: "Counter of received grpc requests, labeled by the grpc method and service names",
|
||||
},
|
||||
[]string{"method", "service"})
|
||||
)
|
||||
|
||||
// Struct for unmarshalling the json response from etcd's /version endpoint.
|
||||
type EtcdVersion struct {
|
||||
BinaryVersion string `json:"etcdserver"`
|
||||
ClusterVersion string `json:"etcdcluster"`
|
||||
}
|
||||
|
||||
// Function for fetching etcd version info and feeding it to the prometheus metric.
|
||||
func getVersion(lastSeenBinaryVersion *string) error {
|
||||
// Create the get request for the etcd version endpoint.
|
||||
req, err := http.NewRequest("GET", etcdVersionScrapeURI, nil)
|
||||
if err != nil {
|
||||
return fmt.Errorf("Failed to create GET request for etcd version: %v", err)
|
||||
}
|
||||
|
||||
// Send the get request and receive a response.
|
||||
client := &http.Client{}
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
return fmt.Errorf("Failed to receive GET response for etcd version: %v", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
// Obtain EtcdVersion from the JSON response.
|
||||
var version EtcdVersion
|
||||
if err := json.NewDecoder(resp.Body).Decode(&version); err != nil {
|
||||
return fmt.Errorf("Failed to decode etcd version JSON: %v", err)
|
||||
}
|
||||
|
||||
// Return without updating the version if it stayed the same since last time.
|
||||
if *lastSeenBinaryVersion == version.BinaryVersion {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Delete the metric for the previous version.
|
||||
if *lastSeenBinaryVersion != "" {
|
||||
deleted := etcdVersion.Delete(prometheus.Labels{"binary_version": *lastSeenBinaryVersion})
|
||||
if !deleted {
|
||||
return fmt.Errorf("Failed to delete previous version's metric")
|
||||
}
|
||||
}
|
||||
|
||||
// Record the new version in a metric.
|
||||
etcdVersion.With(prometheus.Labels{
|
||||
"binary_version": version.BinaryVersion,
|
||||
}).Set(0)
|
||||
*lastSeenBinaryVersion = version.BinaryVersion
|
||||
return nil
|
||||
}
|
||||
|
||||
// Periodically fetches etcd version info.
|
||||
func getVersionPeriodically(stopCh <-chan struct{}) {
|
||||
lastSeenBinaryVersion := ""
|
||||
for {
|
||||
if err := getVersion(&lastSeenBinaryVersion); err != nil {
|
||||
glog.Errorf("Failed to fetch etcd version: %v", err)
|
||||
}
|
||||
select {
|
||||
case <-stopCh:
|
||||
break
|
||||
case <-time.After(scrapeTimeout):
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Struct for storing labels for gRPC request types.
|
||||
type GRPCRequestLabels struct {
|
||||
Method string
|
||||
Service string
|
||||
}
|
||||
|
||||
// Function for fetching etcd grpc request counts and feeding it to the prometheus metric.
|
||||
func getGRPCRequestCount(lastRecordedCount *map[GRPCRequestLabels]float64) error {
|
||||
// Create the get request for the etcd metrics endpoint.
|
||||
req, err := http.NewRequest("GET", etcdMetricsScrapeURI, nil)
|
||||
if err != nil {
|
||||
return fmt.Errorf("Failed to create GET request for etcd metrics: %v", err)
|
||||
}
|
||||
|
||||
// Send the get request and receive a response.
|
||||
client := &http.Client{}
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
return fmt.Errorf("Failed to receive GET response for etcd metrics: %v", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
// Parse the metrics in text format to a MetricFamily struct.
|
||||
var textParser expfmt.TextParser
|
||||
metricFamilies, err := textParser.TextToMetricFamilies(resp.Body)
|
||||
if err != nil {
|
||||
return fmt.Errorf("Failed to parse etcd metrics: %v", err)
|
||||
}
|
||||
|
||||
// Look through the grpc requests metric family and update our promotheus metric.
|
||||
for _, metric := range metricFamilies["etcd_grpc_requests_total"].GetMetric() {
|
||||
var grpcRequestLabels GRPCRequestLabels
|
||||
for _, label := range metric.GetLabel() {
|
||||
if label.GetName() == "grpc_method" {
|
||||
grpcRequestLabels.Method = label.GetValue()
|
||||
}
|
||||
if label.GetName() == "grpc_service" {
|
||||
grpcRequestLabels.Service = label.GetValue()
|
||||
}
|
||||
}
|
||||
if grpcRequestLabels.Method == "" || grpcRequestLabels.Service == "" {
|
||||
return fmt.Errorf("Could not get value for grpc_method and/or grpc_service label")
|
||||
}
|
||||
|
||||
// Get last recorded value and new value of the metric and update it suitably.
|
||||
previousMetricValue := 0.0
|
||||
if value, ok := (*lastRecordedCount)[grpcRequestLabels]; ok {
|
||||
previousMetricValue = value
|
||||
}
|
||||
newMetricValue := metric.GetCounter().GetValue()
|
||||
(*lastRecordedCount)[grpcRequestLabels] = newMetricValue
|
||||
if newMetricValue >= previousMetricValue {
|
||||
etcdGRPCRequestsTotal.With(prometheus.Labels{
|
||||
"method": grpcRequestLabels.Method,
|
||||
"service": grpcRequestLabels.Service,
|
||||
}).Add(newMetricValue - previousMetricValue)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Function for periodically fetching etcd GRPC request counts.
|
||||
func getGRPCRequestCountPeriodically(stopCh <-chan struct{}) {
|
||||
// This map stores last recorded count for a given grpc request type.
|
||||
lastRecordedCount := make(map[GRPCRequestLabels]float64)
|
||||
for {
|
||||
if err := getGRPCRequestCount(&lastRecordedCount); err != nil {
|
||||
glog.Errorf("Failed to fetch etcd grpc request counts: %v", err)
|
||||
}
|
||||
select {
|
||||
case <-stopCh:
|
||||
break
|
||||
case <-time.After(scrapeTimeout):
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func main() {
|
||||
// Register the commandline flags passed to the tool.
|
||||
registerFlags(pflag.CommandLine)
|
||||
pflag.CommandLine.AddGoFlagSet(goflag.CommandLine)
|
||||
pflag.Parse()
|
||||
|
||||
// Register the metrics we defined above with prometheus.
|
||||
prometheus.MustRegister(etcdVersion)
|
||||
prometheus.MustRegister(etcdGRPCRequestsTotal)
|
||||
prometheus.Unregister(prometheus.NewGoCollector()) // Unregister go metrics.
|
||||
|
||||
// Spawn threads for periodically scraping etcd version metrics.
|
||||
stopCh := make(chan struct{})
|
||||
defer close(stopCh)
|
||||
go getVersionPeriodically(stopCh)
|
||||
go getGRPCRequestCountPeriodically(stopCh)
|
||||
|
||||
// Serve our metrics on listenAddress/metricsPath.
|
||||
glog.Infof("Listening on: %v", listenAddress)
|
||||
http.Handle(metricsPath, prometheus.Handler())
|
||||
glog.Errorf("Stopped listening/serving metrics: %v", http.ListenAndServe(listenAddress, nil))
|
||||
}
|
|
@ -0,0 +1,13 @@
|
|||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
name: etcd-version-monitor
|
||||
namespace: kube-system
|
||||
spec:
|
||||
hostNetwork: true
|
||||
containers:
|
||||
- name: etcd-version-monitor
|
||||
image: gcr.io/google-containers/etcd-version-monitor:0.1.0
|
||||
command:
|
||||
- /etcd-version-monitor
|
||||
- --logtostderr
|
|
@ -88,7 +88,10 @@ cluster/vagrant/provision-utils.sh: api_servers: '$(echo "$MASTER_IP" | sed -e
|
|||
cluster/vagrant/provision-utils.sh: node_ip: '$(echo "$MASTER_IP" | sed -e "s/'/''/g")'
|
||||
cluster/vagrant/provision-utils.sh: runtime_config: '$(echo "$RUNTIME_CONFIG" | sed -e "s/'/''/g")'
|
||||
examples/cluster-dns/images/frontend/client.py: service_address = socket.gethostbyname(hostname)
|
||||
examples/storage/cassandra/image/files/jvm.options:# information in cassandra.yaml (such as listen_address).
|
||||
examples/storage/cassandra/image/files/jvm.options:#-Dcassandra.replace_address=listen_address or broadcast_address of dead node
|
||||
examples/storage/cassandra/image/files/run.sh: cluster_name \
|
||||
examples/storage/cassandra/image/files/run.sh: listen_address \
|
||||
examples/storage/vitess/env.sh: node_ip=$(get_node_ip)
|
||||
federation/cluster/common.sh: local cert_dir="${kube_temp}/easy-rsa-master/easyrsa3"
|
||||
federation/deploy/config.json.sample: "cloud_provider": "gce",
|
||||
|
@ -103,9 +106,6 @@ federation/deploy/config.json.sample: "cluster_name": "cluster3-kubernetes"
|
|||
federation/deploy/config.json.sample: "num_nodes": 3,
|
||||
federation/deploy/config.json.sample: "num_nodes": 3,
|
||||
federation/deploy/config.json.sample: "num_nodes": 3,
|
||||
hack/e2e.go:.phase1.cloud_provider="gce"
|
||||
hack/e2e.go:.phase1.cluster_name="{{.Cluster}}"
|
||||
hack/e2e.go:.phase1.num_nodes=4
|
||||
hack/lib/util.sh: local api_port=$5
|
||||
hack/local-up-cluster.sh: advertise_address="--advertise_address=${API_HOST_IP}"
|
||||
hack/local-up-cluster.sh: runtime_config="--runtime-config=${RUNTIME_CONFIG}"
|
||||
|
|
|
@ -161,6 +161,8 @@ dump-logs-on-failure
|
|||
duration-sec
|
||||
e2e-output-dir
|
||||
e2e-verify-service-account
|
||||
etcd-metrics-scrape-uri
|
||||
etcd-version-scrape-uri
|
||||
enable-controller-attach-detach
|
||||
enable-cri
|
||||
enable-custom-metrics
|
||||
|
@ -369,6 +371,7 @@ leader-elect-retry-period
|
|||
lease-duration
|
||||
leave-stdin-open
|
||||
limit-bytes
|
||||
listen-address
|
||||
listers-package
|
||||
load-balancer-ip
|
||||
lock-file
|
||||
|
@ -408,6 +411,7 @@ mesos-launch-grace-period
|
|||
mesos-master
|
||||
mesos-sandbox-overlay
|
||||
mesos-user
|
||||
metrics-path
|
||||
min-available
|
||||
min-pr-number
|
||||
min-request-timeout
|
||||
|
@ -546,6 +550,7 @@ scheduler-config
|
|||
scheduler-name
|
||||
schema-cache-dir
|
||||
scopes
|
||||
scrape-timeout
|
||||
seccomp-profile-root
|
||||
secondary-node-eviction-rate
|
||||
secret-name
|
||||
|
|
Loading…
Reference in New Issue