mirror of https://github.com/k3s-io/k3s
add a metric that can be used to notice stuck worker threads
parent
06e737367d
commit
6195d1005d
|
@ -71,6 +71,16 @@ func (prometheusMetricsProvider) NewWorkDurationMetric(name string) workqueue.Su
|
||||||
return workDuration
|
return workDuration
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (prometheusMetricsProvider) NewUnfinishedWorkMicrosecondsMetric(name string) workqueue.SettableGaugeMetric {
|
||||||
|
unfinished := prometheus.NewGauge(prometheus.GaugeOpts{
|
||||||
|
Subsystem: name,
|
||||||
|
Name: "unfinished_work_microseconds",
|
||||||
|
Help: "How many microseconds of work has " + name + " done that is still in progress and hasn't yet been observed by work_duration.",
|
||||||
|
})
|
||||||
|
prometheus.Register(unfinished)
|
||||||
|
return unfinished
|
||||||
|
}
|
||||||
|
|
||||||
func (prometheusMetricsProvider) NewRetriesMetric(name string) workqueue.CounterMetric {
|
func (prometheusMetricsProvider) NewRetriesMetric(name string) workqueue.CounterMetric {
|
||||||
retries := prometheus.NewCounter(prometheus.CounterOpts{
|
retries := prometheus.NewCounter(prometheus.CounterOpts{
|
||||||
Subsystem: name,
|
Subsystem: name,
|
||||||
|
|
|
@ -28,6 +28,7 @@ type queueMetrics interface {
|
||||||
add(item t)
|
add(item t)
|
||||||
get(item t)
|
get(item t)
|
||||||
done(item t)
|
done(item t)
|
||||||
|
updateUnfinishedWork()
|
||||||
}
|
}
|
||||||
|
|
||||||
// GaugeMetric represents a single numerical value that can arbitrarily go up
|
// GaugeMetric represents a single numerical value that can arbitrarily go up
|
||||||
|
@ -37,6 +38,12 @@ type GaugeMetric interface {
|
||||||
Dec()
|
Dec()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// SettableGaugeMetric is a gauge whose value is written directly through Set,
// so it can move up or down by any amount. It is declared separately from
// GaugeMetric to preserve backwards compatibility.
type SettableGaugeMetric interface {
	// Set stores value as the gauge's current reading.
	Set(value float64)
}
|
||||||
|
|
||||||
// CounterMetric represents a single numerical value that only ever
|
// CounterMetric represents a single numerical value that only ever
|
||||||
// goes up.
|
// goes up.
|
||||||
type CounterMetric interface {
|
type CounterMetric interface {
|
||||||
|
@ -52,6 +59,7 @@ type noopMetric struct{}
|
||||||
|
|
||||||
func (noopMetric) Inc() {}
|
func (noopMetric) Inc() {}
|
||||||
func (noopMetric) Dec() {}
|
func (noopMetric) Dec() {}
|
||||||
|
// Set is a no-op; it lets noopMetric satisfy SettableGaugeMetric.
func (noopMetric) Set(float64) {}
|
||||||
func (noopMetric) Observe(float64) {}
|
func (noopMetric) Observe(float64) {}
|
||||||
|
|
||||||
type defaultQueueMetrics struct {
|
type defaultQueueMetrics struct {
|
||||||
|
@ -65,6 +73,9 @@ type defaultQueueMetrics struct {
|
||||||
workDuration SummaryMetric
|
workDuration SummaryMetric
|
||||||
addTimes map[t]time.Time
|
addTimes map[t]time.Time
|
||||||
processingStartTimes map[t]time.Time
|
processingStartTimes map[t]time.Time
|
||||||
|
|
||||||
|
// how long have current threads been working?
|
||||||
|
unfinishedWorkMicroseconds SettableGaugeMetric
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *defaultQueueMetrics) add(item t) {
|
func (m *defaultQueueMetrics) add(item t) {
|
||||||
|
@ -103,6 +114,23 @@ func (m *defaultQueueMetrics) done(item t) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (m *defaultQueueMetrics) updateUnfinishedWork() {
|
||||||
|
var total float64
|
||||||
|
if m.processingStartTimes != nil {
|
||||||
|
for _, t := range m.processingStartTimes {
|
||||||
|
total += sinceInMicroseconds(t)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
m.unfinishedWorkMicroseconds.Set(total)
|
||||||
|
}
|
||||||
|
|
||||||
|
type noMetrics struct{}
|
||||||
|
|
||||||
|
func (noMetrics) add(item t) {}
|
||||||
|
func (noMetrics) get(item t) {}
|
||||||
|
func (noMetrics) done(item t) {}
|
||||||
|
func (noMetrics) updateUnfinishedWork() {}
|
||||||
|
|
||||||
// Gets the time since the specified start in microseconds.
|
// Gets the time since the specified start in microseconds.
|
||||||
func sinceInMicroseconds(start time.Time) float64 {
|
func sinceInMicroseconds(start time.Time) float64 {
|
||||||
return float64(time.Since(start).Nanoseconds() / time.Microsecond.Nanoseconds())
|
return float64(time.Since(start).Nanoseconds() / time.Microsecond.Nanoseconds())
|
||||||
|
@ -130,6 +158,7 @@ type MetricsProvider interface {
|
||||||
NewAddsMetric(name string) CounterMetric
|
NewAddsMetric(name string) CounterMetric
|
||||||
NewLatencyMetric(name string) SummaryMetric
|
NewLatencyMetric(name string) SummaryMetric
|
||||||
NewWorkDurationMetric(name string) SummaryMetric
|
NewWorkDurationMetric(name string) SummaryMetric
|
||||||
|
NewUnfinishedWorkMicrosecondsMetric(name string) SettableGaugeMetric
|
||||||
NewRetriesMetric(name string) CounterMetric
|
NewRetriesMetric(name string) CounterMetric
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -151,6 +180,10 @@ func (_ noopMetricsProvider) NewWorkDurationMetric(name string) SummaryMetric {
|
||||||
return noopMetric{}
|
return noopMetric{}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (_ noopMetricsProvider) NewUnfinishedWorkMicrosecondsMetric(name string) SettableGaugeMetric {
|
||||||
|
return noopMetric{}
|
||||||
|
}
|
||||||
|
|
||||||
func (_ noopMetricsProvider) NewRetriesMetric(name string) CounterMetric {
|
func (_ noopMetricsProvider) NewRetriesMetric(name string) CounterMetric {
|
||||||
return noopMetric{}
|
return noopMetric{}
|
||||||
}
|
}
|
||||||
|
@ -163,15 +196,16 @@ var metricsFactory = struct {
|
||||||
}
|
}
|
||||||
|
|
||||||
func newQueueMetrics(name string) queueMetrics {
|
func newQueueMetrics(name string) queueMetrics {
|
||||||
var ret *defaultQueueMetrics
|
mp := metricsFactory.metricsProvider
|
||||||
if len(name) == 0 {
|
if len(name) == 0 || mp == (noopMetricsProvider{}) {
|
||||||
return ret
|
return noMetrics{}
|
||||||
}
|
}
|
||||||
return &defaultQueueMetrics{
|
return &defaultQueueMetrics{
|
||||||
depth: metricsFactory.metricsProvider.NewDepthMetric(name),
|
depth: mp.NewDepthMetric(name),
|
||||||
adds: metricsFactory.metricsProvider.NewAddsMetric(name),
|
adds: mp.NewAddsMetric(name),
|
||||||
latency: metricsFactory.metricsProvider.NewLatencyMetric(name),
|
latency: mp.NewLatencyMetric(name),
|
||||||
workDuration: metricsFactory.metricsProvider.NewWorkDurationMetric(name),
|
workDuration: mp.NewWorkDurationMetric(name),
|
||||||
|
unfinishedWorkMicroseconds: mp.NewUnfinishedWorkMicrosecondsMetric(name),
|
||||||
addTimes: map[t]time.Time{},
|
addTimes: map[t]time.Time{},
|
||||||
processingStartTimes: map[t]time.Time{},
|
processingStartTimes: map[t]time.Time{},
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,49 @@
|
||||||
|
/*
|
||||||
|
Copyright 2018 The Kubernetes Authors.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package workqueue
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
type testMetrics struct {
|
||||||
|
added, gotten, finished int64
|
||||||
|
|
||||||
|
updateCalled chan<- struct{}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *testMetrics) add(item t) { m.added++ }
|
||||||
|
func (m *testMetrics) get(item t) { m.gotten++ }
|
||||||
|
func (m *testMetrics) done(item t) { m.finished++ }
|
||||||
|
func (m *testMetrics) updateUnfinishedWork() { m.updateCalled <- struct{}{} }
|
||||||
|
|
||||||
|
func TestMetrics(t *testing.T) {
|
||||||
|
ch := make(chan struct{})
|
||||||
|
m := &testMetrics{
|
||||||
|
updateCalled: ch,
|
||||||
|
}
|
||||||
|
q := newQueue("test", m, time.Millisecond)
|
||||||
|
<-ch
|
||||||
|
q.ShutDown()
|
||||||
|
select {
|
||||||
|
case <-time.After(time.Second):
|
||||||
|
return
|
||||||
|
case <-ch:
|
||||||
|
t.Errorf("Unexpected update after shutdown was called.")
|
||||||
|
}
|
||||||
|
}
|
|
@ -18,6 +18,7 @@ package workqueue
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"sync"
|
"sync"
|
||||||
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
type Interface interface {
|
type Interface interface {
|
||||||
|
@ -35,14 +36,27 @@ func New() *Type {
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewNamed(name string) *Type {
|
func NewNamed(name string) *Type {
|
||||||
return &Type{
|
return newQueue(
|
||||||
|
name,
|
||||||
|
newQueueMetrics(name),
|
||||||
|
defaultUnfinishedWorkUpdatePeriod,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
func newQueue(name string, metrics queueMetrics, updatePeriod time.Duration) *Type {
|
||||||
|
t := &Type{
|
||||||
dirty: set{},
|
dirty: set{},
|
||||||
processing: set{},
|
processing: set{},
|
||||||
cond: sync.NewCond(&sync.Mutex{}),
|
cond: sync.NewCond(&sync.Mutex{}),
|
||||||
metrics: newQueueMetrics(name),
|
metrics: metrics,
|
||||||
|
unfinishedWorkUpdatePeriod: updatePeriod,
|
||||||
}
|
}
|
||||||
|
go t.updateUnfinishedWorkLook()
|
||||||
|
return t
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// defaultUnfinishedWorkUpdatePeriod is the update period NewNamed passes to
// newQueue, i.e. how often a named queue refreshes its unfinished-work metric.
const defaultUnfinishedWorkUpdatePeriod = 500 * time.Millisecond
|
||||||
|
|
||||||
// Type is a work queue (see the package comment).
|
// Type is a work queue (see the package comment).
|
||||||
type Type struct {
|
type Type struct {
|
||||||
// queue defines the order in which we will work on items. Every
|
// queue defines the order in which we will work on items. Every
|
||||||
|
@ -64,6 +78,8 @@ type Type struct {
|
||||||
shuttingDown bool
|
shuttingDown bool
|
||||||
|
|
||||||
metrics queueMetrics
|
metrics queueMetrics
|
||||||
|
|
||||||
|
unfinishedWorkUpdatePeriod time.Duration
|
||||||
}
|
}
|
||||||
|
|
||||||
type empty struct{}
|
type empty struct{}
|
||||||
|
@ -170,3 +186,22 @@ func (q *Type) ShuttingDown() bool {
|
||||||
|
|
||||||
return q.shuttingDown
|
return q.shuttingDown
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (q *Type) updateUnfinishedWorkLook() {
|
||||||
|
t := time.NewTicker(q.unfinishedWorkUpdatePeriod)
|
||||||
|
defer t.Stop()
|
||||||
|
for range t.C {
|
||||||
|
if !func() bool {
|
||||||
|
q.cond.L.Lock()
|
||||||
|
defer q.cond.L.Unlock()
|
||||||
|
if !q.shuttingDown {
|
||||||
|
q.metrics.updateUnfinishedWork()
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
|
||||||
|
}() {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in New Issue