mirror of https://github.com/k3s-io/k3s
Merge pull request #4901 from vmarmol/mon-startup
Adding sync pod latency metric (again).
commit c6175facfb
@@ -61,7 +61,10 @@ const podOomScoreAdj = -100
 // SyncHandler is an interface implemented by Kubelet, for testability
 type SyncHandler interface {
-	SyncPods([]api.BoundPod) error
+	// Syncs current state to match the specified pods. SyncPodType specifies what
+	// type of sync is occurring per pod. StartTime specifies the time at which
+	// syncing began (for use in monitoring).
+	SyncPods(pods []api.BoundPod, podSyncTypes map[types.UID]metrics.SyncPodType, startTime time.Time) error
 }

 type SourceReadyFn func(source string) bool

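For reference, a minimal stand-alone sketch of what the new SyncHandler signature asks of an implementation. BoundPod, UID, and SyncPodType here are local stand-ins for the real api, types, and metrics packages, and fakeSyncHandler is hypothetical:

package main

import (
	"fmt"
	"time"
)

// Local stand-ins for api.BoundPod, types.UID, and metrics.SyncPodType.
type UID string

type BoundPod struct {
	Name string
	UID  UID
}

type SyncPodType int

// fakeSyncHandler shows the new SyncPods shape: the extra arguments carry the
// per-pod sync types and the time the sync pass started.
type fakeSyncHandler struct {
	synced int
}

func (h *fakeSyncHandler) SyncPods(pods []BoundPod, podSyncTypes map[UID]SyncPodType, startTime time.Time) error {
	h.synced += len(pods)
	fmt.Printf("synced %d pods, pass started %v ago\n", len(pods), time.Since(startTime))
	return nil
}

func main() {
	h := &fakeSyncHandler{}
	pods := []BoundPod{{Name: "foo", UID: "uid-1"}}
	_ = h.SyncPods(pods, map[UID]SyncPodType{"uid-1": 0}, time.Now())
}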
@@ -942,7 +945,7 @@ func (kl *Kubelet) createPodInfraContainer(pod *api.BoundPod) (dockertools.Docke
 func (kl *Kubelet) pullImage(img string, ref *api.ObjectReference) error {
 	start := time.Now()
 	defer func() {
-		metrics.ImagePullLatency.Observe(float64(time.Since(start).Nanoseconds() / time.Microsecond.Nanoseconds()))
+		metrics.ImagePullLatency.Observe(metrics.SinceInMicroseconds(start))
 	}()

 	if err := kl.dockerPuller.Pull(img); err != nil {
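The pullImage change above is a defer-and-observe timing pattern. A self-contained sketch of the same pattern, printing instead of recording to metrics.ImagePullLatency and with a Sleep standing in for the real docker pull:

package main

import (
	"fmt"
	"time"
)

// sinceInMicroseconds mirrors the metrics helper used above: elapsed nanoseconds
// divided by the number of nanoseconds in a microsecond.
func sinceInMicroseconds(start time.Time) float64 {
	return float64(time.Since(start).Nanoseconds() / time.Microsecond.Nanoseconds())
}

// pullImage shows the defer pattern: capture the start time once and observe the
// elapsed time on every return path.
func pullImage(img string) error {
	start := time.Now()
	defer func() {
		fmt.Printf("pulled %s in ~%.0f microseconds\n", img, sinceInMicroseconds(start))
	}()
	time.Sleep(5 * time.Millisecond) // stand-in for the real docker pull
	return nil
}

func main() {
	_ = pullImage("busybox")
}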
@@ -1270,7 +1273,7 @@ func (kl *Kubelet) cleanupOrphanedVolumes(pods []api.BoundPod, running []*docker
 }

 // SyncPods synchronizes the configured list of pods (desired state) with the host current state.
-func (kl *Kubelet) SyncPods(pods []api.BoundPod) error {
+func (kl *Kubelet) SyncPods(pods []api.BoundPod, podSyncTypes map[types.UID]metrics.SyncPodType, start time.Time) error {
 	glog.V(4).Infof("Desired: %#v", pods)
 	var err error
 	desiredContainers := make(map[podContainer]empty)
@@ -1296,7 +1299,9 @@ func (kl *Kubelet) SyncPods(pods []api.BoundPod) error {
 		}

 		// Run the sync in an async manifest worker.
-		kl.podWorkers.UpdatePod(*pod)
+		kl.podWorkers.UpdatePod(pod, func() {
+			metrics.SyncPodLatency.WithLabelValues(podSyncTypes[pod.UID].String()).Observe(metrics.SinceInMicroseconds(start))
+		})
 	}

 	// Stop the workers for no-longer existing pods.
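The closure passed to UpdatePod is what ties the latency observation to the sync type and the pass start time. A sketch of the caller side, where updatePod is a hypothetical stand-in for kl.podWorkers.UpdatePod and a Printf replaces the SummaryVec observation:

package main

import (
	"fmt"
	"time"
)

type UID string

type BoundPod struct {
	UID  UID
	Name string
}

// updatePod stands in for the real pod worker: it performs the sync and then
// invokes the completion callback.
func updatePod(pod *BoundPod, updateComplete func()) {
	time.Sleep(2 * time.Millisecond) // stand-in for the per-pod sync
	updateComplete()
}

func main() {
	start := time.Now()
	podSyncTypes := map[UID]string{"uid-1": "create"}
	pod := &BoundPod{UID: "uid-1", Name: "foo"}

	// The closure captures the pass start time and the pod's sync type, which is
	// how the diff labels SyncPodLatency via WithLabelValues.
	updatePod(pod, func() {
		fmt.Printf("sync_pod_latency_microseconds{operation_type=%q} %d\n",
			podSyncTypes[pod.UID], time.Since(start).Microseconds())
	})
}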
@@ -1416,19 +1421,21 @@ func (kl *Kubelet) handleUpdate(u PodUpdate) {
 func (kl *Kubelet) syncLoop(updates <-chan PodUpdate, handler SyncHandler) {
 	for {
 		unsyncedPod := false
+		podSyncTypes := make(map[types.UID]metrics.SyncPodType)
 		select {
 		case u := <-updates:
-			kl.updatePods(u)
+			kl.updatePods(u, podSyncTypes)
 			unsyncedPod = true
 		case <-time.After(kl.resyncInterval):
 			glog.V(4).Infof("Periodic sync")
 		}
+		start := time.Now()
 		// If we already caught some update, try to wait for some short time
 		// to possibly batch it with other incoming updates.
 		for unsyncedPod {
 			select {
 			case u := <-updates:
-				kl.updatePods(u)
+				kl.updatePods(u, podSyncTypes)
 			case <-time.After(5 * time.Millisecond):
 				// Break the for loop.
 				unsyncedPod = false
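The inner loop above batches updates that arrive in quick succession so they are handled in a single sync pass. A stand-alone sketch of the same pattern, assuming plain string updates in place of PodUpdate; the 5ms window and the break-the-loop flag follow the diff:

package main

import (
	"fmt"
	"time"
)

// drainBriefly blocks for the first update, then keeps pulling further updates
// for a short window so they can be handled in one sync pass.
func drainBriefly(updates <-chan string, window time.Duration) []string {
	batch := []string{<-updates} // block for the first update
	for unsynced := true; unsynced; {
		select {
		case u := <-updates:
			batch = append(batch, u)
		case <-time.After(window):
			unsynced = false // break the for loop
		}
	}
	return batch
}

func main() {
	updates := make(chan string, 3)
	updates <- "SET"
	updates <- "UPDATE"
	fmt.Println(drainBriefly(updates, 5*time.Millisecond)) // [SET UPDATE]
}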
@@ -1440,25 +1447,54 @@ func (kl *Kubelet) syncLoop(updates <-chan PodUpdate, handler SyncHandler) {
 			glog.Errorf("Failed to get bound pods.")
 			return
 		}
-		if err := handler.SyncPods(pods); err != nil {
+		if err := handler.SyncPods(pods, podSyncTypes, start); err != nil {
 			glog.Errorf("Couldn't sync containers: %v", err)
 		}
 	}
 }

-func (kl *Kubelet) updatePods(u PodUpdate) {
+// Update the Kubelet's internal pods with those provided by the update.
+// Records new and updated pods in podSyncTypes.
+func (kl *Kubelet) updatePods(u PodUpdate, podSyncTypes map[types.UID]metrics.SyncPodType) {
 	switch u.Op {
 	case SET:
 		glog.V(3).Infof("SET: Containers changed")
+
+		// Store the new pods. Don't worry about filtering host ports since those
+		// pods will never be looked up.
+		existingPods := make(map[types.UID]struct{})
+		for i := range kl.pods {
+			existingPods[kl.pods[i].UID] = struct{}{}
+		}
+		for i := range u.Pods {
+			if _, ok := existingPods[u.Pods[i].UID]; !ok {
+				podSyncTypes[u.Pods[i].UID] = metrics.SyncPodCreate
+			}
+		}
+
 		kl.pods = u.Pods
 		kl.pods = filterHostPortConflicts(kl.pods)
 	case UPDATE:
 		glog.V(3).Infof("Update: Containers changed")
+
+		// Store the updated pods. Don't worry about filtering host ports since those
+		// pods will never be looked up.
+		for i := range u.Pods {
+			podSyncTypes[u.Pods[i].UID] = metrics.SyncPodUpdate
+		}
+
 		kl.pods = updateBoundPods(u.Pods, kl.pods)
 		kl.pods = filterHostPortConflicts(kl.pods)
 	default:
 		panic("syncLoop does not support incremental changes")
 	}
+
+	// Mark all remaining pods as sync.
+	for i := range kl.pods {
+		if _, ok := podSyncTypes[kl.pods[i].UID]; !ok {
+			podSyncTypes[kl.pods[i].UID] = metrics.SyncPodSync
+		}
+	}
 }

 // Returns Docker version for this Kubelet.

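A compact sketch of the classification the SET branch plus the final pass performs; UIDs are plain strings here, where the real code keys by types.UID and stores metrics.SyncPodType values:

package main

import "fmt"

type UID string

// classifySet mirrors the SET branch plus the final pass of updatePods: UIDs not
// previously known become "create"; everything else is marked "sync".
func classifySet(existing, incoming []UID) map[UID]string {
	known := map[UID]bool{}
	for _, uid := range existing {
		known[uid] = true
	}
	types := map[UID]string{}
	for _, uid := range incoming {
		if _, ok := known[uid]; !ok {
			types[uid] = "create"
		}
	}
	// Mark all remaining pods as sync.
	for _, uid := range incoming {
		if _, ok := types[uid]; !ok {
			types[uid] = "sync"
		}
	}
	return types
}

func main() {
	fmt.Println(classifySet([]UID{"a", "b"}, []UID{"a", "b", "c"}))
	// map[a:sync b:sync c:create]
}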
@@ -34,6 +34,7 @@ import (
 	"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
 	"github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet/dockertools"
+	"github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet/metrics"
 	"github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet/volume"
 	_ "github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet/volume/host_path"
 	"github.com/GoogleCloudPlatform/kubernetes/pkg/types"
@@ -382,6 +383,8 @@ func (cr *channelReader) GetList() [][]api.BoundPod {
 	return cr.list
 }

+var emptyPodUIDs map[types.UID]metrics.SyncPodType
+
 func TestSyncPodsDoesNothing(t *testing.T) {
 	kubelet, fakeDocker, waitGroup := newTestKubelet(t)
 	container := api.Container{Name: "bar"}
@@ -413,7 +416,7 @@ func TestSyncPodsDoesNothing(t *testing.T) {
 		},
 	}
 	waitGroup.Add(1)
-	err := kubelet.SyncPods(kubelet.pods)
+	err := kubelet.SyncPods(kubelet.pods, emptyPodUIDs, time.Now())
 	if err != nil {
 		t.Errorf("unexpected error: %v", err)
 	}
@@ -444,7 +447,7 @@ func TestSyncPodsWithTerminationLog(t *testing.T) {
 		},
 	}
 	waitGroup.Add(1)
-	err := kubelet.SyncPods(kubelet.pods)
+	err := kubelet.SyncPods(kubelet.pods, emptyPodUIDs, time.Now())
 	if err != nil {
 		t.Errorf("unexpected error: %v", err)
 	}
@@ -491,7 +494,7 @@ func TestSyncPodsCreatesNetAndContainer(t *testing.T) {
 		},
 	}
 	waitGroup.Add(1)
-	err := kubelet.SyncPods(kubelet.pods)
+	err := kubelet.SyncPods(kubelet.pods, emptyPodUIDs, time.Now())
 	if err != nil {
 		t.Errorf("unexpected error: %v", err)
 	}
@@ -542,7 +545,7 @@ func TestSyncPodsCreatesNetAndContainerPullsImage(t *testing.T) {
 		},
 	}
 	waitGroup.Add(1)
-	err := kubelet.SyncPods(kubelet.pods)
+	err := kubelet.SyncPods(kubelet.pods, emptyPodUIDs, time.Now())
 	if err != nil {
 		t.Errorf("unexpected error: %v", err)
 	}
@@ -590,7 +593,7 @@ func TestSyncPodsWithPodInfraCreatesContainer(t *testing.T) {
 		},
 	}
 	waitGroup.Add(1)
-	err := kubelet.SyncPods(kubelet.pods)
+	err := kubelet.SyncPods(kubelet.pods, emptyPodUIDs, time.Now())
 	if err != nil {
 		t.Errorf("unexpected error: %v", err)
 	}
@@ -645,7 +648,7 @@ func TestSyncPodsWithPodInfraCreatesContainerCallsHandler(t *testing.T) {
 		},
 	}
 	waitGroup.Add(1)
-	err := kubelet.SyncPods(kubelet.pods)
+	err := kubelet.SyncPods(kubelet.pods, emptyPodUIDs, time.Now())
 	if err != nil {
 		t.Errorf("unexpected error: %v", err)
 	}
@@ -690,7 +693,7 @@ func TestSyncPodsDeletesWithNoPodInfraContainer(t *testing.T) {
 		},
 	}
 	waitGroup.Add(1)
-	err := kubelet.SyncPods(kubelet.pods)
+	err := kubelet.SyncPods(kubelet.pods, emptyPodUIDs, time.Now())
 	if err != nil {
 		t.Errorf("unexpected error: %v", err)
 	}
@@ -728,7 +731,7 @@ func TestSyncPodsDeletesWhenSourcesAreReady(t *testing.T) {
 			ID: "9876",
 		},
 	}
-	if err := kubelet.SyncPods([]api.BoundPod{}); err != nil {
+	if err := kubelet.SyncPods([]api.BoundPod{}, emptyPodUIDs, time.Now()); err != nil {
 		t.Errorf("unexpected error: %v", err)
 	}
 	// Validate nothing happened.
@@ -736,7 +739,7 @@ func TestSyncPodsDeletesWhenSourcesAreReady(t *testing.T) {
 	fakeDocker.ClearCalls()

 	ready = true
-	if err := kubelet.SyncPods([]api.BoundPod{}); err != nil {
+	if err := kubelet.SyncPods([]api.BoundPod{}, emptyPodUIDs, time.Now()); err != nil {
 		t.Errorf("unexpected error: %v", err)
 	}
 	verifyCalls(t, fakeDocker, []string{"list", "stop", "stop", "inspect_container", "inspect_container"})
@@ -787,7 +790,7 @@ func TestSyncPodsDeletesWhenContainerSourceReady(t *testing.T) {
 			ID: "9876",
 		},
 	}
-	if err := kubelet.SyncPods([]api.BoundPod{}); err != nil {
+	if err := kubelet.SyncPods([]api.BoundPod{}, emptyPodUIDs, time.Now()); err != nil {
 		t.Errorf("unexpected error: %v", err)
 	}
 	// Validate nothing happened.
@@ -795,7 +798,7 @@ func TestSyncPodsDeletesWhenContainerSourceReady(t *testing.T) {
 	fakeDocker.ClearCalls()

 	ready = true
-	if err := kubelet.SyncPods([]api.BoundPod{}); err != nil {
+	if err := kubelet.SyncPods([]api.BoundPod{}, emptyPodUIDs, time.Now()); err != nil {
 		t.Errorf("unexpected error: %v", err)
 	}
 	verifyCalls(t, fakeDocker, []string{"list", "stop", "stop", "inspect_container", "inspect_container"})
@@ -833,7 +836,7 @@ func TestSyncPodsDeletes(t *testing.T) {
 			ID: "4567",
 		},
 	}
-	err := kubelet.SyncPods([]api.BoundPod{})
+	err := kubelet.SyncPods([]api.BoundPod{}, emptyPodUIDs, time.Now())
 	if err != nil {
 		t.Errorf("unexpected error: %v", err)
 	}
@@ -2091,7 +2094,7 @@ func TestSyncPodsWithPullPolicy(t *testing.T) {
 			},
 		},
 	},
-	})
+	}, emptyPodUIDs, time.Now())
 	if err != nil {
 		t.Errorf("unexpected error: %v", err)
 	}

@@ -18,6 +18,7 @@ package metrics

 import (
 	"sync"
+	"time"

 	"github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet/dockertools"
 	"github.com/GoogleCloudPlatform/kubernetes/pkg/types"
@@ -35,8 +36,16 @@ var (
 			Help:      "Image pull latency in microseconds.",
 		},
 	)
+	// TODO(vmarmol): Break down by number of containers in pod?
+	SyncPodLatency = prometheus.NewSummaryVec(
+		prometheus.SummaryOpts{
+			Subsystem: kubeletSubsystem,
+			Name:      "sync_pod_latency_microseconds",
+			Help:      "Latency in microseconds to sync a single pod. Broken down by operation type: create, update, or sync",
+		},
+		[]string{"operation_type"},
+	)
 	// TODO(vmarmol): Containers per pod
 	// TODO(vmarmol): Latency of pod startup
 	// TODO(vmarmol): Latency of SyncPods
 )

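SyncPodLatency is a SummaryVec keyed by a single operation_type label, so each sync type keeps its own latency distribution. A minimal stand-alone sketch, assuming github.com/prometheus/client_golang/prometheus is available and using a hard-coded "kubelet" subsystem in place of kubeletSubsystem:

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

// A stand-alone version of the SyncPodLatency metric: the "operation_type" label
// gives one latency distribution per sync type (create, update, sync).
var syncPodLatency = prometheus.NewSummaryVec(
	prometheus.SummaryOpts{
		Subsystem: "kubelet",
		Name:      "sync_pod_latency_microseconds",
		Help:      "Latency in microseconds to sync a single pod. Broken down by operation type: create, update, or sync",
	},
	[]string{"operation_type"},
)

func main() {
	prometheus.MustRegister(syncPodLatency)

	// Each label value selects its own summary child.
	syncPodLatency.WithLabelValues("create").Observe(1500)
	syncPodLatency.WithLabelValues("sync").Observe(200)

	fmt.Println("observed latencies for operation types create and sync")
}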
@@ -47,10 +56,37 @@ func Register(containerCache dockertools.DockerCache) {
 	// Register the metrics.
 	registerMetrics.Do(func() {
 		prometheus.MustRegister(ImagePullLatency)
+		prometheus.MustRegister(SyncPodLatency)
 		prometheus.MustRegister(newPodAndContainerCollector(containerCache))
 	})
 }

+type SyncPodType int
+
+const (
+	SyncPodCreate SyncPodType = iota
+	SyncPodUpdate
+	SyncPodSync
+)
+
+func (self SyncPodType) String() string {
+	switch self {
+	case SyncPodCreate:
+		return "create"
+	case SyncPodUpdate:
+		return "update"
+	case SyncPodSync:
+		return "sync"
+	default:
+		return "unknown"
+	}
+}
+
+// Gets the time since the specified start in microseconds.
+func SinceInMicroseconds(start time.Time) float64 {
+	return float64(time.Since(start).Nanoseconds() / time.Microsecond.Nanoseconds())
+}
+
 func newPodAndContainerCollector(containerCache dockertools.DockerCache) *podAndContainerCollector {
 	return &podAndContainerCollector{
 		containerCache: containerCache,

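A small sketch exercising the two helpers added above, with lower-case local copies of SyncPodType and SinceInMicroseconds so it runs on its own:

package main

import (
	"fmt"
	"time"
)

type syncPodType int

const (
	syncPodCreate syncPodType = iota
	syncPodUpdate
	syncPodSync
)

// String mirrors the mapping added in the metrics package.
func (t syncPodType) String() string {
	switch t {
	case syncPodCreate:
		return "create"
	case syncPodUpdate:
		return "update"
	case syncPodSync:
		return "sync"
	default:
		return "unknown"
	}
}

// sinceInMicroseconds mirrors the helper: integer nanoseconds divided by the
// nanoseconds in a microsecond.
func sinceInMicroseconds(start time.Time) float64 {
	return float64(time.Since(start).Nanoseconds() / time.Microsecond.Nanoseconds())
}

func main() {
	start := time.Now().Add(-2 * time.Millisecond) // pretend the sync took ~2ms
	fmt.Printf("%s sync took ~%.0f microseconds\n", syncPodCreate, sinceInMicroseconds(start))
	fmt.Println(syncPodType(42)) // "unknown"
}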
@@ -26,7 +26,7 @@ import (
 	"github.com/golang/glog"
 )

-type syncPodFunType func(*api.BoundPod, dockertools.DockerContainers) error
+type syncPodFnType func(*api.BoundPod, dockertools.DockerContainers) error

 // TODO(wojtek-t) Add unit tests for this type.
 type podWorkers struct {
@@ -35,7 +35,7 @@ type podWorkers struct {
 	// Tracks all running per-pod goroutines - per-pod goroutine will be
 	// processing updates received through its corresponding channel.
-	podUpdates map[types.UID]chan api.BoundPod
+	podUpdates map[types.UID]chan workUpdate
 	// Track the current state of per-pod goroutines.
 	// Currently all update requests for a given pod coming when another
 	// update of this pod is being processed are ignored.
@@ -46,44 +46,55 @@ type podWorkers struct {
 	// This function is run to sync the desired state of the pod.
 	// NOTE: This function has to be thread-safe - it can be called for
 	// different pods at the same time.
-	syncPodFun syncPodFunType
+	syncPodFn syncPodFnType
 }

-func newPodWorkers(dockerCache dockertools.DockerCache, syncPodFun syncPodFunType) *podWorkers {
+type workUpdate struct {
+	// The pod state to reflect.
+	pod *api.BoundPod
+
+	// Function to call when the update is complete.
+	updateCompleteFn func()
+}
+
+func newPodWorkers(dockerCache dockertools.DockerCache, syncPodFn syncPodFnType) *podWorkers {
 	return &podWorkers{
-		podUpdates: map[types.UID]chan api.BoundPod{},
+		podUpdates:  map[types.UID]chan workUpdate{},
 		isWorking:   map[types.UID]bool{},
 		dockerCache: dockerCache,
-		syncPodFun: syncPodFun,
+		syncPodFn:   syncPodFn,
 	}
 }

-func (p *podWorkers) managePodLoop(podUpdates <-chan api.BoundPod) {
-	for newPod := range podUpdates {
+func (p *podWorkers) managePodLoop(podUpdates <-chan workUpdate) {
+	for newWork := range podUpdates {
 		// Since we use docker cache, getting current state shouldn't cause
 		// performance overhead on Docker. Moreover, as long as we run syncPod
 		// no matter if it changes anything, having an old version of "containers"
 		// can cause starting unneeded containers.
 		func() {
-			defer p.setIsWorking(newPod.UID, false)
+			defer p.setIsWorking(newWork.pod.UID, false)
 			containers, err := p.dockerCache.RunningContainers()
 			if err != nil {
 				glog.Errorf("Error listing containers while syncing pod: %v", err)
 				return
 			}
-			err = p.syncPodFun(&newPod, containers)
+			err = p.syncPodFn(newWork.pod, containers)
 			if err != nil {
-				glog.Errorf("Error syncing pod %s, skipping: %v", newPod.UID, err)
-				record.Eventf(&newPod, "failedSync", "Error syncing pod, skipping: %v", err)
+				glog.Errorf("Error syncing pod %s, skipping: %v", newWork.pod.UID, err)
+				record.Eventf(newWork.pod, "failedSync", "Error syncing pod, skipping: %v", err)
 				return
 			}
+
+			newWork.updateCompleteFn()
 		}()
 	}
 }

-func (p *podWorkers) UpdatePod(pod api.BoundPod) {
+// Apply the new setting to the specified pod. updateComplete is called when the update is completed.
+func (p *podWorkers) UpdatePod(pod *api.BoundPod, updateComplete func()) {
 	uid := pod.UID
-	var podUpdates chan api.BoundPod
+	var podUpdates chan workUpdate
 	var exists bool

 	p.podLock.Lock()
@@ -91,7 +102,7 @@ func (p *podWorkers) UpdatePod(pod api.BoundPod) {
 	if podUpdates, exists = p.podUpdates[uid]; !exists {
 		// Currently all update requests for a given pod coming when another
 		// update of this pod is being processed are ignored.
-		podUpdates = make(chan api.BoundPod, 1)
+		podUpdates = make(chan workUpdate, 1)
 		p.podUpdates[uid] = podUpdates
 		go p.managePodLoop(podUpdates)
 	}
@@ -109,7 +120,10 @@ func (p *podWorkers) UpdatePod(pod api.BoundPod) {
 	// be resent (because we don't drop it)
 	if !p.isWorking[pod.UID] {
 		p.isWorking[pod.UID] = true
-		podUpdates <- pod
+		podUpdates <- workUpdate{
+			pod:              pod,
+			updateCompleteFn: updateComplete,
+		}
 	}
 }
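The workUpdate struct carries both the pod and the completion callback through the per-pod channel. A stand-alone sketch of the worker side, with a string pod and a Sleep standing in for syncPodFn:

package main

import (
	"fmt"
	"sync"
	"time"
)

type workUpdate struct {
	pod              string // stand-in for *api.BoundPod
	updateCompleteFn func()
}

// managePodLoop mirrors the per-pod worker: it drains workUpdate values from its
// channel, runs the sync, and then calls the completion function.
func managePodLoop(podUpdates <-chan workUpdate, wg *sync.WaitGroup) {
	defer wg.Done()
	for newWork := range podUpdates {
		time.Sleep(time.Millisecond) // stand-in for syncPodFn
		newWork.updateCompleteFn()
	}
}

func main() {
	var wg sync.WaitGroup
	// Buffered with capacity 1, as in podWorkers: at most one pending update per pod.
	podUpdates := make(chan workUpdate, 1)
	wg.Add(1)
	go managePodLoop(podUpdates, &wg)

	start := time.Now()
	podUpdates <- workUpdate{
		pod: "foo",
		updateCompleteFn: func() {
			fmt.Printf("pod foo synced in %v\n", time.Since(start))
		},
	}
	close(podUpdates)
	wg.Wait()
}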