Reduce logspam for crash looping containers

pull/564/head
Robert Krawitz 2018-07-17 16:20:36 -04:00
parent 0ca67bde54
commit 3373fcf0fc
4 changed files with 77 additions and 6 deletions


@@ -394,7 +394,8 @@ func (m *kubeGenericRuntimeManager) getPodContainerStatuses(uid kubetypes.UID, n
     for i, c := range containers {
         status, err := m.runtimeService.ContainerStatus(c.Id)
         if err != nil {
-            klog.Errorf("ContainerStatus for %s error: %v", c.Id, err)
+            // Merely log this here; GetPodStatus will actually report the error out.
+            klog.V(4).Infof("ContainerStatus for %s error: %v", c.Id, err)
             return nil, err
         }
         cStatus := toKubeContainerStatus(status, m.runtimeName)

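The hunk above demotes the per-container status error from klog.Errorf to klog.V(4).Infof, so it only appears when the kubelet runs at verbosity 4 or higher (for example, kubelet --v=4); GetPodStatus remains responsible for surfacing the failure. A minimal sketch of how klog verbosity gating behaves outside the kubelet, assuming the k8s.io/klog package used elsewhere in this commit (the flag wiring here is illustrative, not taken from the diff):

package main

import (
    "flag"

    "k8s.io/klog"
)

func main() {
    klog.InitFlags(nil) // register -v and the other klog flags
    flag.Set("v", "4")  // pretend the process was started with -v=4
    flag.Parse()

    // Always printed, regardless of verbosity.
    klog.Errorf("ContainerStatus for %s error: %v", "abc123", "simulated failure")

    // Printed only because verbosity is >= 4; at the default verbosity this
    // line is suppressed, which is what cuts the spam for crash loops.
    klog.V(4).Infof("ContainerStatus for %s error: %v", "abc123", "simulated failure")

    klog.Flush()
}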

@@ -59,6 +59,8 @@ const (
     // The expiration time of version cache.
     versionCacheTTL = 60 * time.Second
+    // How frequently to report identical errors
+    identicalErrorDelay = 1 * time.Minute
 )

 var (
@@ -123,6 +125,12 @@ type kubeGenericRuntimeManager struct {
     // Manage RuntimeClass resources.
     runtimeClassManager *runtimeclass.Manager
+    // Cache last per-container error message to reduce log spam
+    lastError map[string]string
+    // Time last per-container error message was printed
+    errorPrinted map[string]time.Time
 }

 // KubeGenericRuntime is a interface contains interfaces for container runtime and command.
@@ -177,6 +185,8 @@ func NewKubeGenericRuntimeManager(
         internalLifecycle: internalLifecycle,
         legacyLogProvider: legacyLogProvider,
         runtimeClassManager: runtimeClassManager,
+        lastError: make(map[string]string),
+        errorPrinted: make(map[string]time.Time),
     }

     typedVersion, err := kubeRuntimeManager.runtimeService.Version(kubeRuntimeAPIVersion)
@@ -819,6 +829,15 @@ func (m *kubeGenericRuntimeManager) killPodWithSyncResult(pod *v1.Pod, runningPo
     return
 }

+func (m *kubeGenericRuntimeManager) cleanupErrorTimeouts() {
+    for name, timeout := range m.errorPrinted {
+        if time.Now().Sub(timeout) >= identicalErrorDelay {
+            delete(m.errorPrinted, name)
+            delete(m.lastError, name)
+        }
+    }
+}
+
 // GetPodStatus retrieves the status of the pod, including the
 // information of all containers in the pod that are visible in Runtime.
 func (m *kubeGenericRuntimeManager) GetPodStatus(uid kubetypes.UID, name, namespace string) (*kubecontainer.PodStatus, error) {
@@ -868,9 +887,16 @@ func (m *kubeGenericRuntimeManager) GetPodStatus(uid kubetypes.UID, name, namesp
     // Get statuses of all containers visible in the pod.
     containerStatuses, err := m.getPodContainerStatuses(uid, name, namespace)
     if err != nil {
-        klog.Errorf("getPodContainerStatuses for pod %q failed: %v", podFullName, err)
+        lastMsg, ok := m.lastError[podFullName]
+        if !ok || err.Error() != lastMsg || time.Now().Sub(m.errorPrinted[podFullName]) >= identicalErrorDelay {
+            klog.Errorf("getPodContainerStatuses for pod %q failed: %v", podFullName, err)
+            m.errorPrinted[podFullName] = time.Now()
+            m.lastError[podFullName] = err.Error()
+        }
         return nil, err
     }
+    delete(m.errorPrinted, podFullName)
+    delete(m.lastError, podFullName)
     return &kubecontainer.PodStatus{
         ID: uid,

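The GetPodStatus hunk above gates the Errorf behind two maps keyed by the full pod name: lastError remembers the last message that was logged and errorPrinted remembers when it was logged, so an identical failure is re-logged at most once per identicalErrorDelay, and both entries are cleared as soon as a call succeeds. The cleanupErrorTimeouts helper added earlier in this file drops entries older than identicalErrorDelay so the maps do not grow without bound. A self-contained sketch of that gate follows; the errorLimiter type and the shouldLog/clear names are hypothetical, introduced only for illustration and not part of the commit:

package main

import (
    "errors"
    "fmt"
    "time"
)

// identicalErrorDelay mirrors the constant added in the commit: identical
// errors for the same key are reported at most this often.
const identicalErrorDelay = 1 * time.Minute

// errorLimiter is a hypothetical stand-in for the lastError/errorPrinted
// pair of maps the commit adds to kubeGenericRuntimeManager.
type errorLimiter struct {
    lastError    map[string]string    // last message logged per key
    errorPrinted map[string]time.Time // when that message was logged
}

func newErrorLimiter() *errorLimiter {
    return &errorLimiter{
        lastError:    make(map[string]string),
        errorPrinted: make(map[string]time.Time),
    }
}

// shouldLog reports whether err is new for key, or old enough to repeat,
// and records it if so. This is the same condition the commit uses inline.
func (l *errorLimiter) shouldLog(key string, err error) bool {
    lastMsg, ok := l.lastError[key]
    if !ok || err.Error() != lastMsg || time.Since(l.errorPrinted[key]) >= identicalErrorDelay {
        l.errorPrinted[key] = time.Now()
        l.lastError[key] = err.Error()
        return true
    }
    return false
}

// clear drops the cached state once a call for key succeeds.
func (l *errorLimiter) clear(key string) {
    delete(l.lastError, key)
    delete(l.errorPrinted, key)
}

func main() {
    limiter := newErrorLimiter()
    failure := errors.New("rpc error: container not found")

    // A crash-looping container produces the same error over and over;
    // only the first occurrence within identicalErrorDelay is logged.
    for i := 0; i < 3; i++ {
        if limiter.shouldLog("default/crashloop-pod", failure) {
            fmt.Println("logged:", failure)
        } else {
            fmt.Println("suppressed duplicate")
        }
    }
}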

@@ -244,7 +244,8 @@ func (g *GenericPLEG) relist() {
             // serially may take a while. We should be aware of this and
             // parallelize if needed.
             if err := g.updateCache(pod, pid); err != nil {
-                klog.Errorf("PLEG: Ignoring events for pod %s/%s: %v", pod.Name, pod.Namespace, err)
+                // Rely on updateCache calling GetPodStatus to log the actual error.
+                klog.V(4).Infof("PLEG: Ignoring events for pod %s/%s: %v", pod.Name, pod.Namespace, err)

                 // make sure we try to reinspect the pod during the next relisting
                 needsReinspection[pid] = pod
@@ -274,7 +275,8 @@ func (g *GenericPLEG) relist() {
         klog.V(5).Infof("GenericPLEG: Reinspecting pods that previously failed inspection")
         for pid, pod := range g.podsToReinspect {
             if err := g.updateCache(pod, pid); err != nil {
-                klog.Errorf("PLEG: pod %s/%s failed reinspection: %v", pod.Name, pod.Namespace, err)
+                // Rely on updateCache calling GetPodStatus to log the actual error.
+                klog.V(5).Infof("PLEG: pod %s/%s failed reinspection: %v", pod.Name, pod.Namespace, err)
                 needsReinspection[pid] = pod
             }
         }

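Both PLEG hunks lower the log level rather than dropping the error, because updateCache ultimately calls GetPodStatus, which now logs the underlying failure itself (rate-limited as above). The surrounding retry machinery is unchanged: a pod whose cache update fails is recorded in needsReinspection and retried on the next relist. A rough sketch of that retry pattern, with simplified stand-ins for GenericPLEG (podID, relistOnce, and updateCache here are illustrative names, not the kubelet's actual signatures):

package main

import (
    "errors"
    "fmt"
)

type podID string

// updateCache stands in for GenericPLEG.updateCache: it inspects a pod and
// refreshes the status cache, returning an error if inspection fails.
func updateCache(id podID, healthy map[podID]bool) error {
    if !healthy[id] {
        return errors.New("container runtime is down")
    }
    return nil
}

// relistOnce mimics one relist pass: every changed pod is inspected, and the
// ones that fail are queued for reinspection on the next pass.
func relistOnce(changed []podID, toReinspect map[podID]bool, healthy map[podID]bool) map[podID]bool {
    needsReinspection := make(map[podID]bool)

    for _, id := range changed {
        if err := updateCache(id, healthy); err != nil {
            // Logged at V(4) in the commit; the real error surfaces from
            // GetPodStatus inside updateCache.
            fmt.Printf("ignoring events for %s: %v\n", id, err)
            needsReinspection[id] = true
        }
    }

    // Pods that failed on a previous pass are retried here.
    for id := range toReinspect {
        if err := updateCache(id, healthy); err != nil {
            needsReinspection[id] = true
        }
    }
    return needsReinspection
}

func main() {
    healthy := map[podID]bool{"default/web": true}
    pending := relistOnce([]podID{"default/web", "default/crashloop"}, nil, healthy)

    // "default/crashloop" failed inspection, so it is retried next relist.
    healthy["default/crashloop"] = true
    pending = relistOnce(nil, pending, healthy)
    fmt.Println("still pending after retry:", len(pending))
}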

@@ -36,8 +36,17 @@ import (
 type RemoteRuntimeService struct {
     timeout time.Duration
     runtimeClient runtimeapi.RuntimeServiceClient
+    // Cache last per-container error message to reduce log spam
+    lastError map[string]string
+    // Time last per-container error message was printed
+    errorPrinted map[string]time.Time
 }

+const (
+    // How frequently to report identical errors
+    identicalErrorDelay = 1 * time.Minute
+)

 // NewRemoteRuntimeService creates a new internalapi.RuntimeService.
 func NewRemoteRuntimeService(endpoint string, connectionTimeout time.Duration) (internalapi.RuntimeService, error) {
     klog.V(3).Infof("Connecting to runtime service %s", endpoint)
@@ -57,6 +66,8 @@ func NewRemoteRuntimeService(endpoint string, connectionTimeout time.Duration) (
     return &RemoteRuntimeService{
         timeout: connectionTimeout,
         runtimeClient: runtimeapi.NewRuntimeServiceClient(conn),
+        lastError: make(map[string]string),
+        errorPrinted: make(map[string]time.Time),
     }, nil
 }
@@ -225,6 +236,8 @@ func (r *RemoteRuntimeService) StopContainer(containerID string, timeout int64)
     ctx, cancel := getContextWithTimeout(t)
     defer cancel()
+    delete(r.lastError, containerID)
+    delete(r.errorPrinted, containerID)
     _, err := r.runtimeClient.StopContainer(ctx, &runtimeapi.StopContainerRequest{
         ContainerId: containerID,
         Timeout: timeout,
@@ -243,6 +256,8 @@ func (r *RemoteRuntimeService) RemoveContainer(containerID string) error {
     ctx, cancel := getContextWithTimeout(r.timeout)
     defer cancel()
+    delete(r.lastError, containerID)
+    delete(r.errorPrinted, containerID)
     _, err := r.runtimeClient.RemoveContainer(ctx, &runtimeapi.RemoveContainerRequest{
         ContainerId: containerID,
     })
@@ -270,6 +285,16 @@ func (r *RemoteRuntimeService) ListContainers(filter *runtimeapi.ContainerFilter
     return resp.Containers, nil
 }

+// Clean up any expired last-error timers
+func (r *RemoteRuntimeService) cleanupErrorTimeouts() {
+    for ID, timeout := range r.errorPrinted {
+        if time.Now().Sub(timeout) >= identicalErrorDelay {
+            delete(r.lastError, ID)
+            delete(r.errorPrinted, ID)
+        }
+    }
+}
+
 // ContainerStatus returns the container status.
 func (r *RemoteRuntimeService) ContainerStatus(containerID string) (*runtimeapi.ContainerStatus, error) {
     ctx, cancel := getContextWithTimeout(r.timeout)
@@ -278,10 +303,19 @@ func (r *RemoteRuntimeService) ContainerStatus(containerID string) (*runtimeapi.
     resp, err := r.runtimeClient.ContainerStatus(ctx, &runtimeapi.ContainerStatusRequest{
         ContainerId: containerID,
     })
+    r.cleanupErrorTimeouts()
     if err != nil {
-        klog.Errorf("ContainerStatus %q from runtime service failed: %v", containerID, err)
+        // Don't spam the log with endless messages about the same failure.
+        lastMsg, ok := r.lastError[containerID]
+        if !ok || err.Error() != lastMsg || time.Now().Sub(r.errorPrinted[containerID]) >= identicalErrorDelay {
+            klog.Errorf("ContainerStatus %q from runtime service failed: %v", containerID, err)
+            r.errorPrinted[containerID] = time.Now()
+            r.lastError[containerID] = err.Error()
+        }
         return nil, err
     }
+    delete(r.lastError, containerID)
+    delete(r.errorPrinted, containerID)

     if resp.Status != nil {
         if err := verifyContainerStatus(resp.Status); err != nil {
@@ -456,10 +490,18 @@ func (r *RemoteRuntimeService) ContainerStats(containerID string) (*runtimeapi.C
     resp, err := r.runtimeClient.ContainerStats(ctx, &runtimeapi.ContainerStatsRequest{
         ContainerId: containerID,
     })
+    r.cleanupErrorTimeouts()
     if err != nil {
-        klog.Errorf("ContainerStatus %q from runtime service failed: %v", containerID, err)
+        lastMsg, ok := r.lastError[containerID]
+        if !ok || err.Error() != lastMsg || time.Now().Sub(r.errorPrinted[containerID]) >= identicalErrorDelay {
+            klog.Errorf("ContainerStatus %q from runtime service failed: %v", containerID, err)
+            r.errorPrinted[containerID] = time.Now()
+            r.lastError[containerID] = err.Error()
+        }
        return nil, err
     }
+    delete(r.lastError, containerID)
+    delete(r.errorPrinted, containerID)

     return resp.GetStats(), nil
 }