Reduce logspam for crash looping containers

pull/564/head
Robert Krawitz 2018-07-17 16:20:36 -04:00
parent 0ca67bde54
commit 3373fcf0fc
4 changed files with 77 additions and 6 deletions


@@ -394,7 +394,8 @@ func (m *kubeGenericRuntimeManager) getPodContainerStatuses(uid kubetypes.UID, n
     for i, c := range containers {
         status, err := m.runtimeService.ContainerStatus(c.Id)
         if err != nil {
-            klog.Errorf("ContainerStatus for %s error: %v", c.Id, err)
+            // Merely log this here; GetPodStatus will actually report the error out.
+            klog.V(4).Infof("ContainerStatus for %s error: %v", c.Id, err)
             return nil, err
         }
         cStatus := toKubeContainerStatus(status, m.runtimeName)

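The hunk above demotes the per-container status error from klog.Errorf to klog.V(4).Infof, so it only appears when the kubelet runs at verbosity 4 or higher (for example, kubelet --v=4); GetPodStatus remains responsible for surfacing the failure. A minimal sketch of how klog verbosity gating behaves outside the kubelet, assuming the k8s.io/klog package used elsewhere in this commit (the flag wiring here is illustrative, not taken from the diff):

package main

import (
    "flag"

    "k8s.io/klog"
)

func main() {
    klog.InitFlags(nil) // register -v and the other klog flags
    flag.Set("v", "4")  // pretend the process was started with -v=4
    flag.Parse()

    // Always printed, regardless of verbosity.
    klog.Errorf("ContainerStatus for %s error: %v", "abc123", "simulated failure")

    // Printed only because verbosity is >= 4; at the default verbosity this
    // line is suppressed, which is what cuts the spam for crash loops.
    klog.V(4).Infof("ContainerStatus for %s error: %v", "abc123", "simulated failure")

    klog.Flush()
}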

@@ -59,6 +59,8 @@ const (
     // The expiration time of version cache.
     versionCacheTTL = 60 * time.Second
+    // How frequently to report identical errors
+    identicalErrorDelay = 1 * time.Minute
 )

 var (
@@ -123,6 +125,12 @@ type kubeGenericRuntimeManager struct {
     // Manage RuntimeClass resources.
     runtimeClassManager *runtimeclass.Manager
+    // Cache last per-container error message to reduce log spam
+    lastError map[string]string
+    // Time last per-container error message was printed
+    errorPrinted map[string]time.Time
 }

 // KubeGenericRuntime is a interface contains interfaces for container runtime and command.
@@ -177,6 +185,8 @@ func NewKubeGenericRuntimeManager(
         internalLifecycle: internalLifecycle,
         legacyLogProvider: legacyLogProvider,
         runtimeClassManager: runtimeClassManager,
+        lastError: make(map[string]string),
+        errorPrinted: make(map[string]time.Time),
     }

     typedVersion, err := kubeRuntimeManager.runtimeService.Version(kubeRuntimeAPIVersion)
@@ -819,6 +829,15 @@ func (m *kubeGenericRuntimeManager) killPodWithSyncResult(pod *v1.Pod, runningPo
     return
 }

+func (m *kubeGenericRuntimeManager) cleanupErrorTimeouts() {
+    for name, timeout := range m.errorPrinted {
+        if time.Now().Sub(timeout) >= identicalErrorDelay {
+            delete(m.errorPrinted, name)
+            delete(m.lastError, name)
+        }
+    }
+}
+
 // GetPodStatus retrieves the status of the pod, including the
 // information of all containers in the pod that are visible in Runtime.
 func (m *kubeGenericRuntimeManager) GetPodStatus(uid kubetypes.UID, name, namespace string) (*kubecontainer.PodStatus, error) {
@@ -868,9 +887,16 @@ func (m *kubeGenericRuntimeManager) GetPodStatus(uid kubetypes.UID, name, namesp
     // Get statuses of all containers visible in the pod.
     containerStatuses, err := m.getPodContainerStatuses(uid, name, namespace)
     if err != nil {
-        klog.Errorf("getPodContainerStatuses for pod %q failed: %v", podFullName, err)
+        lastMsg, ok := m.lastError[podFullName]
+        if !ok || err.Error() != lastMsg || time.Now().Sub(m.errorPrinted[podFullName]) >= identicalErrorDelay {
+            klog.Errorf("getPodContainerStatuses for pod %q failed: %v", podFullName, err)
+            m.errorPrinted[podFullName] = time.Now()
+            m.lastError[podFullName] = err.Error()
+        }
         return nil, err
     }
+    delete(m.errorPrinted, podFullName)
+    delete(m.lastError, podFullName)
     return &kubecontainer.PodStatus{
         ID: uid,

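The GetPodStatus hunk above gates the Errorf behind two maps keyed by the full pod name: lastError remembers the last message that was logged and errorPrinted remembers when it was logged, so an identical failure is re-logged at most once per identicalErrorDelay, and both entries are cleared as soon as a call succeeds. The cleanupErrorTimeouts helper added earlier in this file drops entries older than identicalErrorDelay so the maps do not grow without bound. A self-contained sketch of that gate follows; the errorLimiter type and the shouldLog/clear names are hypothetical, introduced only for illustration and not part of the commit:

package main

import (
    "errors"
    "fmt"
    "time"
)

// identicalErrorDelay mirrors the constant added in the commit: identical
// errors for the same key are reported at most this often.
const identicalErrorDelay = 1 * time.Minute

// errorLimiter is a hypothetical stand-in for the lastError/errorPrinted
// pair of maps the commit adds to kubeGenericRuntimeManager.
type errorLimiter struct {
    lastError    map[string]string    // last message logged per key
    errorPrinted map[string]time.Time // when that message was logged
}

func newErrorLimiter() *errorLimiter {
    return &errorLimiter{
        lastError:    make(map[string]string),
        errorPrinted: make(map[string]time.Time),
    }
}

// shouldLog reports whether err is new for key, or old enough to repeat,
// and records it if so. This is the same condition the commit uses inline.
func (l *errorLimiter) shouldLog(key string, err error) bool {
    lastMsg, ok := l.lastError[key]
    if !ok || err.Error() != lastMsg || time.Since(l.errorPrinted[key]) >= identicalErrorDelay {
        l.errorPrinted[key] = time.Now()
        l.lastError[key] = err.Error()
        return true
    }
    return false
}

// clear drops the cached state once a call for key succeeds.
func (l *errorLimiter) clear(key string) {
    delete(l.lastError, key)
    delete(l.errorPrinted, key)
}

func main() {
    limiter := newErrorLimiter()
    failure := errors.New("rpc error: container not found")

    // A crash-looping container produces the same error over and over;
    // only the first occurrence within identicalErrorDelay is logged.
    for i := 0; i < 3; i++ {
        if limiter.shouldLog("default/crashloop-pod", failure) {
            fmt.Println("logged:", failure)
        } else {
            fmt.Println("suppressed duplicate")
        }
    }
}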

@@ -244,7 +244,8 @@ func (g *GenericPLEG) relist() {
             // serially may take a while. We should be aware of this and
             // parallelize if needed.
             if err := g.updateCache(pod, pid); err != nil {
-                klog.Errorf("PLEG: Ignoring events for pod %s/%s: %v", pod.Name, pod.Namespace, err)
+                // Rely on updateCache calling GetPodStatus to log the actual error.
+                klog.V(4).Infof("PLEG: Ignoring events for pod %s/%s: %v", pod.Name, pod.Namespace, err)

                 // make sure we try to reinspect the pod during the next relisting
                 needsReinspection[pid] = pod
@@ -274,7 +275,8 @@ func (g *GenericPLEG) relist() {
         klog.V(5).Infof("GenericPLEG: Reinspecting pods that previously failed inspection")
         for pid, pod := range g.podsToReinspect {
             if err := g.updateCache(pod, pid); err != nil {
-                klog.Errorf("PLEG: pod %s/%s failed reinspection: %v", pod.Name, pod.Namespace, err)
+                // Rely on updateCache calling GetPodStatus to log the actual error.
+                klog.V(5).Infof("PLEG: pod %s/%s failed reinspection: %v", pod.Name, pod.Namespace, err)
                 needsReinspection[pid] = pod
             }
         }

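Both PLEG hunks lower the log level rather than dropping the error, because updateCache ultimately calls GetPodStatus, which now logs the underlying failure itself (rate-limited as above). The surrounding retry machinery is unchanged: a pod whose cache update fails is recorded in needsReinspection and retried on the next relist. A rough sketch of that retry pattern, with simplified stand-ins for GenericPLEG (podID, relistOnce, and updateCache here are illustrative names, not the kubelet's actual signatures):

package main

import (
    "errors"
    "fmt"
)

type podID string

// updateCache stands in for GenericPLEG.updateCache: it inspects a pod and
// refreshes the status cache, returning an error if inspection fails.
func updateCache(id podID, healthy map[podID]bool) error {
    if !healthy[id] {
        return errors.New("container runtime is down")
    }
    return nil
}

// relistOnce mimics one relist pass: every changed pod is inspected, and the
// ones that fail are queued for reinspection on the next pass.
func relistOnce(changed []podID, toReinspect map[podID]bool, healthy map[podID]bool) map[podID]bool {
    needsReinspection := make(map[podID]bool)

    for _, id := range changed {
        if err := updateCache(id, healthy); err != nil {
            // Logged at V(4) in the commit; the real error surfaces from
            // GetPodStatus inside updateCache.
            fmt.Printf("ignoring events for %s: %v\n", id, err)
            needsReinspection[id] = true
        }
    }

    // Pods that failed on a previous pass are retried here.
    for id := range toReinspect {
        if err := updateCache(id, healthy); err != nil {
            needsReinspection[id] = true
        }
    }
    return needsReinspection
}

func main() {
    healthy := map[podID]bool{"default/web": true}
    pending := relistOnce([]podID{"default/web", "default/crashloop"}, nil, healthy)

    // "default/crashloop" failed inspection, so it is retried next relist.
    healthy["default/crashloop"] = true
    pending = relistOnce(nil, pending, healthy)
    fmt.Println("still pending after retry:", len(pending))
}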

@@ -36,8 +36,17 @@ import (
 type RemoteRuntimeService struct {
     timeout time.Duration
     runtimeClient runtimeapi.RuntimeServiceClient
+    // Cache last per-container error message to reduce log spam
+    lastError map[string]string
+    // Time last per-container error message was printed
+    errorPrinted map[string]time.Time
 }

+const (
+    // How frequently to report identical errors
+    identicalErrorDelay = 1 * time.Minute
+)

 // NewRemoteRuntimeService creates a new internalapi.RuntimeService.
 func NewRemoteRuntimeService(endpoint string, connectionTimeout time.Duration) (internalapi.RuntimeService, error) {
     klog.V(3).Infof("Connecting to runtime service %s", endpoint)
@@ -57,6 +66,8 @@ func NewRemoteRuntimeService(endpoint string, connectionTimeout time.Duration) (
     return &RemoteRuntimeService{
         timeout: connectionTimeout,
         runtimeClient: runtimeapi.NewRuntimeServiceClient(conn),
+        lastError: make(map[string]string),
+        errorPrinted: make(map[string]time.Time),
     }, nil
 }
@@ -225,6 +236,8 @@ func (r *RemoteRuntimeService) StopContainer(containerID string, timeout int64)
     ctx, cancel := getContextWithTimeout(t)
     defer cancel()
+    delete(r.lastError, containerID)
+    delete(r.errorPrinted, containerID)
     _, err := r.runtimeClient.StopContainer(ctx, &runtimeapi.StopContainerRequest{
         ContainerId: containerID,
         Timeout: timeout,
@@ -243,6 +256,8 @@ func (r *RemoteRuntimeService) RemoveContainer(containerID string) error {
     ctx, cancel := getContextWithTimeout(r.timeout)
     defer cancel()
+    delete(r.lastError, containerID)
+    delete(r.errorPrinted, containerID)
     _, err := r.runtimeClient.RemoveContainer(ctx, &runtimeapi.RemoveContainerRequest{
         ContainerId: containerID,
     })
@@ -270,6 +285,16 @@ func (r *RemoteRuntimeService) ListContainers(filter *runtimeapi.ContainerFilter
     return resp.Containers, nil
 }

+// Clean up any expired last-error timers
+func (r *RemoteRuntimeService) cleanupErrorTimeouts() {
+    for ID, timeout := range r.errorPrinted {
+        if time.Now().Sub(timeout) >= identicalErrorDelay {
+            delete(r.lastError, ID)
+            delete(r.errorPrinted, ID)
+        }
+    }
+}
+
 // ContainerStatus returns the container status.
 func (r *RemoteRuntimeService) ContainerStatus(containerID string) (*runtimeapi.ContainerStatus, error) {
     ctx, cancel := getContextWithTimeout(r.timeout)
@@ -278,10 +303,19 @@ func (r *RemoteRuntimeService) ContainerStatus(containerID string) (*runtimeapi.
     resp, err := r.runtimeClient.ContainerStatus(ctx, &runtimeapi.ContainerStatusRequest{
         ContainerId: containerID,
     })
+    r.cleanupErrorTimeouts()
     if err != nil {
-        klog.Errorf("ContainerStatus %q from runtime service failed: %v", containerID, err)
+        // Don't spam the log with endless messages about the same failure.
+        lastMsg, ok := r.lastError[containerID]
+        if !ok || err.Error() != lastMsg || time.Now().Sub(r.errorPrinted[containerID]) >= identicalErrorDelay {
+            klog.Errorf("ContainerStatus %q from runtime service failed: %v", containerID, err)
+            r.errorPrinted[containerID] = time.Now()
+            r.lastError[containerID] = err.Error()
+        }
         return nil, err
     }
+    delete(r.lastError, containerID)
+    delete(r.errorPrinted, containerID)

     if resp.Status != nil {
         if err := verifyContainerStatus(resp.Status); err != nil {
@@ -456,10 +490,18 @@ func (r *RemoteRuntimeService) ContainerStats(containerID string) (*runtimeapi.C
     resp, err := r.runtimeClient.ContainerStats(ctx, &runtimeapi.ContainerStatsRequest{
         ContainerId: containerID,
     })
+    r.cleanupErrorTimeouts()
     if err != nil {
-        klog.Errorf("ContainerStatus %q from runtime service failed: %v", containerID, err)
+        lastMsg, ok := r.lastError[containerID]
+        if !ok || err.Error() != lastMsg || time.Now().Sub(r.errorPrinted[containerID]) >= identicalErrorDelay {
+            klog.Errorf("ContainerStatus %q from runtime service failed: %v", containerID, err)
+            r.errorPrinted[containerID] = time.Now()
+            r.lastError[containerID] = err.Error()
+        }
        return nil, err
     }
+    delete(r.lastError, containerID)
+    delete(r.errorPrinted, containerID)

     return resp.GetStats(), nil
 }