mirror of https://github.com/k3s-io/k3s
Reduce logspam for crash looping containers
parent
0ca67bde54
commit
3373fcf0fc
|
@ -394,7 +394,8 @@ func (m *kubeGenericRuntimeManager) getPodContainerStatuses(uid kubetypes.UID, n
|
|||
for i, c := range containers {
|
||||
status, err := m.runtimeService.ContainerStatus(c.Id)
|
||||
if err != nil {
|
||||
klog.Errorf("ContainerStatus for %s error: %v", c.Id, err)
|
||||
// Merely log this here; GetPodStatus will actually report the error out.
|
||||
klog.V(4).Infof("ContainerStatus for %s error: %v", c.Id, err)
|
||||
return nil, err
|
||||
}
|
||||
cStatus := toKubeContainerStatus(status, m.runtimeName)
|
||||
|
|
|
@ -59,6 +59,8 @@ const (
|
|||
|
||||
// The expiration time of version cache.
|
||||
versionCacheTTL = 60 * time.Second
|
||||
// How frequently to report identical errors
|
||||
identicalErrorDelay = 1 * time.Minute
|
||||
)
|
||||
|
||||
var (
|
||||
|
@ -123,6 +125,12 @@ type kubeGenericRuntimeManager struct {
|
|||
|
||||
// Manage RuntimeClass resources.
|
||||
runtimeClassManager *runtimeclass.Manager
|
||||
|
||||
// Cache last per-container error message to reduce log spam
|
||||
lastError map[string]string
|
||||
|
||||
// Time last per-container error message was printed
|
||||
errorPrinted map[string]time.Time
|
||||
}
|
||||
|
||||
// KubeGenericRuntime is a interface contains interfaces for container runtime and command.
|
||||
|
@ -177,6 +185,8 @@ func NewKubeGenericRuntimeManager(
|
|||
internalLifecycle: internalLifecycle,
|
||||
legacyLogProvider: legacyLogProvider,
|
||||
runtimeClassManager: runtimeClassManager,
|
||||
lastError: make(map[string]string),
|
||||
errorPrinted: make(map[string]time.Time),
|
||||
}
|
||||
|
||||
typedVersion, err := kubeRuntimeManager.runtimeService.Version(kubeRuntimeAPIVersion)
|
||||
|
@ -819,6 +829,15 @@ func (m *kubeGenericRuntimeManager) killPodWithSyncResult(pod *v1.Pod, runningPo
|
|||
return
|
||||
}
|
||||
|
||||
func (m *kubeGenericRuntimeManager) cleanupErrorTimeouts() {
|
||||
for name, timeout := range m.errorPrinted {
|
||||
if time.Now().Sub(timeout) >= identicalErrorDelay {
|
||||
delete(m.errorPrinted, name)
|
||||
delete(m.lastError, name)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// GetPodStatus retrieves the status of the pod, including the
|
||||
// information of all containers in the pod that are visible in Runtime.
|
||||
func (m *kubeGenericRuntimeManager) GetPodStatus(uid kubetypes.UID, name, namespace string) (*kubecontainer.PodStatus, error) {
|
||||
|
@ -868,9 +887,16 @@ func (m *kubeGenericRuntimeManager) GetPodStatus(uid kubetypes.UID, name, namesp
|
|||
// Get statuses of all containers visible in the pod.
|
||||
containerStatuses, err := m.getPodContainerStatuses(uid, name, namespace)
|
||||
if err != nil {
|
||||
klog.Errorf("getPodContainerStatuses for pod %q failed: %v", podFullName, err)
|
||||
lastMsg, ok := m.lastError[podFullName]
|
||||
if !ok || err.Error() != lastMsg || time.Now().Sub(m.errorPrinted[podFullName]) >= identicalErrorDelay {
|
||||
klog.Errorf("getPodContainerStatuses for pod %q failed: %v", podFullName, err)
|
||||
m.errorPrinted[podFullName] = time.Now()
|
||||
m.lastError[podFullName] = err.Error()
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
delete(m.errorPrinted, podFullName)
|
||||
delete(m.lastError, podFullName)
|
||||
|
||||
return &kubecontainer.PodStatus{
|
||||
ID: uid,
|
||||
|
|
|
@ -244,7 +244,8 @@ func (g *GenericPLEG) relist() {
|
|||
// serially may take a while. We should be aware of this and
|
||||
// parallelize if needed.
|
||||
if err := g.updateCache(pod, pid); err != nil {
|
||||
klog.Errorf("PLEG: Ignoring events for pod %s/%s: %v", pod.Name, pod.Namespace, err)
|
||||
// Rely on updateCache calling GetPodStatus to log the actual error.
|
||||
klog.V(4).Infof("PLEG: Ignoring events for pod %s/%s: %v", pod.Name, pod.Namespace, err)
|
||||
|
||||
// make sure we try to reinspect the pod during the next relisting
|
||||
needsReinspection[pid] = pod
|
||||
|
@ -274,7 +275,8 @@ func (g *GenericPLEG) relist() {
|
|||
klog.V(5).Infof("GenericPLEG: Reinspecting pods that previously failed inspection")
|
||||
for pid, pod := range g.podsToReinspect {
|
||||
if err := g.updateCache(pod, pid); err != nil {
|
||||
klog.Errorf("PLEG: pod %s/%s failed reinspection: %v", pod.Name, pod.Namespace, err)
|
||||
// Rely on updateCache calling GetPodStatus to log the actual error.
|
||||
klog.V(5).Infof("PLEG: pod %s/%s failed reinspection: %v", pod.Name, pod.Namespace, err)
|
||||
needsReinspection[pid] = pod
|
||||
}
|
||||
}
|
||||
|
|
|
@ -36,8 +36,17 @@ import (
|
|||
type RemoteRuntimeService struct {
|
||||
timeout time.Duration
|
||||
runtimeClient runtimeapi.RuntimeServiceClient
|
||||
// Cache last per-container error message to reduce log spam
|
||||
lastError map[string]string
|
||||
// Time last per-container error message was printed
|
||||
errorPrinted map[string]time.Time
|
||||
}
|
||||
|
||||
const (
|
||||
// How frequently to report identical errors
|
||||
identicalErrorDelay = 1 * time.Minute
|
||||
)
|
||||
|
||||
// NewRemoteRuntimeService creates a new internalapi.RuntimeService.
|
||||
func NewRemoteRuntimeService(endpoint string, connectionTimeout time.Duration) (internalapi.RuntimeService, error) {
|
||||
klog.V(3).Infof("Connecting to runtime service %s", endpoint)
|
||||
|
@ -57,6 +66,8 @@ func NewRemoteRuntimeService(endpoint string, connectionTimeout time.Duration) (
|
|||
return &RemoteRuntimeService{
|
||||
timeout: connectionTimeout,
|
||||
runtimeClient: runtimeapi.NewRuntimeServiceClient(conn),
|
||||
lastError: make(map[string]string),
|
||||
errorPrinted: make(map[string]time.Time),
|
||||
}, nil
|
||||
}
|
||||
|
||||
|
@ -225,6 +236,8 @@ func (r *RemoteRuntimeService) StopContainer(containerID string, timeout int64)
|
|||
ctx, cancel := getContextWithTimeout(t)
|
||||
defer cancel()
|
||||
|
||||
delete(r.lastError, containerID)
|
||||
delete(r.errorPrinted, containerID)
|
||||
_, err := r.runtimeClient.StopContainer(ctx, &runtimeapi.StopContainerRequest{
|
||||
ContainerId: containerID,
|
||||
Timeout: timeout,
|
||||
|
@ -243,6 +256,8 @@ func (r *RemoteRuntimeService) RemoveContainer(containerID string) error {
|
|||
ctx, cancel := getContextWithTimeout(r.timeout)
|
||||
defer cancel()
|
||||
|
||||
delete(r.lastError, containerID)
|
||||
delete(r.errorPrinted, containerID)
|
||||
_, err := r.runtimeClient.RemoveContainer(ctx, &runtimeapi.RemoveContainerRequest{
|
||||
ContainerId: containerID,
|
||||
})
|
||||
|
@ -270,6 +285,16 @@ func (r *RemoteRuntimeService) ListContainers(filter *runtimeapi.ContainerFilter
|
|||
return resp.Containers, nil
|
||||
}
|
||||
|
||||
// Clean up any expired last-error timers
|
||||
func (r *RemoteRuntimeService) cleanupErrorTimeouts() {
|
||||
for ID, timeout := range r.errorPrinted {
|
||||
if time.Now().Sub(timeout) >= identicalErrorDelay {
|
||||
delete(r.lastError, ID)
|
||||
delete(r.errorPrinted, ID)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ContainerStatus returns the container status.
|
||||
func (r *RemoteRuntimeService) ContainerStatus(containerID string) (*runtimeapi.ContainerStatus, error) {
|
||||
ctx, cancel := getContextWithTimeout(r.timeout)
|
||||
|
@ -278,10 +303,19 @@ func (r *RemoteRuntimeService) ContainerStatus(containerID string) (*runtimeapi.
|
|||
resp, err := r.runtimeClient.ContainerStatus(ctx, &runtimeapi.ContainerStatusRequest{
|
||||
ContainerId: containerID,
|
||||
})
|
||||
r.cleanupErrorTimeouts()
|
||||
if err != nil {
|
||||
klog.Errorf("ContainerStatus %q from runtime service failed: %v", containerID, err)
|
||||
// Don't spam the log with endless messages about the same failure.
|
||||
lastMsg, ok := r.lastError[containerID]
|
||||
if !ok || err.Error() != lastMsg || time.Now().Sub(r.errorPrinted[containerID]) >= identicalErrorDelay {
|
||||
klog.Errorf("ContainerStatus %q from runtime service failed: %v", containerID, err)
|
||||
r.errorPrinted[containerID] = time.Now()
|
||||
r.lastError[containerID] = err.Error()
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
delete(r.lastError, containerID)
|
||||
delete(r.errorPrinted, containerID)
|
||||
|
||||
if resp.Status != nil {
|
||||
if err := verifyContainerStatus(resp.Status); err != nil {
|
||||
|
@ -456,10 +490,18 @@ func (r *RemoteRuntimeService) ContainerStats(containerID string) (*runtimeapi.C
|
|||
resp, err := r.runtimeClient.ContainerStats(ctx, &runtimeapi.ContainerStatsRequest{
|
||||
ContainerId: containerID,
|
||||
})
|
||||
r.cleanupErrorTimeouts()
|
||||
if err != nil {
|
||||
klog.Errorf("ContainerStatus %q from runtime service failed: %v", containerID, err)
|
||||
lastMsg, ok := r.lastError[containerID]
|
||||
if !ok || err.Error() != lastMsg || time.Now().Sub(r.errorPrinted[containerID]) >= identicalErrorDelay {
|
||||
klog.Errorf("ContainerStatus %q from runtime service failed: %v", containerID, err)
|
||||
r.errorPrinted[containerID] = time.Now()
|
||||
r.lastError[containerID] = err.Error()
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
delete(r.lastError, containerID)
|
||||
delete(r.errorPrinted, containerID)
|
||||
|
||||
return resp.GetStats(), nil
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue