From 54224600ec9b18b38c3f656cee85d4c68fb9bbf0 Mon Sep 17 00:00:00 2001 From: Derek Carr Date: Thu, 12 Oct 2017 21:52:36 -0400 Subject: [PATCH] kubelet syncPod throws specific events --- pkg/kubelet/events/event.go | 7 +++++++ pkg/kubelet/kubelet.go | 11 +++++++++++ pkg/kubelet/kuberuntime/kuberuntime_container.go | 1 + pkg/kubelet/kuberuntime/kuberuntime_manager.go | 5 +++++ pkg/kubelet/pod_workers.go | 11 ++++------- 5 files changed, 28 insertions(+), 7 deletions(-) diff --git a/pkg/kubelet/events/event.go b/pkg/kubelet/events/event.go index a1bb2682ed..5ef68d9b44 100644 --- a/pkg/kubelet/events/event.go +++ b/pkg/kubelet/events/event.go @@ -27,6 +27,12 @@ const ( BackOffStartContainer = "BackOff" ExceededGracePeriod = "ExceededGracePeriod" + // Pod event reason list + FailedToKillPod = "FailedKillPod" + FailedToCreatePodContainer = "FailedCreatePodContainer" + FailedToMakePodDataDirectories = "Failed" + NetworkNotReady = "NetworkNotReady" + // Image event reason list PullingImage = "Pulling" PulledImage = "Pulled" @@ -65,6 +71,7 @@ const ( UnsupportedMountOption = "UnsupportedMountOption" SandboxChanged = "SandboxChanged" FailedCreatePodSandBox = "FailedCreatePodSandBox" + FailedStatusPodSandBox = "FailedPodSandBoxStatus" // Image manager event reason list InvalidDiskCapacity = "InvalidDiskCapacity" diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index 68b8fc4b62..7855caa5a2 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -1474,6 +1474,10 @@ func (kl *Kubelet) GetClusterDNS(pod *v1.Pod) ([]string, []string, bool, error) // // If any step of this workflow errors, the error is returned, and is repeated // on the next syncPod call. +// +// This operation writes all events that are dispatched in order to provide +// the most accurate information possible about an error situation to aid debugging. +// Callers should not throw an event if this operation returns an error. 
func (kl *Kubelet) syncPod(o syncPodOptions) error { // pull out the required options pod := o.pod @@ -1491,6 +1495,7 @@ func (kl *Kubelet) syncPod(o syncPodOptions) error { kl.statusManager.SetPodStatus(pod, apiPodStatus) // we kill the pod with the specified grace period since this is a termination if err := kl.killPod(pod, nil, podStatus, killPodOptions.PodTerminationGracePeriodSecondsOverride); err != nil { + kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToKillPod, "error killing pod: %v", err) // there was an error killing the pod, so we return that error directly utilruntime.HandleError(err) return err @@ -1557,6 +1562,7 @@ func (kl *Kubelet) syncPod(o syncPodOptions) error { if !runnable.Admit || pod.DeletionTimestamp != nil || apiPodStatus.Phase == v1.PodFailed { var syncErr error if err := kl.killPod(pod, nil, podStatus, nil); err != nil { + kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToKillPod, "error killing pod: %v", err) syncErr = fmt.Errorf("error killing pod: %v", err) utilruntime.HandleError(syncErr) } else { @@ -1571,6 +1577,7 @@ func (kl *Kubelet) syncPod(o syncPodOptions) error { // If the network plugin is not ready, only start the pod if it uses the host network if rs := kl.runtimeState.networkErrors(); len(rs) != 0 && !kubecontainer.IsHostNetworkPod(pod) { + kl.recorder.Eventf(pod, v1.EventTypeWarning, events.NetworkNotReady, "network is not ready: %v", rs) return fmt.Errorf("network is not ready: %v", rs) } @@ -1613,6 +1620,7 @@ func (kl *Kubelet) syncPod(o syncPodOptions) error { glog.V(2).Infof("Failed to update QoS cgroups while syncing pod: %v", err) } if err := pcm.EnsureExists(pod); err != nil { + kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToCreatePodContainer, "unable to ensure pod container exists: %v", err) return fmt.Errorf("failed to ensure that the pod: %v cgroups exist and are correctly applied: %v", pod.UID, err) } } @@ -1650,6 +1658,7 @@ func (kl *Kubelet) syncPod(o syncPodOptions) error 
{ // Make data directories for the pod if err := kl.makePodDataDirs(pod); err != nil { + kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToMakePodDataDirectories, "error making pod data directories: %v", err) glog.Errorf("Unable to make pod data directories for pod %q: %v", format.Pod(pod), err) return err } @@ -1671,6 +1680,8 @@ func (kl *Kubelet) syncPod(o syncPodOptions) error { result := kl.containerRuntime.SyncPod(pod, apiPodStatus, podStatus, pullSecrets, kl.backOff) kl.reasonCache.Update(pod.UID, result) if err := result.Error(); err != nil { + // Do not record an event here, as we keep all event logging for sync pod failures + // local to container runtime so we get better errors return err } diff --git a/pkg/kubelet/kuberuntime/kuberuntime_container.go b/pkg/kubelet/kuberuntime/kuberuntime_container.go index 46de4373df..316fe1020a 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_container.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_container.go @@ -89,6 +89,7 @@ func (m *kubeGenericRuntimeManager) startContainer(podSandboxID string, podSandb // Step 1: pull the image. 
imageRef, msg, err := m.imagePuller.EnsureImageExists(pod, container, pullSecrets) if err != nil { + m.recordContainerEvent(pod, container, "", v1.EventTypeWarning, events.FailedToCreateContainer, "Error: %v", grpc.ErrorDesc(err)) return msg, err } diff --git a/pkg/kubelet/kuberuntime/kuberuntime_manager.go b/pkg/kubelet/kuberuntime/kuberuntime_manager.go index 75e238014a..7f936d387f 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_manager.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_manager.go @@ -642,6 +642,11 @@ func (m *kubeGenericRuntimeManager) SyncPod(pod *v1.Pod, _ v1.PodStatus, podStat podSandboxStatus, err := m.runtimeService.PodSandboxStatus(podSandboxID) if err != nil { + ref, referr := ref.GetReference(api.Scheme, pod) + if referr != nil { + glog.Errorf("Couldn't make a ref to pod %q: '%v'", format.Pod(pod), referr) + } + m.recorder.Eventf(ref, v1.EventTypeWarning, events.FailedStatusPodSandBox, "Unable to get pod sandbox status: %v", err) glog.Errorf("Failed to get pod sandbox status: %v; Skipping pod %q", err, format.Pod(pod)) result.Fail(err) return diff --git a/pkg/kubelet/pod_workers.go b/pkg/kubelet/pod_workers.go index 91f48be899..6a8a33353f 100644 --- a/pkg/kubelet/pod_workers.go +++ b/pkg/kubelet/pod_workers.go @@ -162,6 +162,9 @@ func (p *podWorkers) managePodLoop(podUpdates <-chan UpdatePodOptions) { // the previous sync.
status, err := p.podCache.GetNewerThan(podUID, lastSyncTime) if err != nil { + // This is the legacy event thrown by manage pod loop + // all other events are now dispatched from syncPodFn + p.recorder.Eventf(update.Pod, v1.EventTypeWarning, events.FailedSync, "error determining status: %v", err) return err } err = p.syncPodFn(syncPodOptions{ @@ -179,14 +182,8 @@ func (p *podWorkers) managePodLoop(podUpdates <-chan UpdatePodOptions) { update.OnCompleteFunc(err) } if err != nil { + // IMPORTANT: we do not log errors here, the syncPodFn is responsible for logging errors glog.Errorf("Error syncing pod %s (%q), skipping: %v", update.Pod.UID, format.Pod(update.Pod), err) - // if we failed sync, we throw more specific events for why it happened. - // as a result, i question the value of this event. - // TODO: determine if we can remove this in a future release. - // do not include descriptive text that can vary on why it failed so in a pathological - // scenario, kubelet does not create enough discrete events that miss default aggregation - // window. - p.recorder.Eventf(update.Pod, v1.EventTypeWarning, events.FailedSync, "Error syncing pod") } p.wrapUp(update.Pod.UID, err) }