Merge pull request #53857 from derekwaynecarr/sync-event

Automatic merge from submit-queue (batch tested with PRs 51840, 53542, 53857, 53831, 53702). If you want to cherry-pick this change to another branch, please follow the instructions [here](https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md).

kubelet sync pod throws more detailed events

**What this PR does / why we need it**:
If there are errors during the kubelet's syncPod iteration, it is difficult to determine the underlying problem.

This PR dispatches more specific events for the errors that occur in the syncPod iteration, to help isolate where a pod sync failure originated.

Fixes https://github.com/kubernetes/kubernetes/issues/53900
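
For illustration, here is a minimal client-go sketch (not part of this PR) that lists the warning events attached to a pod; with this change the `Reason` field names the failing syncPod step (for example `FailedKillPod`, `FailedCreatePodContainer`, or `NetworkNotReady`) instead of only the generic `FailedSync`. The namespace `default` and pod name `my-pod` are placeholders, and the snippet assumes a current client-go where `List` takes a `context`:

```go
package main

import (
	"context"
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

func main() {
	// Placeholder setup: build a clientset from the local kubeconfig.
	config, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
	if err != nil {
		panic(err)
	}
	clientset, err := kubernetes.NewForConfig(config)
	if err != nil {
		panic(err)
	}

	// List warning events recorded against a specific pod. After this PR the
	// Reason field identifies which syncPod step failed.
	evs, err := clientset.CoreV1().Events("default").List(context.TODO(), metav1.ListOptions{
		FieldSelector: "involvedObject.name=my-pod,type=Warning",
	})
	if err != nil {
		panic(err)
	}
	for _, e := range evs.Items {
		fmt.Printf("%s\t%s\t%s\n", e.LastTimestamp.Format("15:04:05"), e.Reason, e.Message)
	}
}
```

The same events also appear under `Events:` in `kubectl describe pod` output.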

**Special notes for your reviewer**:
It is safer to dispatch more specific events now that a per-object event budget is enforced via https://github.com/kubernetes/kubernetes/pull/47367

**Release note**:
```release-note
kubelet provides more specific events when unable to sync pod
```
Kubernetes Submit Queue 2017-10-13 12:44:52 -07:00 committed by GitHub
commit e6e23ae163
5 changed files with 28 additions and 7 deletions

```diff
@@ -27,6 +27,12 @@ const (
 	BackOffStartContainer = "BackOff"
 	ExceededGracePeriod   = "ExceededGracePeriod"
+	// Pod event reason list
+	FailedToKillPod                = "FailedKillPod"
+	FailedToCreatePodContainer     = "FailedCreatePodContainer"
+	FailedToMakePodDataDirectories = "Failed"
+	NetworkNotReady                = "NetworkNotReady"
 	// Image event reason list
 	PullingImage = "Pulling"
 	PulledImage  = "Pulled"
@@ -65,6 +71,7 @@ const (
 	UnsupportedMountOption = "UnsupportedMountOption"
 	SandboxChanged         = "SandboxChanged"
 	FailedCreatePodSandBox = "FailedCreatePodSandBox"
+	FailedStatusPodSandBox = "FailedPodSandBoxStatus"
 	// Image manager event reason list
 	InvalidDiskCapacity = "InvalidDiskCapacity"
```
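
For quick reference, the sketch below (illustrative only, derived from the diffs in this PR rather than any exported API) pairs each new reason string with the syncPod failure it signals; note that `FailedToMakePodDataDirectories` reuses the pre-existing `"Failed"` reason string.

```go
package main

import "fmt"

// newSyncPodEventReasons is an illustrative summary of the event reason strings
// introduced or reused by this PR and the failure each one signals.
var newSyncPodEventReasons = map[string]string{
	"FailedKillPod":            "killing the pod failed (FailedToKillPod)",
	"FailedCreatePodContainer": "the pod-level cgroup could not be ensured (FailedToCreatePodContainer)",
	"Failed":                   "the pod data directories could not be created (FailedToMakePodDataDirectories)",
	"NetworkNotReady":          "the network plugin is not ready and the pod does not use host networking",
	"FailedPodSandBoxStatus":   "the pod sandbox status could not be retrieved (FailedStatusPodSandBox)",
}

func main() {
	for reason, failure := range newSyncPodEventReasons {
		fmt.Printf("%-24s -> %s\n", reason, failure)
	}
}
```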

```diff
@@ -1474,6 +1474,10 @@ func (kl *Kubelet) GetClusterDNS(pod *v1.Pod) ([]string, []string, bool, error)
 //
 // If any step of this workflow errors, the error is returned, and is repeated
 // on the next syncPod call.
+//
+// This operation writes all events that are dispatched in order to provide
+// the most accurate information possible about an error situation to aid debugging.
+// Callers should not throw an event if this operation returns an error.
 func (kl *Kubelet) syncPod(o syncPodOptions) error {
 	// pull out the required options
 	pod := o.pod
@@ -1491,6 +1495,7 @@ func (kl *Kubelet) syncPod(o syncPodOptions) error {
 		kl.statusManager.SetPodStatus(pod, apiPodStatus)
 		// we kill the pod with the specified grace period since this is a termination
 		if err := kl.killPod(pod, nil, podStatus, killPodOptions.PodTerminationGracePeriodSecondsOverride); err != nil {
+			kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToKillPod, "error killing pod: %v", err)
 			// there was an error killing the pod, so we return that error directly
 			utilruntime.HandleError(err)
 			return err
@@ -1557,6 +1562,7 @@ func (kl *Kubelet) syncPod(o syncPodOptions) error {
 	if !runnable.Admit || pod.DeletionTimestamp != nil || apiPodStatus.Phase == v1.PodFailed {
 		var syncErr error
 		if err := kl.killPod(pod, nil, podStatus, nil); err != nil {
+			kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToKillPod, "error killing pod: %v", err)
 			syncErr = fmt.Errorf("error killing pod: %v", err)
 			utilruntime.HandleError(syncErr)
 		} else {
@@ -1571,6 +1577,7 @@ func (kl *Kubelet) syncPod(o syncPodOptions) error {
 	// If the network plugin is not ready, only start the pod if it uses the host network
 	if rs := kl.runtimeState.networkErrors(); len(rs) != 0 && !kubecontainer.IsHostNetworkPod(pod) {
+		kl.recorder.Eventf(pod, v1.EventTypeWarning, events.NetworkNotReady, "network is not ready: %v", rs)
 		return fmt.Errorf("network is not ready: %v", rs)
 	}
@@ -1613,6 +1620,7 @@ func (kl *Kubelet) syncPod(o syncPodOptions) error {
 			glog.V(2).Infof("Failed to update QoS cgroups while syncing pod: %v", err)
 		}
 		if err := pcm.EnsureExists(pod); err != nil {
+			kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToCreatePodContainer, "unable to ensure pod container exists: %v", err)
 			return fmt.Errorf("failed to ensure that the pod: %v cgroups exist and are correctly applied: %v", pod.UID, err)
 		}
 	}
@@ -1650,6 +1658,7 @@ func (kl *Kubelet) syncPod(o syncPodOptions) error {
 	// Make data directories for the pod
 	if err := kl.makePodDataDirs(pod); err != nil {
+		kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToMakePodDataDirectories, "error making pod data directories: %v", err)
 		glog.Errorf("Unable to make pod data directories for pod %q: %v", format.Pod(pod), err)
 		return err
 	}
@@ -1671,6 +1680,8 @@ func (kl *Kubelet) syncPod(o syncPodOptions) error {
 	result := kl.containerRuntime.SyncPod(pod, apiPodStatus, podStatus, pullSecrets, kl.backOff)
 	kl.reasonCache.Update(pod.UID, result)
 	if err := result.Error(); err != nil {
+		// Do not record an event here, as we keep all event logging for sync pod failures
+		// local to container runtime so we get better errors
 		return err
 	}
```

```diff
@@ -89,6 +89,7 @@ func (m *kubeGenericRuntimeManager) startContainer(podSandboxID string, podSandb
 	// Step 1: pull the image.
 	imageRef, msg, err := m.imagePuller.EnsureImageExists(pod, container, pullSecrets)
 	if err != nil {
+		m.recordContainerEvent(pod, container, "", v1.EventTypeWarning, events.FailedToCreateContainer, "Error: %v", grpc.ErrorDesc(err))
 		return msg, err
 	}
```

```diff
@@ -642,6 +642,11 @@ func (m *kubeGenericRuntimeManager) SyncPod(pod *v1.Pod, _ v1.PodStatus, podStat
 		podSandboxStatus, err := m.runtimeService.PodSandboxStatus(podSandboxID)
 		if err != nil {
+			ref, referr := ref.GetReference(api.Scheme, pod)
+			if referr != nil {
+				glog.Errorf("Couldn't make a ref to pod %q: '%v'", format.Pod(pod), referr)
+			}
+			m.recorder.Eventf(ref, v1.EventTypeWarning, events.FailedStatusPodSandBox, "Unable to get pod sandbox status: %v", err)
 			glog.Errorf("Failed to get pod sandbox status: %v; Skipping pod %q", err, format.Pod(pod))
 			result.Fail(err)
 			return
```

```diff
@@ -162,6 +162,9 @@ func (p *podWorkers) managePodLoop(podUpdates <-chan UpdatePodOptions) {
 			// the previous sync.
 			status, err := p.podCache.GetNewerThan(podUID, lastSyncTime)
 			if err != nil {
+				// This is the legacy event thrown by manage pod loop
+				// all other events are now dispatched from syncPodFn
+				p.recorder.Eventf(update.Pod, v1.EventTypeWarning, events.FailedSync, "error determining status: %v", err)
 				return err
 			}
 			err = p.syncPodFn(syncPodOptions{
@@ -179,14 +182,8 @@ func (p *podWorkers) managePodLoop(podUpdates <-chan UpdatePodOptions) {
 			update.OnCompleteFunc(err)
 		}
 		if err != nil {
+			// IMPORTANT: we do not log errors here, the syncPodFn is responsible for logging errors
 			glog.Errorf("Error syncing pod %s (%q), skipping: %v", update.Pod.UID, format.Pod(update.Pod), err)
-			// if we failed sync, we throw more specific events for why it happened.
-			// as a result, i question the value of this event.
-			// TODO: determine if we can remove this in a future release.
-			// do not include descriptive text that can vary on why it failed so in a pathological
-			// scenario, kubelet does not create enough discrete events that miss default aggregation
-			// window.
-			p.recorder.Eventf(update.Pod, v1.EventTypeWarning, events.FailedSync, "Error syncing pod")
 		}
 		p.wrapUp(update.Pod.UID, err)
 	}
```