Kubelet: surface the container creation/start failure reason

Container creation/start failures cannot be reproduced by inspecting the
containers. This change caches such errors so that the kubelet can retrieve them
later.

This change also extends FakeDockerClient to support setting an error response
for a specific function.
Yu-Ju Hong 2015-04-09 11:57:53 -07:00
parent c7ca118c7a
commit bb417e82d7
5 changed files with 231 additions and 96 deletions
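
Before the diffs, a minimal sketch of the idea (simplified and illustrative: the real change below uses a bounded LRU cache keyed by <pod_UID>_<container_name>, not a plain map):

package main

import (
	"fmt"
	"sync"
)

// reasonCache is a simplified stand-in for the kubelet's reason cache: it
// remembers the last create/start error string per (pod UID, container name).
type reasonCache struct {
	lock  sync.RWMutex
	cache map[string]string
}

func key(uid, name string) string { return fmt.Sprintf("%s_%s", uid, name) }

func (r *reasonCache) Add(uid, name, reason string) {
	r.lock.Lock()
	defer r.lock.Unlock()
	r.cache[key(uid, name)] = reason
}

func (r *reasonCache) Get(uid, name string) (string, bool) {
	r.lock.RLock()
	defer r.lock.RUnlock()
	reason, ok := r.cache[key(uid, name)]
	return reason, ok
}

func main() {
	rc := &reasonCache{cache: map[string]string{}}
	// A failed create/start records its error string...
	rc.Add("12345678", "foo", "creation failure")
	// ...which is later surfaced as the container's Waiting.Reason in the pod status.
	if reason, ok := rc.Get("12345678", "foo"); ok {
		fmt.Println("Waiting.Reason =", reason)
	}
}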


@ -243,7 +243,7 @@ func TestPull(t *testing.T) {
func TestDockerKeyringLookupFails(t *testing.T) {
fakeKeyring := &credentialprovider.FakeKeyring{}
fakeClient := &FakeDockerClient{
Err: fmt.Errorf("test error"),
Errors: map[string]error{"pull": fmt.Errorf("test error")},
}
dp := dockerPuller{
@ -394,7 +394,7 @@ func TestIsImagePresent(t *testing.T) {
}
func TestGetRunningContainers(t *testing.T) {
fakeDocker := &FakeDockerClient{}
fakeDocker := &FakeDockerClient{Errors: make(map[string]error)}
fakeRecorder := &record.FakeRecorder{}
containerManager := NewDockerManager(fakeDocker, fakeRecorder, PodInfraContainerImage)
tests := []struct {
@ -478,14 +478,16 @@ func TestGetRunningContainers(t *testing.T) {
}
for _, test := range tests {
fakeDocker.ContainerMap = test.containers
fakeDocker.Err = test.err
if test.err != nil {
fakeDocker.Errors["inspect_container"] = test.err
}
if results, err := containerManager.GetRunningContainers(test.inputIDs); err == nil {
resultIDs := []string{}
for _, result := range results {
resultIDs = append(resultIDs, result.ID)
}
if !reflect.DeepEqual(resultIDs, test.expectedIDs) {
t.Errorf("expected: %v, saw: %v", test.expectedIDs, resultIDs)
t.Errorf("expected: %#v, saw: %#v", test.expectedIDs, resultIDs)
}
if err != nil {
t.Errorf("unexpected error: %v", err)


@ -38,7 +38,7 @@ type FakeDockerClient struct {
ContainerMap map[string]*docker.Container
Image *docker.Image
Images []docker.APIImages
Err error
Errors map[string]error
called []string
Stopped []string
pulled []string
@ -118,17 +118,30 @@ func (f *FakeDockerClient) AssertUnorderedCalls(calls []string) (err error) {
return
}
func (f *FakeDockerClient) popError(op string) error {
if f.Errors == nil {
return nil
}
err, ok := f.Errors[op]
if ok {
delete(f.Errors, op)
return err
} else {
return nil
}
}
// ListContainers is a test-spy implementation of DockerInterface.ListContainers.
// It adds an entry "list" to the internal method call record.
func (f *FakeDockerClient) ListContainers(options docker.ListContainersOptions) ([]docker.APIContainers, error) {
f.Lock()
defer f.Unlock()
f.called = append(f.called, "list")
err := f.popError("list")
if options.All {
return append(f.ContainerList, f.ExitedContainerList...), f.Err
return append(f.ContainerList, f.ExitedContainerList...), err
}
return f.ContainerList, f.Err
return f.ContainerList, err
}
// InspectContainer is a test-spy implementation of DockerInterface.InspectContainer.
@ -137,12 +150,13 @@ func (f *FakeDockerClient) InspectContainer(id string) (*docker.Container, error
f.Lock()
defer f.Unlock()
f.called = append(f.called, "inspect_container")
err := f.popError("inspect_container")
if f.ContainerMap != nil {
if container, ok := f.ContainerMap[id]; ok {
return container, f.Err
return container, err
}
}
return f.Container, f.Err
return f.Container, err
}
// InspectImage is a test-spy implementation of DockerInterface.InspectImage.
@ -151,7 +165,8 @@ func (f *FakeDockerClient) InspectImage(name string) (*docker.Image, error) {
f.Lock()
defer f.Unlock()
f.called = append(f.called, "inspect_image")
return f.Image, f.Err
err := f.popError("inspect_image")
return f.Image, err
}
// CreateContainer is a test-spy implementation of DockerInterface.CreateContainer.
@ -160,12 +175,16 @@ func (f *FakeDockerClient) CreateContainer(c docker.CreateContainerOptions) (*do
f.Lock()
defer f.Unlock()
f.called = append(f.called, "create")
f.Created = append(f.Created, c.Name)
// This is not a very good fake. We'll just add this container's name to the list.
// Docker likes to add a '/', so copy that behavior.
name := "/" + c.Name
f.ContainerList = append(f.ContainerList, docker.APIContainers{ID: name, Names: []string{name}, Image: c.Config.Image})
return &docker.Container{ID: name}, nil
err := f.popError("create")
if err == nil {
f.Created = append(f.Created, c.Name)
// This is not a very good fake. We'll just add this container's name to the list.
// Docker likes to add a '/', so copy that behavior.
name := "/" + c.Name
f.ContainerList = append(f.ContainerList, docker.APIContainers{ID: name, Names: []string{name}, Image: c.Config.Image})
return &docker.Container{ID: name}, nil
}
return nil, err
}
// StartContainer is a test-spy implementation of DockerInterface.StartContainer.
@ -174,18 +193,22 @@ func (f *FakeDockerClient) StartContainer(id string, hostConfig *docker.HostConf
f.Lock()
defer f.Unlock()
f.called = append(f.called, "start")
f.Container = &docker.Container{
ID: id,
Name: id, // For testing purpose, we set name to id
Config: &docker.Config{Image: "testimage"},
HostConfig: hostConfig,
State: docker.State{
Running: true,
Pid: os.Getpid(),
},
NetworkSettings: &docker.NetworkSettings{IPAddress: "1.2.3.4"},
err := f.popError("start")
if err == nil {
f.Container = &docker.Container{
ID: id,
Name: id, // For testing purpose, we set name to id
Config: &docker.Config{Image: "testimage"},
HostConfig: hostConfig,
State: docker.State{
Running: true,
Pid: os.Getpid(),
},
NetworkSettings: &docker.NetworkSettings{IPAddress: "1.2.3.4"},
}
}
return f.Err
return err
}
// StopContainer is a test-spy implementation of DockerInterface.StopContainer.
@ -194,23 +217,29 @@ func (f *FakeDockerClient) StopContainer(id string, timeout uint) error {
f.Lock()
defer f.Unlock()
f.called = append(f.called, "stop")
f.Stopped = append(f.Stopped, id)
var newList []docker.APIContainers
for _, container := range f.ContainerList {
if container.ID != id {
newList = append(newList, container)
err := f.popError("stop")
if err == nil {
f.Stopped = append(f.Stopped, id)
var newList []docker.APIContainers
for _, container := range f.ContainerList {
if container.ID != id {
newList = append(newList, container)
}
}
f.ContainerList = newList
}
f.ContainerList = newList
return f.Err
return err
}
func (f *FakeDockerClient) RemoveContainer(opts docker.RemoveContainerOptions) error {
f.Lock()
defer f.Unlock()
f.called = append(f.called, "remove")
f.Removed = append(f.Removed, opts.ID)
return f.Err
err := f.popError("remove")
if err == nil {
f.Removed = append(f.Removed, opts.ID)
}
return err
}
// Logs is a test-spy implementation of DockerInterface.Logs.
@ -219,7 +248,7 @@ func (f *FakeDockerClient) Logs(opts docker.LogsOptions) error {
f.Lock()
defer f.Unlock()
f.called = append(f.called, "logs")
return f.Err
return f.popError("logs")
}
// PullImage is a test-spy implementation of DockerInterface.PullImage.
@ -228,12 +257,15 @@ func (f *FakeDockerClient) PullImage(opts docker.PullImageOptions, auth docker.A
f.Lock()
defer f.Unlock()
f.called = append(f.called, "pull")
registry := opts.Registry
if len(registry) != 0 {
registry = registry + "/"
err := f.popError("pull")
if err == nil {
registry := opts.Registry
if len(registry) != 0 {
registry = registry + "/"
}
f.pulled = append(f.pulled, fmt.Sprintf("%s%s:%s", registry, opts.Repository, opts.Tag))
}
f.pulled = append(f.pulled, fmt.Sprintf("%s%s:%s", registry, opts.Repository, opts.Tag))
return f.Err
return err
}
func (f *FakeDockerClient) Version() (*docker.Env, error) {
@ -249,12 +281,16 @@ func (f *FakeDockerClient) StartExec(_ string, _ docker.StartExecOptions) error
}
func (f *FakeDockerClient) ListImages(opts docker.ListImagesOptions) ([]docker.APIImages, error) {
return f.Images, f.Err
err := f.popError("list_images")
return f.Images, err
}
func (f *FakeDockerClient) RemoveImage(image string) error {
f.RemovedImages.Insert(image)
return f.Err
err := f.popError("remove_image")
if err == nil {
f.RemovedImages.Insert(image)
}
return err
}
// FakeDockerPuller is a stub implementation of DockerPuller.
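
A short illustrative test (not part of this commit; the test name is made up, and it assumes the same package as FakeDockerClient) showing how the new per-function error injection behaves: popError returns an injected error exactly once, so the next call to the same method succeeds again.

package dockertools

import (
	"fmt"
	"testing"

	docker "github.com/fsouza/go-dockerclient"
)

func TestFakeDockerClientInjectedError(t *testing.T) {
	fake := &FakeDockerClient{Errors: map[string]error{"start": fmt.Errorf("injected start failure")}}
	// The first call returns the injected error; popError then removes the entry.
	if err := fake.StartContainer("c1", &docker.HostConfig{}); err == nil {
		t.Errorf("expected injected error, got nil")
	}
	// The second call succeeds because the injected error was consumed.
	if err := fake.StartContainer("c1", &docker.HostConfig{}); err != nil {
		t.Errorf("unexpected error on second call: %v", err)
	}
}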


@ -25,14 +25,21 @@ import (
"path"
"strconv"
"strings"
"sync"
"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
"github.com/GoogleCloudPlatform/kubernetes/pkg/capabilities"
"github.com/GoogleCloudPlatform/kubernetes/pkg/client/record"
kubecontainer "github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet/container"
"github.com/GoogleCloudPlatform/kubernetes/pkg/types"
"github.com/GoogleCloudPlatform/kubernetes/pkg/util"
"github.com/fsouza/go-dockerclient"
"github.com/golang/glog"
"github.com/golang/groupcache/lru"
)
const (
maxReasonCacheEntries = 200
)
// Implements kubecontainer.ContainerRunner.
@ -44,13 +51,60 @@ type DockerManager struct {
// TODO(yifan): PodInfraContainerImage can be unexported once
// we move createPodInfraContainer into dockertools.
PodInfraContainerImage string
// reasonCache stores the failure reason of the last container creation
// and/or start in a string, keyed by <pod_UID>_<container_name>. The goal
// is to propagate this reason to the container status. This endeavor is
// "best-effort" for two reasons:
// 1. The cache is not persisted.
// 2. We use an LRU cache to avoid extra garbage collection work. This
// means that some entries may be recycled before a pod has been
// deleted.
reasonCache stringCache
}
// Ensures DockerManager implements ContainerRunner.
var _ kubecontainer.ContainerRunner = new(DockerManager)
func NewDockerManager(client DockerInterface, recorder record.EventRecorder, podInfraContainerImage string) *DockerManager {
return &DockerManager{client: client, recorder: recorder, PodInfraContainerImage: podInfraContainerImage}
reasonCache := stringCache{cache: lru.New(maxReasonCacheEntries)}
return &DockerManager{
client: client,
recorder: recorder,
PodInfraContainerImage: podInfraContainerImage,
reasonCache: reasonCache}
}
// A cache which stores strings keyed by <pod_UID>_<container_name>.
type stringCache struct {
lock sync.RWMutex
cache *lru.Cache
}
func (self *stringCache) composeKey(uid types.UID, name string) string {
return fmt.Sprintf("%s_%s", uid, name)
}
func (self *stringCache) Add(uid types.UID, name string, value string) {
self.lock.Lock()
defer self.lock.Unlock()
self.cache.Add(self.composeKey(uid, name), value)
}
func (self *stringCache) Remove(uid types.UID, name string) {
self.lock.Lock()
defer self.lock.Unlock()
self.cache.Remove(self.composeKey(uid, name))
}
func (self *stringCache) Get(uid types.UID, name string) (string, bool) {
self.lock.RLock()
defer self.lock.RUnlock()
value, ok := self.cache.Get(self.composeKey(uid, name))
if ok {
return value.(string), ok
} else {
return "", ok
}
}
// GetKubeletDockerContainerLogs returns logs of a specific container. By
@ -118,7 +172,6 @@ func (self *DockerManager) inspectContainer(dockerID, containerName, tPath strin
ContainerID: DockerPrefix + dockerID,
}
waiting := true
if inspectResult.State.Running {
result.status.State.Running = &api.ContainerStateRunning{
StartedAt: util.NewTime(inspectResult.State.StartedAt),
@ -126,7 +179,6 @@ func (self *DockerManager) inspectContainer(dockerID, containerName, tPath strin
if containerName == PodInfraContainerName && inspectResult.NetworkSettings != nil {
result.ip = inspectResult.NetworkSettings.IPAddress
}
waiting = false
} else if !inspectResult.State.FinishedAt.IsZero() {
reason := ""
// Note: An application might handle OOMKilled gracefully.
@ -155,13 +207,8 @@ func (self *DockerManager) inspectContainer(dockerID, containerName, tPath strin
}
}
}
waiting = false
}
if waiting {
} else {
// TODO(dchen1107): Separate issue docker/docker#8294 was filed
// TODO(dchen1107): Need to figure out why we are still waiting
// Check any issue to run container
result.status.State.Waiting = &api.ContainerStateWaiting{
Reason: ErrContainerCannotRun.Error(),
}
@ -233,31 +280,27 @@ func (self *DockerManager) GetPodStatus(pod *api.Pod) (*api.PodStatus, error) {
return nil, result.err
}
// Add user container information
if dockerContainerName == PodInfraContainerName &&
result.status.State.Running != nil {
if dockerContainerName == PodInfraContainerName {
// Found network container
podStatus.PodIP = result.ip
if result.status.State.Running != nil {
podStatus.PodIP = result.ip
}
} else {
// Add user container information.
statuses[dockerContainerName] = result.status
}
}
if len(statuses) == 0 && podStatus.PodIP == "" {
return nil, ErrNoContainersInPod
}
// Not all containers expected are created, check if there are
// image related issues
if len(statuses) < len(manifest.Containers) {
for _, container := range manifest.Containers {
var containerStatus api.ContainerStatus
for _, container := range manifest.Containers {
if _, found := statuses[container.Name]; found {
continue
}
if status, found := statuses[container.Name]; found {
containerStatus = status
} else {
// The container has not been created yet. Check image is ready on
// the node or not.
// TODO: If we integrate DockerPuller into DockerManager, we can
// record the pull failure and eliminate the image checking below.
image := container.Image
// Check image is ready on the node or not
// TODO(dchen1107): docker/docker/issues/8365 to figure out if the image exists
_, err := self.client.InspectImage(image)
if err == nil {
@ -268,14 +311,15 @@ func (self *DockerManager) GetPodStatus(pod *api.Pod) (*api.PodStatus, error) {
containerStatus.State.Waiting = &api.ContainerStateWaiting{
Reason: fmt.Sprintf("Image: %s is not ready on the node", image),
}
} else {
containerStatus.State.Waiting = &api.ContainerStateWaiting{
Reason: "",
}
}
statuses[container.Name] = containerStatus
}
if containerStatus.State.Waiting != nil {
// For containers in the waiting state, fill in a specific reason if it is recorded.
if reason, ok := self.reasonCache.Get(uid, container.Name); ok {
containerStatus.State.Waiting.Reason = reason
}
}
statuses[container.Name] = containerStatus
}
podStatus.ContainerStatuses = make([]api.ContainerStatus, 0)
@ -304,6 +348,19 @@ func (self *DockerManager) GetRunningContainers(ids []string) ([]*docker.Contain
}
func (self *DockerManager) RunContainer(pod *api.Pod, container *api.Container, opts *kubecontainer.RunContainerOptions) (string, error) {
dockerID, err := self.runContainer(pod, container, opts)
if err != nil {
errString := err.Error()
if errString != "" {
self.reasonCache.Add(pod.UID, container.Name, errString)
} else {
self.reasonCache.Remove(pod.UID, container.Name)
}
}
return dockerID, err
}
func (self *DockerManager) runContainer(pod *api.Pod, container *api.Container, opts *kubecontainer.RunContainerOptions) (string, error) {
ref, err := kubecontainer.GenerateContainerRef(pod, container)
if err != nil {
glog.Errorf("Couldn't make a ref to pod %v, container %v: '%v'", pod.Name, container.Name, err)
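
For context on the "best-effort" caveat in the reasonCache comment earlier in this file: github.com/golang/groupcache/lru evicts the least recently used entry once maxEntries is exceeded, so a recorded failure reason can disappear before its pod is deleted. A standalone sketch (the tiny cache size and keys are illustrative):

package main

import (
	"fmt"

	"github.com/golang/groupcache/lru"
)

func main() {
	cache := lru.New(2) // keep at most two entries
	cache.Add("uid1_foo", "creation failure")
	cache.Add("uid2_foo", "image pull failure")
	cache.Add("uid3_foo", "start failure") // evicts the oldest entry, uid1_foo

	if _, ok := cache.Get("uid1_foo"); !ok {
		fmt.Println("uid1_foo was evicted; its failure reason is lost (best-effort)")
	}
}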


@ -1699,18 +1699,16 @@ func (kl *Kubelet) validateContainerStatus(podStatus *api.PodStatus, containerNa
func (kl *Kubelet) GetKubeletContainerLogs(podFullName, containerName, tail string, follow bool, stdout, stderr io.Writer) error {
podStatus, err := kl.GetPodStatus(podFullName)
if err != nil {
if err == dockertools.ErrNoContainersInPod {
return fmt.Errorf("pod %q not found\n", podFullName)
} else {
return fmt.Errorf("failed to get status for pod %q - %v", podFullName, err)
}
return fmt.Errorf("failed to get status for pod %q - %v", podFullName, err)
}
if err := kl.validatePodPhase(&podStatus); err != nil {
// No log is available if pod is not in a "known" phase (e.g. Unknown).
return err
}
dockerContainerID, err := kl.validateContainerStatus(&podStatus, containerName)
if err != nil {
// No log is available if the container status is missing or is in the
// waiting state.
return err
}
return kl.containerManager.GetKubeletDockerContainerLogs(dockerContainerID, tail, follow, stdout, stderr)


@ -69,7 +69,7 @@ type TestKubelet struct {
}
func newTestKubelet(t *testing.T) *TestKubelet {
fakeDocker := &dockertools.FakeDockerClient{RemovedImages: util.StringSet{}}
fakeDocker := &dockertools.FakeDockerClient{Errors: make(map[string]error), RemovedImages: util.StringSet{}}
fakeDockerCache := dockertools.NewFakeDockerCache(fakeDocker)
fakeRecorder := &record.FakeRecorder{}
fakeKubeClient := &testclient.Fake{}
@ -366,7 +366,7 @@ func TestKillContainerWithError(t *testing.T) {
},
}
fakeDocker := &dockertools.FakeDockerClient{
Err: fmt.Errorf("sample error"),
Errors: make(map[string]error),
ContainerList: append([]docker.APIContainers{}, containers...),
}
testKubelet := newTestKubelet(t)
@ -376,6 +376,7 @@ func TestKillContainerWithError(t *testing.T) {
}
kubelet.dockerClient = fakeDocker
c := apiContainerToContainer(fakeDocker.ContainerList[0])
fakeDocker.Errors["stop"] = fmt.Errorf("sample error")
err := kubelet.killContainer(&c)
if err == nil {
t.Errorf("expected error, found nil")
@ -512,7 +513,7 @@ func TestSyncPodsWithTerminationLog(t *testing.T) {
}
waitGroup.Wait()
verifyCalls(t, fakeDocker, []string{
"list", "list", "list", "create", "start", "inspect_container", "create", "start", "list", "inspect_container", "inspect_container"})
"list", "list", "list", "inspect_image", "create", "start", "inspect_container", "create", "start", "list", "inspect_container", "inspect_container"})
fakeDocker.Lock()
parts := strings.Split(fakeDocker.Container.HostConfig.Binds[0], ":")
@ -564,7 +565,7 @@ func TestSyncPodsCreatesNetAndContainer(t *testing.T) {
waitGroup.Wait()
verifyCalls(t, fakeDocker, []string{
"list", "list", "list", "create", "start", "inspect_container", "create", "start", "list", "inspect_container", "inspect_container"})
"list", "list", "list", "inspect_image", "create", "start", "inspect_container", "create", "start", "list", "inspect_container", "inspect_container"})
fakeDocker.Lock()
@ -619,7 +620,7 @@ func TestSyncPodsCreatesNetAndContainerPullsImage(t *testing.T) {
waitGroup.Wait()
verifyCalls(t, fakeDocker, []string{
"list", "list", "list", "create", "start", "inspect_container", "create", "start", "list", "inspect_container", "inspect_container"})
"list", "list", "list", "inspect_image", "create", "start", "inspect_container", "create", "start", "list", "inspect_container", "inspect_container"})
fakeDocker.Lock()
@ -671,7 +672,7 @@ func TestSyncPodsWithPodInfraCreatesContainer(t *testing.T) {
waitGroup.Wait()
verifyCalls(t, fakeDocker, []string{
"list", "list", "list", "inspect_container", "create", "start", "list", "inspect_container", "inspect_container"})
"list", "list", "list", "inspect_container", "inspect_image", "create", "start", "list", "inspect_container", "inspect_container"})
fakeDocker.Lock()
if len(fakeDocker.Created) != 1 ||
@ -730,7 +731,7 @@ func TestSyncPodsWithPodInfraCreatesContainerCallsHandler(t *testing.T) {
waitGroup.Wait()
verifyCalls(t, fakeDocker, []string{
"list", "list", "list", "inspect_container", "create", "start", "list", "inspect_container", "inspect_container"})
"list", "list", "list", "inspect_container", "inspect_image", "create", "start", "list", "inspect_container", "inspect_container"})
fakeDocker.Lock()
if len(fakeDocker.Created) != 1 ||
@ -948,7 +949,7 @@ func TestSyncPodDeletesDuplicate(t *testing.T) {
t.Errorf("unexpected error: %v", err)
}
verifyCalls(t, fakeDocker, []string{"list", "stop", "list"})
verifyCalls(t, fakeDocker, []string{"list", "inspect_image", "stop", "list", "inspect_image"})
// Expect one of the duplicates to be killed.
if len(fakeDocker.Stopped) != 1 || (fakeDocker.Stopped[0] != "1234" && fakeDocker.Stopped[0] != "4567") {
t.Errorf("Wrong containers were stopped: %v", fakeDocker.Stopped)
@ -990,8 +991,8 @@ func TestSyncPodBadHash(t *testing.T) {
t.Errorf("unexpected error: %v", err)
}
//verifyCalls(t, fakeDocker, []string{"list", "stop", "list", "create", "start", "stop", "create", "start", "inspect_container"})
verifyCalls(t, fakeDocker, []string{"list", "stop", "stop", "create", "start", "inspect_container", "create", "start", "list", "inspect_container", "inspect_container"})
verifyCalls(t, fakeDocker, []string{"list", "inspect_image", "stop", "stop", "create", "start",
"inspect_container", "create", "start", "list", "inspect_container", "inspect_container"})
// A map iteration is used to delete containers, so must not depend on
// order here.
@ -1045,7 +1046,7 @@ func TestSyncPodUnhealthy(t *testing.T) {
t.Errorf("unexpected error: %v", err)
}
verifyCalls(t, fakeDocker, []string{"list", "stop", "create", "start", "list", "inspect_container"})
verifyCalls(t, fakeDocker, []string{"list", "inspect_image", "stop", "create", "start", "list", "inspect_container"})
// A map iteration is used to delete containers, so must not depend on
// order here.
@ -1638,7 +1639,7 @@ func TestSyncPodEventHandlerFails(t *testing.T) {
t.Errorf("unexpected error: %v", err)
}
verifyCalls(t, fakeDocker, []string{"list", "create", "start", "stop", "list"})
verifyCalls(t, fakeDocker, []string{"list", "inspect_image", "create", "start", "stop", "list", "inspect_image"})
if len(fakeDocker.Stopped) != 1 {
t.Errorf("Wrong containers were stopped: %v", fakeDocker.Stopped)
@ -3792,3 +3793,44 @@ func TestGetPodStatusWithLastTermination(t *testing.T) {
}
}
}
func TestGetPodCreationFailureReason(t *testing.T) {
testKubelet := newTestKubelet(t)
kubelet := testKubelet.kubelet
fakeDocker := testKubelet.fakeDocker
failureReason := "creation failure"
fakeDocker.Errors["create"] = fmt.Errorf("%s", failureReason)
fakeDocker.ContainerList = []docker.APIContainers{}
pod := api.Pod{
ObjectMeta: api.ObjectMeta{
UID: "12345678",
Name: "bar",
Namespace: "new",
},
Spec: api.PodSpec{
Containers: []api.Container{
{Name: "foo"},
},
},
}
pods := []api.Pod{pod}
kubelet.podManager.SetPods(pods)
_, err := kubelet.runContainer(&pod, &pod.Spec.Containers[0], make(map[string]volume.Volume), "", "")
if err == nil {
t.Errorf("expected error, found nil")
}
status, err := kubelet.GetPodStatus(kubecontainer.GetPodFullName(&pod))
if err != nil {
t.Errorf("unexpected error %v", err)
}
if len(status.ContainerStatuses) < 1 {
t.Errorf("expected 1 container status, got %d", len(status.ContainerStatuses))
} else {
state := status.ContainerStatuses[0].State
if state.Waiting == nil {
t.Errorf("expected waiting state, got %#v", state)
} else if state.Waiting.Reason != failureReason {
t.Errorf("expected reason %q, got %q", state.Waiting.Reason)
}
}
}