From e4acad7afb0031ef1c0e03ce899e862a730a087b Mon Sep 17 00:00:00 2001
From: Vishnu kannan
Date: Wed, 7 Sep 2016 18:32:38 -0700
Subject: [PATCH] Fix oom-score-adj policy in kubelet.

Docker daemon and kubelet need to be protected by setting oom-score-adj to -999.

Signed-off-by: Vishnu kannan
---
 Makefile                                  |   2 +-
 docs/design/resource-qos.md               |   1 +
 docs/proposals/kubelet-systemd.md         |   2 +-
 pkg/kubelet/cm/container_manager_linux.go |  26 ++-
 pkg/kubelet/qos/policy.go                 |   9 +-
 pkg/util/oom/oom_linux.go                 |  21 +--
 test/e2e_node/container_manager_test.go   | 209 ++++++++++++++++++----
 test/e2e_node/kubelet_test.go             |  47 +++++
 test/e2e_node/resource_collector.go       |  55 +++---
 9 files changed, 274 insertions(+), 98 deletions(-)

diff --git a/Makefile b/Makefile
index 69f4a611d0..af0950c6b1 100644
--- a/Makefile
+++ b/Makefile
@@ -156,7 +156,7 @@ test-e2e: ginkgo generated_files
 # DELETE_INSTANCES: For REMOTE=true only. Delete any instances created as
 #   part of this test run. Defaults to false.
 # ARTIFACTS: For REMOTE=true only. Local directory to scp test artifacts into
-#   from the remote hosts. Defaults to ""/tmp/_artifacts".
+#   from the remote hosts. Defaults to "/tmp/_artifacts".
 # REPORT: For REMOTE=false only. Local directory to write junit xml results
 #   to. Defaults to "/tmp/".
 # CLEANUP: For REMOTE=true only. If false, do not stop processes or delete
diff --git a/docs/design/resource-qos.md b/docs/design/resource-qos.md
index b2a43c9793..6a8e8ab28a 100644
--- a/docs/design/resource-qos.md
+++ b/docs/design/resource-qos.md
@@ -226,6 +226,7 @@ Pod OOM score configuration
 *Pod infra containers* or *Special Pod init process*
   - OOM_SCORE_ADJ: -998
+
 *Kubelet, Docker*
   - OOM_SCORE_ADJ: -999 (won’t be OOM killed)
   - Hack, because these critical tasks might die if they conflict with guaranteed containers. In the future, we should place all user-pods into a separate cgroup, and set a limit on the memory they can consume.
diff --git a/docs/proposals/kubelet-systemd.md b/docs/proposals/kubelet-systemd.md
index d7c42cf667..1b7c0373e0 100644
--- a/docs/proposals/kubelet-systemd.md
+++ b/docs/proposals/kubelet-systemd.md
@@ -365,7 +365,7 @@ The `kubelet` will set the following:
 The `kubelet` at bootstrapping will set the `oom_score_adj` value for Kubernetes daemons, and any dependent container-runtime daemons.
-If `container-runtime` is set to `docker`, then set its `oom_score_adj=-900` +If `container-runtime` is set to `docker`, then set its `oom_score_adj=-999` ## Implementation concerns diff --git a/pkg/kubelet/cm/container_manager_linux.go b/pkg/kubelet/cm/container_manager_linux.go index 22a6c99f79..6bddb6b986 100644 --- a/pkg/kubelet/cm/container_manager_linux.go +++ b/pkg/kubelet/cm/container_manager_linux.go @@ -326,6 +326,7 @@ func (cm *containerManagerImpl) setupNode() error { systemContainers := []*systemContainer{} if cm.ContainerRuntime == "docker" { + dockerVersion := getDockerVersion(cm.cadvisorInterface) if cm.RuntimeCgroupsName != "" { cont := newSystemCgroups(cm.RuntimeCgroupsName) var capacity = api.ResourceList{} @@ -351,13 +352,16 @@ func (cm *containerManagerImpl) setupNode() error { }, }, } - dockerVersion := getDockerVersion(cm.cadvisorInterface) cont.ensureStateFunc = func(manager *fs.Manager) error { - return ensureDockerInContainer(dockerVersion, -900, dockerContainer) + return ensureDockerInContainer(dockerVersion, qos.DockerOOMScoreAdj, dockerContainer) } systemContainers = append(systemContainers, cont) } else { cm.periodicTasks = append(cm.periodicTasks, func() { + if err := ensureDockerInContainer(dockerVersion, qos.DockerOOMScoreAdj, nil); err != nil { + glog.Error(err) + return + } cont, err := getContainerNameForProcess(dockerProcessName, dockerPidFile) if err != nil { glog.Error(err) @@ -401,11 +405,15 @@ func (cm *containerManagerImpl) setupNode() error { }, } cont.ensureStateFunc = func(_ *fs.Manager) error { - return manager.Apply(os.Getpid()) + return ensureProcessInContainerWithOOMScore(os.Getpid(), qos.KubeletOOMScoreAdj, &manager) } systemContainers = append(systemContainers, cont) } else { cm.periodicTasks = append(cm.periodicTasks, func() { + if err := ensureProcessInContainerWithOOMScore(os.Getpid(), qos.KubeletOOMScoreAdj, nil); err != nil { + glog.Error(err) + return + } cont, err := getContainer(os.Getpid()) if err != nil { glog.Errorf("failed to find cgroups of kubelet - %v", err) @@ -578,7 +586,7 @@ func ensureDockerInContainer(dockerVersion semver.Version, oomScoreAdj int, mana // Move if the pid is not already in the desired container. for _, pid := range pids { - if err := ensureProcessInContainer(pid, oomScoreAdj, manager); err != nil { + if err := ensureProcessInContainerWithOOMScore(pid, oomScoreAdj, manager); err != nil { errs = append(errs, fmt.Errorf("errors moving %q pid: %v", proc.name, err)) } } @@ -586,12 +594,13 @@ func ensureDockerInContainer(dockerVersion semver.Version, oomScoreAdj int, mana return utilerrors.NewAggregate(errs) } -func ensureProcessInContainer(pid int, oomScoreAdj int, manager *fs.Manager) error { +func ensureProcessInContainerWithOOMScore(pid int, oomScoreAdj int, manager *fs.Manager) error { if runningInHost, err := isProcessRunningInHost(pid); err != nil { // Err on the side of caution. Avoid moving the docker daemon unless we are able to identify its context. return err } else if !runningInHost { // Process is running inside a container. Don't touch that. 
+ glog.V(2).Infof("pid %d is running in the host namespaces", pid) return nil } @@ -601,17 +610,18 @@ func ensureProcessInContainer(pid int, oomScoreAdj int, manager *fs.Manager) err errs = append(errs, fmt.Errorf("failed to find container of PID %d: %v", pid, err)) } - if cont != manager.Cgroups.Name { + if manager != nil && cont != manager.Cgroups.Name { err = manager.Apply(pid) if err != nil { - errs = append(errs, fmt.Errorf("failed to move PID %d (in %q) to %q", pid, cont, manager.Cgroups.Name)) + errs = append(errs, fmt.Errorf("failed to move PID %d (in %q) to %q: %v", pid, cont, manager.Cgroups.Name, err)) } } // Also apply oom-score-adj to processes oomAdjuster := oom.NewOOMAdjuster() if err := oomAdjuster.ApplyOOMScoreAdj(pid, oomScoreAdj); err != nil { - errs = append(errs, fmt.Errorf("failed to apply oom score %d to PID %d", oomScoreAdj, pid)) + glog.V(3).Infof("Failed to apply oom_score_adj %d for pid %d: %v", oomScoreAdj, pid, err) + errs = append(errs, fmt.Errorf("failed to apply oom score %d to PID %d: %v", oomScoreAdj, pid, err)) } return utilerrors.NewAggregate(errs) } diff --git a/pkg/kubelet/qos/policy.go b/pkg/kubelet/qos/policy.go index ad696f3610..7013f712f8 100644 --- a/pkg/kubelet/qos/policy.go +++ b/pkg/kubelet/qos/policy.go @@ -21,8 +21,9 @@ import ( ) const ( - PodInfraOOMAdj int = -999 + PodInfraOOMAdj int = -998 KubeletOOMScoreAdj int = -999 + DockerOOMScoreAdj int = -999 KubeProxyOOMScoreAdj int = -999 guaranteedOOMScoreAdj int = -998 besteffortOOMScoreAdj int = 1000 @@ -53,10 +54,10 @@ func GetContainerOOMScoreAdjust(pod *api.Pod, container *api.Container, memoryCa // Note that this is a heuristic, it won't work if a container has many small processes. memoryRequest := container.Resources.Requests.Memory().Value() oomScoreAdjust := 1000 - (1000*memoryRequest)/memoryCapacity - // A guaranteed pod using 100% of memory can have an OOM score of 1. Ensure + // A guaranteed pod using 100% of memory can have an OOM score of 10. Ensure // that burstable pods have a higher OOM score adjustment. - if oomScoreAdjust < 2 { - return 2 + if int(oomScoreAdjust) < (1000 + guaranteedOOMScoreAdj) { + return (1000 + guaranteedOOMScoreAdj) } // Give burstable pods a higher chance of survival over besteffort pods. if int(oomScoreAdjust) == besteffortOOMScoreAdj { diff --git a/pkg/util/oom/oom_linux.go b/pkg/util/oom/oom_linux.go index 0bb8703874..6b9c93cf17 100644 --- a/pkg/util/oom/oom_linux.go +++ b/pkg/util/oom/oom_linux.go @@ -20,6 +20,7 @@ package oom import ( "fmt" + "io/ioutil" "os" "path" "strconv" @@ -65,28 +66,24 @@ func applyOOMScoreAdj(pid int, oomScoreAdj int) error { maxTries := 2 oomScoreAdjPath := path.Join("/proc", pidStr, "oom_score_adj") value := strconv.Itoa(oomScoreAdj) + glog.V(4).Infof("attempting to set %q to %q", oomScoreAdjPath, value) var err error for i := 0; i < maxTries; i++ { - f, err := os.Open(oomScoreAdjPath) + err = ioutil.WriteFile(oomScoreAdjPath, []byte(value), 0700) if err != nil { if os.IsNotExist(err) { + glog.V(2).Infof("%q does not exist", oomScoreAdjPath) return os.ErrNotExist } - err = fmt.Errorf("failed to apply oom-score-adj to pid %d (%v)", pid, err) - continue - } - if _, err := f.Write([]byte(value)); err != nil { - // we can ignore the return value of f.Close() here. 
-			f.Close()
-			err = fmt.Errorf("failed to apply oom-score-adj to pid %d (%v)", pid, err)
-			continue
-		}
-		if err = f.Close(); err != nil {
-			err = fmt.Errorf("failed to apply oom-score-adj to pid %d (%v)", pid, err)
+
+			glog.V(3).Info(err)
 			continue
 		}
 		return nil
 	}
+	if err != nil {
+		glog.V(2).Infof("failed to set %q to %q: %v", oomScoreAdjPath, value, err)
+	}
 	return err
 }
diff --git a/test/e2e_node/container_manager_test.go b/test/e2e_node/container_manager_test.go
index ac8e659e02..e5fd987c48 100644
--- a/test/e2e_node/container_manager_test.go
+++ b/test/e2e_node/container_manager_test.go
@@ -18,9 +18,14 @@ package e2e_node
 
 import (
 	"fmt"
+	"os/exec"
+	"path"
+	"strconv"
+	"strings"
 	"time"
 
 	"k8s.io/kubernetes/pkg/api"
+	"k8s.io/kubernetes/pkg/api/resource"
 	"k8s.io/kubernetes/pkg/util/uuid"
 	"k8s.io/kubernetes/test/e2e/framework"
 
@@ -28,63 +33,193 @@ import (
 	. "github.com/onsi/gomega"
 )
 
-var _ = framework.KubeDescribe("Kubelet Container Manager", func() {
+const (
+	kubeletProcessName = "kubelet"
+)
+
+func getOOMScoreForPid(pid int) (int, error) {
+	procfsPath := path.Join("/proc", strconv.Itoa(pid), "oom_score_adj")
+	out, err := exec.Command("sudo", "cat", procfsPath).CombinedOutput()
+	if err != nil {
+		return 0, err
+	}
+	return strconv.Atoi(strings.TrimSpace(string(out)))
+}
+
+func validateOOMScoreAdjSetting(pid int, expectedOOMScoreAdj int) error {
+	oomScore, err := getOOMScoreForPid(pid)
+	if err != nil {
+		return fmt.Errorf("failed to get oom_score_adj for %d: %v", pid, err)
+	}
+	if expectedOOMScoreAdj != oomScore {
+		return fmt.Errorf("expected pid %d's oom_score_adj to be %d; found %d", pid, expectedOOMScoreAdj, oomScore)
+	}
+	return nil
+}
+
+func validateOOMScoreAdjSettingIsInRange(pid int, expectedMinOOMScoreAdj, expectedMaxOOMScoreAdj int) error {
+	oomScore, err := getOOMScoreForPid(pid)
+	if err != nil {
+		return fmt.Errorf("failed to get oom_score_adj for %d", pid)
+	}
+	if oomScore < expectedMinOOMScoreAdj {
+		return fmt.Errorf("expected pid %d's oom_score_adj to be >= %d; found %d", pid, expectedMinOOMScoreAdj, oomScore)
+	}
+	if oomScore >= expectedMaxOOMScoreAdj {
+		return fmt.Errorf("expected pid %d's oom_score_adj to be < %d; found %d", pid, expectedMaxOOMScoreAdj, oomScore)
+	}
+	return nil
+}
+
+var _ = framework.KubeDescribe("Kubelet Container Manager [Serial]", func() {
 	f := framework.NewDefaultFramework("kubelet-container-manager")
-	var podClient *framework.PodClient
-	BeforeEach(func() {
-		podClient = f.PodClient()
-	})
-
-	Describe("oom score adjusting", func() {
-		Context("when scheduling a busybox command that always fails in a pod", func() {
-			var podName string
-
-			BeforeEach(func() {
-				podName = "bin-false" + string(uuid.NewUUID())
+	Describe("Validate OOM score adjustments", func() {
+		Context("once the node is setup", func() {
+			It("docker daemon's oom-score-adj should be -999", func() {
+				dockerPids, err := getPidsForProcess(dockerProcessName, dockerPidFile)
+				Expect(err).To(BeNil(), "failed to get list of docker daemon pids")
+				for _, pid := range dockerPids {
+					Eventually(func() error {
+						return validateOOMScoreAdjSetting(pid, -999)
+					}, 5*time.Minute, 30*time.Second).Should(BeNil())
+				}
+			})
+			It("Kubelet's oom-score-adj should be -999", func() {
+				kubeletPids, err := getPidsForProcess(kubeletProcessName, "")
+				Expect(err).To(BeNil(), "failed to get list of kubelet pids")
+				Expect(len(kubeletPids)).To(Equal(1), "expected only one kubelet process; found %d", len(kubeletPids))
+				Eventually(func() error {
+					return validateOOMScoreAdjSetting(kubeletPids[0], -999)
+				}, 5*time.Minute, 30*time.Second).Should(BeNil())
+			})
+			It("pod infra containers oom-score-adj should be -998 and best effort container's should be 1000", func() {
+				var err error
+				podClient := f.PodClient()
+				podName := "besteffort" + string(uuid.NewUUID())
 				podClient.Create(&api.Pod{
 					ObjectMeta: api.ObjectMeta{
 						Name: podName,
 					},
 					Spec: api.PodSpec{
-						// Don't restart the Pod since it is expected to exit
-						RestartPolicy: api.RestartPolicyNever,
 						Containers: []api.Container{
 							{
-								Image:   ImageRegistry[busyBoxImage],
-								Name:    podName,
-								Command: []string{"/bin/false"},
+								Image: ImageRegistry[serveHostnameImage],
+								Name:  podName,
 							},
 						},
 					},
 				})
-			})
-
-			It("should have an error terminated reason", func() {
+				var pausePids []int
+				By("checking infra container's oom-score-adj")
 				Eventually(func() error {
-					podData, err := podClient.Get(podName)
+					pausePids, err = getPidsForProcess("pause", "")
 					if err != nil {
-						return err
+						return fmt.Errorf("failed to get list of pause pids: %v", err)
 					}
-					if len(podData.Status.ContainerStatuses) != 1 {
-						return fmt.Errorf("expected only one container in the pod %q", podName)
-					}
-					contTerminatedState := podData.Status.ContainerStatuses[0].State.Terminated
-					if contTerminatedState == nil {
-						return fmt.Errorf("expected state to be terminated. Got pod status: %+v", podData.Status)
-					}
-					if contTerminatedState.Reason != "Error" {
-						return fmt.Errorf("expected terminated state reason to be error. Got %+v", contTerminatedState)
+					for _, pid := range pausePids {
+						if err := validateOOMScoreAdjSetting(pid, -998); err != nil {
+							return err
+						}
 					}
 					return nil
-				}, time.Minute, time.Second*4).Should(BeNil())
-			})
+				}, 2*time.Minute, time.Second*4).Should(BeNil())
+				var shPids []int
+				By("checking besteffort container's oom-score-adj")
+				Eventually(func() error {
+					shPids, err = getPidsForProcess("serve_hostname", "")
+					if err != nil {
+						return fmt.Errorf("failed to get list of serve hostname process pids: %v", err)
+					}
+					if len(shPids) != 1 {
+						return fmt.Errorf("expected only one serve_hostname process; found %d", len(shPids))
+					}
+					return validateOOMScoreAdjSetting(shPids[0], 1000)
+				}, 2*time.Minute, time.Second*4).Should(BeNil())
 
-			It("should be possible to delete", func() {
-				err := podClient.Delete(podName, &api.DeleteOptions{})
-				Expect(err).To(BeNil(), fmt.Sprintf("Error deleting Pod %v", err))
+			})
+			It("guaranteed container's oom-score-adj should be -998", func() {
+				podClient := f.PodClient()
+				podName := "guaranteed" + string(uuid.NewUUID())
+				podClient.Create(&api.Pod{
+					ObjectMeta: api.ObjectMeta{
+						Name: podName,
+					},
+					Spec: api.PodSpec{
+						Containers: []api.Container{
+							{
+								Image: ImageRegistry[nginxImage],
+								Name:  podName,
+								Resources: api.ResourceRequirements{
+									Limits: api.ResourceList{
+										"cpu":    resource.MustParse("100m"),
+										"memory": resource.MustParse("50Mi"),
+									},
+								},
+							},
+						},
+					},
+				})
+				var (
+					ngPids []int
+					err    error
+				)
+				Eventually(func() error {
+					ngPids, err = getPidsForProcess("nginx", "")
+					if err != nil {
+						return fmt.Errorf("failed to get list of nginx process pids: %v", err)
+					}
+					for _, pid := range ngPids {
+						if err := validateOOMScoreAdjSetting(pid, -998); err != nil {
+							return err
+						}
+					}
+
+					return nil
+				}, 2*time.Minute, time.Second*4).Should(BeNil())
+
+			})
+			It("burstable container's oom-score-adj should be between [2, 1000)", func() {
+				podClient := f.PodClient()
+				podName := "burstable" + string(uuid.NewUUID())
+				podClient.Create(&api.Pod{
+					ObjectMeta: api.ObjectMeta{
+						Name: podName,
+					},
+					Spec: api.PodSpec{
+						Containers: []api.Container{
+							{
+								Image: ImageRegistry[testWebServer],
+								Name:  podName,
+								Resources: api.ResourceRequirements{
+									Requests: api.ResourceList{
+										"cpu":    resource.MustParse("100m"),
+										"memory": resource.MustParse("50Mi"),
+									},
+								},
+							},
+						},
+					},
+				})
+				var (
+					wsPids []int
+					err    error
+				)
+				Eventually(func() error {
+					wsPids, err = getPidsForProcess("test-webserver", "")
+					if err != nil {
+						return fmt.Errorf("failed to get list of test-webserver process pids: %v", err)
+					}
+					for _, pid := range wsPids {
+						if err := validateOOMScoreAdjSettingIsInRange(pid, 2, 1000); err != nil {
+							return err
+						}
+					}
+					return nil
+				}, 2*time.Minute, time.Second*4).Should(BeNil())
+
+				// TODO: Test the oom-score-adj logic for burstable more accurately.
 			})
 		})
 	})
-	})
diff --git a/test/e2e_node/kubelet_test.go b/test/e2e_node/kubelet_test.go
index 6f8657e931..445dc52196 100644
--- a/test/e2e_node/kubelet_test.go
+++ b/test/e2e_node/kubelet_test.go
@@ -75,7 +75,54 @@ var _ = framework.KubeDescribe("Kubelet", func() {
 				}, time.Minute, time.Second*4).Should(Equal("Hello World\n"))
 			})
 		})
+		Context("when scheduling a busybox command that always fails in a pod", func() {
+			var podName string
+			BeforeEach(func() {
+				podName = "bin-false" + string(uuid.NewUUID())
+				podClient.Create(&api.Pod{
+					ObjectMeta: api.ObjectMeta{
+						Name: podName,
+					},
+					Spec: api.PodSpec{
+						// Don't restart the Pod since it is expected to exit
+						RestartPolicy: api.RestartPolicyNever,
+						Containers: []api.Container{
+							{
+								Image:   ImageRegistry[busyBoxImage],
+								Name:    podName,
+								Command: []string{"/bin/false"},
+							},
+						},
+					},
+				})
+			})
+
+			It("should have an error terminated reason", func() {
+				Eventually(func() error {
+					podData, err := podClient.Get(podName)
+					if err != nil {
+						return err
+					}
+					if len(podData.Status.ContainerStatuses) != 1 {
+						return fmt.Errorf("expected only one container in the pod %q", podName)
+					}
+					contTerminatedState := podData.Status.ContainerStatuses[0].State.Terminated
+					if contTerminatedState == nil {
+						return fmt.Errorf("expected state to be terminated. Got pod status: %+v", podData.Status)
+					}
+					if contTerminatedState.Reason != "Error" {
+						return fmt.Errorf("expected terminated state reason to be error. Got %+v", contTerminatedState)
Got %+v", contTerminatedState) + } + return nil + }, time.Minute, time.Second*4).Should(BeNil()) + }) + + It("should be possible to delete", func() { + err := podClient.Delete(podName, &api.DeleteOptions{}) + Expect(err).To(BeNil(), fmt.Sprintf("Error deleting Pod %v", err)) + }) + }) Context("when scheduling a read only busybox container", func() { podName := "busybox-readonly-fs" + string(uuid.NewUUID()) It("it should not write to root filesystem", func() { diff --git a/test/e2e_node/resource_collector.go b/test/e2e_node/resource_collector.go index f7498c6e37..0ccdfc1fd2 100644 --- a/test/e2e_node/resource_collector.go +++ b/test/e2e_node/resource_collector.go @@ -24,7 +24,6 @@ import ( "io/ioutil" "log" "os" - "os/exec" "sort" "strconv" "strings" @@ -38,6 +37,7 @@ import ( "k8s.io/kubernetes/pkg/api" "k8s.io/kubernetes/pkg/kubelet/api/v1alpha1/stats" "k8s.io/kubernetes/pkg/labels" + "k8s.io/kubernetes/pkg/util/procfs" "k8s.io/kubernetes/pkg/util/runtime" "k8s.io/kubernetes/pkg/util/uuid" "k8s.io/kubernetes/pkg/util/wait" @@ -450,19 +450,16 @@ const ( containerdPidFile = "/run/docker/libcontainerd/docker-containerd.pid" ) -func getContainerNameForProcess(name, pidFile string) (string, error) { - pids, err := getPidsForProcess(name, pidFile) - if err != nil { - return "", fmt.Errorf("failed to detect process id for %q - %v", name, err) +func getPidsForProcess(name, pidFile string) ([]int, error) { + if len(pidFile) > 0 { + if pid, err := getPidFromPidFile(pidFile); err == nil { + return []int{pid}, nil + } else { + // log the error and fall back to pidof + runtime.HandleError(err) + } } - if len(pids) == 0 { - return "", nil - } - cont, err := getContainer(pids[0]) - if err != nil { - return "", err - } - return cont, nil + return procfs.PidOf(name) } func getPidFromPidFile(pidFile string) (int, error) { @@ -485,31 +482,19 @@ func getPidFromPidFile(pidFile string) (int, error) { return pid, nil } -func getPidsForProcess(name, pidFile string) ([]int, error) { - if len(pidFile) > 0 { - if pid, err := getPidFromPidFile(pidFile); err == nil { - return []int{pid}, nil - } else { - // log the error and fall back to pidof - runtime.HandleError(err) - } - } - - out, err := exec.Command("pidof", name).Output() +func getContainerNameForProcess(name, pidFile string) (string, error) { + pids, err := getPidsForProcess(name, pidFile) if err != nil { - return []int{}, fmt.Errorf("failed to find pid of %q: %v", name, err) + return "", fmt.Errorf("failed to detect process id for %q - %v", name, err) } - - // The output of pidof is a list of pids. - pids := []int{} - for _, pidStr := range strings.Split(strings.TrimSpace(string(out)), " ") { - pid, err := strconv.Atoi(pidStr) - if err != nil { - continue - } - pids = append(pids, pid) + if len(pids) == 0 { + return "", nil } - return pids, nil + cont, err := getContainer(pids[0]) + if err != nil { + return "", err + } + return cont, nil } // getContainer returns the cgroup associated with the specified pid.