Fix oom-score-adj policy in kubelet.

The Docker daemon and the kubelet need to be protected by setting their oom-score-adj to -999.

Signed-off-by: Vishnu kannan <vishnuk@google.com>
Vishnu kannan 2016-09-07 18:32:38 -07:00
parent f59f4aecfa
commit e4acad7afb
9 changed files with 274 additions and 98 deletions


@@ -156,7 +156,7 @@ test-e2e: ginkgo generated_files
# DELETE_INSTANCES: For REMOTE=true only. Delete any instances created as
# part of this test run. Defaults to false.
# ARTIFACTS: For REMOTE=true only. Local directory to scp test artifacts into
# from the remote hosts. Defaults to ""/tmp/_artifacts".
# from the remote hosts. Defaults to "/tmp/_artifacts".
# REPORT: For REMOTE=false only. Local directory to write junit xml results
# to. Defaults to "/tmp/".
# CLEANUP: For REMOTE=true only. If false, do not stop processes or delete


@@ -226,6 +226,7 @@ Pod OOM score configuration
*Pod infra containers* or *Special Pod init process*
- OOM_SCORE_ADJ: -998
*Kubelet, Docker*
- OOM_SCORE_ADJ: -999 (won't be OOM killed)
- Hack, because these critical tasks might die if they conflict with guaranteed containers. In the future, we should place all user-pods into a separate cgroup, and set a limit on the memory they can consume.
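These values can be verified directly on a node: the kernel exposes each process's score adjustment at /proc/&lt;pid&gt;/oom_score_adj. A minimal read-side sketch (not part of this change; the helper name is illustrative):

package main

import (
	"fmt"
	"io/ioutil"
	"os"
	"path"
	"strconv"
	"strings"
)

// readOOMScoreAdj returns the oom_score_adj the kernel has recorded for pid.
func readOOMScoreAdj(pid int) (int, error) {
	b, err := ioutil.ReadFile(path.Join("/proc", strconv.Itoa(pid), "oom_score_adj"))
	if err != nil {
		return 0, err
	}
	return strconv.Atoi(strings.TrimSpace(string(b)))
}

func main() {
	// Inspect our own entry as an example; any pid works with enough privilege.
	adj, err := readOOMScoreAdj(os.Getpid())
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	fmt.Println(adj) // 0 by default; -999 for kubelet and Docker under this policy
}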


@@ -365,7 +365,7 @@ The `kubelet` will set the following:
The `kubelet` at bootstrapping will set the `oom_score_adj` value for Kubernetes
daemons, and any dependent container-runtime daemons.
If `container-runtime` is set to `docker`, then set its `oom_score_adj=-900`
If `container-runtime` is set to `docker`, then set its `oom_score_adj=-999`
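Concretely, setting a daemon's `oom_score_adj` reduces to a procfs write, which is what the kubelet's oom package does further down in this diff. A minimal write-side sketch (not part of this change; lowering the score for another process requires privilege):

package main

import (
	"fmt"
	"io/ioutil"
	"os"
	"path"
	"strconv"
)

// setOOMScoreAdj writes the desired score adjustment into a process's procfs entry.
func setOOMScoreAdj(pid, value int) error {
	p := path.Join("/proc", strconv.Itoa(pid), "oom_score_adj")
	return ioutil.WriteFile(p, []byte(strconv.Itoa(value)), 0700)
}

func main() {
	// Example: pin the current process at -999, as the policy above prescribes.
	if err := setOOMScoreAdj(os.Getpid(), -999); err != nil {
		fmt.Fprintln(os.Stderr, err)
	}
}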
## Implementation concerns


@@ -326,6 +326,7 @@ func (cm *containerManagerImpl) setupNode() error {
systemContainers := []*systemContainer{}
if cm.ContainerRuntime == "docker" {
dockerVersion := getDockerVersion(cm.cadvisorInterface)
if cm.RuntimeCgroupsName != "" {
cont := newSystemCgroups(cm.RuntimeCgroupsName)
var capacity = api.ResourceList{}
@@ -351,13 +352,16 @@ func (cm *containerManagerImpl) setupNode() error {
},
},
}
dockerVersion := getDockerVersion(cm.cadvisorInterface)
cont.ensureStateFunc = func(manager *fs.Manager) error {
return ensureDockerInContainer(dockerVersion, -900, dockerContainer)
return ensureDockerInContainer(dockerVersion, qos.DockerOOMScoreAdj, dockerContainer)
}
systemContainers = append(systemContainers, cont)
} else {
cm.periodicTasks = append(cm.periodicTasks, func() {
if err := ensureDockerInContainer(dockerVersion, qos.DockerOOMScoreAdj, nil); err != nil {
glog.Error(err)
return
}
cont, err := getContainerNameForProcess(dockerProcessName, dockerPidFile)
if err != nil {
glog.Error(err)
@@ -401,11 +405,15 @@ func (cm *containerManagerImpl) setupNode() error {
},
}
cont.ensureStateFunc = func(_ *fs.Manager) error {
return manager.Apply(os.Getpid())
return ensureProcessInContainerWithOOMScore(os.Getpid(), qos.KubeletOOMScoreAdj, &manager)
}
systemContainers = append(systemContainers, cont)
} else {
cm.periodicTasks = append(cm.periodicTasks, func() {
if err := ensureProcessInContainerWithOOMScore(os.Getpid(), qos.KubeletOOMScoreAdj, nil); err != nil {
glog.Error(err)
return
}
cont, err := getContainer(os.Getpid())
if err != nil {
glog.Errorf("failed to find cgroups of kubelet - %v", err)
@@ -578,7 +586,7 @@ func ensureDockerInContainer(dockerVersion semver.Version, oomScoreAdj int, mana
// Move if the pid is not already in the desired container.
for _, pid := range pids {
if err := ensureProcessInContainer(pid, oomScoreAdj, manager); err != nil {
if err := ensureProcessInContainerWithOOMScore(pid, oomScoreAdj, manager); err != nil {
errs = append(errs, fmt.Errorf("errors moving %q pid: %v", proc.name, err))
}
}
@@ -586,12 +594,13 @@ func ensureDockerInContainer(dockerVersion semver.Version, oomScoreAdj int, mana
return utilerrors.NewAggregate(errs)
}
func ensureProcessInContainer(pid int, oomScoreAdj int, manager *fs.Manager) error {
func ensureProcessInContainerWithOOMScore(pid int, oomScoreAdj int, manager *fs.Manager) error {
if runningInHost, err := isProcessRunningInHost(pid); err != nil {
// Err on the side of caution. Avoid moving the docker daemon unless we are able to identify its context.
return err
} else if !runningInHost {
// Process is running inside a container. Don't touch that.
glog.V(2).Infof("pid %d is running in the host namespaces", pid)
return nil
}
@@ -601,17 +610,18 @@ func ensureProcessInContainer(pid int, oomScoreAdj int, manager *fs.Manager) err
errs = append(errs, fmt.Errorf("failed to find container of PID %d: %v", pid, err))
}
if cont != manager.Cgroups.Name {
if manager != nil && cont != manager.Cgroups.Name {
err = manager.Apply(pid)
if err != nil {
errs = append(errs, fmt.Errorf("failed to move PID %d (in %q) to %q", pid, cont, manager.Cgroups.Name))
errs = append(errs, fmt.Errorf("failed to move PID %d (in %q) to %q: %v", pid, cont, manager.Cgroups.Name, err))
}
}
// Also apply oom-score-adj to processes
oomAdjuster := oom.NewOOMAdjuster()
if err := oomAdjuster.ApplyOOMScoreAdj(pid, oomScoreAdj); err != nil {
errs = append(errs, fmt.Errorf("failed to apply oom score %d to PID %d", oomScoreAdj, pid))
glog.V(3).Infof("Failed to apply oom_score_adj %d for pid %d: %v", oomScoreAdj, pid, err)
errs = append(errs, fmt.Errorf("failed to apply oom score %d to PID %d: %v", oomScoreAdj, pid, err))
}
return utilerrors.NewAggregate(errs)
}


@@ -21,8 +21,9 @@ import (
)
const (
PodInfraOOMAdj int = -999
PodInfraOOMAdj int = -998
KubeletOOMScoreAdj int = -999
DockerOOMScoreAdj int = -999
KubeProxyOOMScoreAdj int = -999
guaranteedOOMScoreAdj int = -998
besteffortOOMScoreAdj int = 1000
@@ -53,10 +54,10 @@ func GetContainerOOMScoreAdjust(pod *api.Pod, container *api.Container, memoryCa
// Note that this is a heuristic, it won't work if a container has many small processes.
memoryRequest := container.Resources.Requests.Memory().Value()
oomScoreAdjust := 1000 - (1000*memoryRequest)/memoryCapacity
// A guaranteed pod using 100% of memory can have an OOM score of 1. Ensure
// A guaranteed pod using 100% of memory can have an OOM score of 10. Ensure
// that burstable pods have a higher OOM score adjustment.
if oomScoreAdjust < 2 {
return 2
if int(oomScoreAdjust) < (1000 + guaranteedOOMScoreAdj) {
return (1000 + guaranteedOOMScoreAdj)
}
// Give burstable pods a higher chance of survival over besteffort pods.
if int(oomScoreAdjust) == besteffortOOMScoreAdj {
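As a worked example of the heuristic above (request and capacity values are illustrative): a burstable container requesting 50Mi on a 2Gi node gets 1000 - (1000*52428800)/2147483648 = 1000 - 24 = 976, while a request close to capacity is clamped to 1000 + guaranteedOOMScoreAdj = 2, keeping every burstable pod strictly above the guaranteed tier (-998):

package main

import "fmt"

func main() {
	// Illustrative values: 50Mi requested on a 2Gi-capacity node.
	memoryRequest := int64(50 * 1024 * 1024)
	memoryCapacity := int64(2 * 1024 * 1024 * 1024)
	oomScoreAdjust := 1000 - (1000*memoryRequest)/memoryCapacity
	fmt.Println(oomScoreAdjust) // 976: killed well before guaranteed pods (-998)
}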


@@ -20,6 +20,7 @@ package oom
import (
"fmt"
"io/ioutil"
"os"
"path"
"strconv"
@@ -65,28 +66,24 @@ func applyOOMScoreAdj(pid int, oomScoreAdj int) error {
maxTries := 2
oomScoreAdjPath := path.Join("/proc", pidStr, "oom_score_adj")
value := strconv.Itoa(oomScoreAdj)
glog.V(4).Infof("attempting to set %q to %q", oomScoreAdjPath, value)
var err error
for i := 0; i < maxTries; i++ {
f, err := os.Open(oomScoreAdjPath)
err = ioutil.WriteFile(oomScoreAdjPath, []byte(value), 0700)
if err != nil {
if os.IsNotExist(err) {
glog.V(2).Infof("%q does not exist", oomScoreAdjPath)
return os.ErrNotExist
}
err = fmt.Errorf("failed to apply oom-score-adj to pid %d (%v)", pid, err)
continue
}
if _, err := f.Write([]byte(value)); err != nil {
// we can ignore the return value of f.Close() here.
f.Close()
err = fmt.Errorf("failed to apply oom-score-adj to pid %d (%v)", pid, err)
continue
}
if err = f.Close(); err != nil {
err = fmt.Errorf("failed to apply oom-score-adj to pid %d (%v)", pid, err)
glog.V(3).Info(err)
continue
}
return nil
}
if err != nil {
glog.V(2).Infof("failed to set %q to %q: %v", oomScoreAdjPath, value, err)
}
return err
}
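Callers reach this retry loop through the oom package's adjuster, as at the ensureProcessInContainerWithOOMScore call site above. A minimal usage sketch (the standalone wiring is illustrative):

package main

import (
	"os"

	"github.com/golang/glog"
	"k8s.io/kubernetes/pkg/util/oom"
)

func main() {
	// Pin the current process at -999, mirroring what the kubelet does for
	// itself and for the Docker daemon at bootstrap.
	oomAdjuster := oom.NewOOMAdjuster()
	if err := oomAdjuster.ApplyOOMScoreAdj(os.Getpid(), -999); err != nil {
		glog.Error(err)
	}
}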


@@ -18,9 +18,14 @@ package e2e_node
import (
"fmt"
"os/exec"
"path"
"strconv"
"strings"
"time"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/api/resource"
"k8s.io/kubernetes/pkg/util/uuid"
"k8s.io/kubernetes/test/e2e/framework"
@@ -28,63 +33,193 @@ import (
. "github.com/onsi/gomega"
)
var _ = framework.KubeDescribe("Kubelet Container Manager", func() {
const (
kubeletProcessName = "kubelet"
)
func getOOMScoreForPid(pid int) (int, error) {
procfsPath := path.Join("/proc", strconv.Itoa(pid), "oom_score_adj")
out, err := exec.Command("sudo", "cat", procfsPath).CombinedOutput()
if err != nil {
return 0, err
}
return strconv.Atoi(strings.TrimSpace(string(out)))
}
func validateOOMScoreAdjSetting(pid int, expectedOOMScoreAdj int) error {
oomScore, err := getOOMScoreForPid(pid)
if err != nil {
return fmt.Errorf("failed to get oom_score_adj for %d: %v", pid, err)
}
if expectedOOMScoreAdj != oomScore {
return fmt.Errorf("expected pid %d's oom_score_adj to be %d; found %d", pid, expectedOOMScoreAdj, oomScore)
}
return nil
}
func validateOOMScoreAdjSettingIsInRange(pid int, expectedMinOOMScoreAdj, expectedMaxOOMScoreAdj int) error {
oomScore, err := getOOMScoreForPid(pid)
if err != nil {
return fmt.Errorf("failed to get oom_score_adj for %d", pid)
}
if oomScore < expectedMinOOMScoreAdj {
return fmt.Errorf("expected pid %d's oom_score_adj to be >= %d; found %d", pid, expectedMinOOMScoreAdj, oomScore)
}
if oomScore >= expectedMaxOOMScoreAdj {
return fmt.Errorf("expected pid %d's oom_score_adj to be < %d; found %d", pid, expectedMaxOOMScoreAdj, oomScore)
}
return nil
}
var _ = framework.KubeDescribe("Kubelet Container Manager [Serial]", func() {
f := framework.NewDefaultFramework("kubelet-container-manager")
var podClient *framework.PodClient
BeforeEach(func() {
podClient = f.PodClient()
})
Describe("oom score adjusting", func() {
Context("when scheduling a busybox command that always fails in a pod", func() {
var podName string
BeforeEach(func() {
podName = "bin-false" + string(uuid.NewUUID())
Describe("Validate OOM score adjustments", func() {
Context("once the node is setup", func() {
It("docker daemon's oom-score-adj should be -999", func() {
dockerPids, err := getPidsForProcess(dockerProcessName, dockerPidFile)
Expect(err).To(BeNil(), "failed to get list of docker daemon pids")
for _, pid := range dockerPids {
Eventually(func() error {
return validateOOMScoreAdjSetting(pid, -999)
}, 5*time.Minute, 30*time.Second).Should(BeNil())
}
})
It("Kubelet's oom-score-adj should be -999", func() {
kubeletPids, err := getPidsForProcess(kubeletProcessName, "")
Expect(err).To(BeNil(), "failed to get list of kubelet pids")
Expect(len(kubeletPids)).To(Equal(1), "expected only one kubelet process; found %d", len(kubeletPids))
Eventually(func() error {
return validateOOMScoreAdjSetting(kubeletPids[0], -999)
}, 5*time.Minute, 30*time.Second).Should(BeNil())
})
It("pod infra containers oom-score-adj should be -998 and best effort container's should be 1000", func() {
var err error
podClient := f.PodClient()
podName := "besteffort" + string(uuid.NewUUID())
podClient.Create(&api.Pod{
ObjectMeta: api.ObjectMeta{
Name: podName,
},
Spec: api.PodSpec{
// Don't restart the Pod since it is expected to exit
RestartPolicy: api.RestartPolicyNever,
Containers: []api.Container{
{
Image: ImageRegistry[busyBoxImage],
Name: podName,
Command: []string{"/bin/false"},
Image: ImageRegistry[serveHostnameImage],
Name: podName,
},
},
},
})
})
It("should have an error terminated reason", func() {
var pausePids []int
By("checking infra container's oom-score-adj")
Eventually(func() error {
podData, err := podClient.Get(podName)
pausePids, err = getPidsForProcess("pause", "")
if err != nil {
return err
return fmt.Errorf("failed to get list of pause pids: %v", err)
}
if len(podData.Status.ContainerStatuses) != 1 {
return fmt.Errorf("expected only one container in the pod %q", podName)
}
contTerminatedState := podData.Status.ContainerStatuses[0].State.Terminated
if contTerminatedState == nil {
return fmt.Errorf("expected state to be terminated. Got pod status: %+v", podData.Status)
}
if contTerminatedState.Reason != "Error" {
return fmt.Errorf("expected terminated state reason to be error. Got %+v", contTerminatedState)
for _, pid := range pausePids {
if err := validateOOMScoreAdjSetting(pid, -998); err != nil {
return err
}
}
return nil
}, time.Minute, time.Second*4).Should(BeNil())
})
}, 2*time.Minute, time.Second*4).Should(BeNil())
var shPids []int
By("checking besteffort container's oom-score-adj")
Eventually(func() error {
shPids, err = getPidsForProcess("serve_hostname", "")
if err != nil {
return fmt.Errorf("failed to get list of serve hostname process pids: %v", err)
}
if len(shPids) != 1 {
return fmt.Errorf("expected only one serve_hostname process; found %d", len(shPids))
}
return validateOOMScoreAdjSetting(shPids[0], 1000)
}, 2*time.Minute, time.Second*4).Should(BeNil())
It("should be possible to delete", func() {
err := podClient.Delete(podName, &api.DeleteOptions{})
Expect(err).To(BeNil(), fmt.Sprintf("Error deleting Pod %v", err))
})
It("guaranteed container's oom-score-adj should be -998", func() {
podClient := f.PodClient()
podName := "guaranteed" + string(uuid.NewUUID())
podClient.Create(&api.Pod{
ObjectMeta: api.ObjectMeta{
Name: podName,
},
Spec: api.PodSpec{
Containers: []api.Container{
{
Image: ImageRegistry[nginxImage],
Name: podName,
Resources: api.ResourceRequirements{
Limits: api.ResourceList{
"cpu": resource.MustParse("100m"),
"memory": resource.MustParse("50Mi"),
},
},
},
},
},
})
var (
ngPids []int
err error
)
Eventually(func() error {
ngPids, err = getPidsForProcess("nginx", "")
if err != nil {
return fmt.Errorf("failed to get list of nginx process pids: %v", err)
}
for _, pid := range ngPids {
if err := validateOOMScoreAdjSetting(pid, -998); err != nil {
return err
}
}
return nil
}, 2*time.Minute, time.Second*4).Should(BeNil())
})
It("burstable container's oom-score-adj should be between [2, 1000)", func() {
podClient := f.PodClient()
podName := "burstable" + string(uuid.NewUUID())
podClient.Create(&api.Pod{
ObjectMeta: api.ObjectMeta{
Name: podName,
},
Spec: api.PodSpec{
Containers: []api.Container{
{
Image: ImageRegistry[testWebServer],
Name: podName,
Resources: api.ResourceRequirements{
Requests: api.ResourceList{
"cpu": resource.MustParse("100m"),
"memory": resource.MustParse("50Mi"),
},
},
},
},
},
})
var (
wsPids []int
err error
)
Eventually(func() error {
wsPids, err = getPidsForProcess("test-webserver", "")
if err != nil {
return fmt.Errorf("failed to get list of test-webserver process pids: %v", err)
}
for _, pid := range wsPids {
if err := validateOOMScoreAdjSettingIsInRange(pid, 2, 1000); err != nil {
return err
}
}
return nil
}, 2*time.Minute, time.Second*4).Should(BeNil())
// TODO: Test the oom-score-adj logic for burstable more accurately.
})
})
})
})


@@ -75,7 +75,54 @@ var _ = framework.KubeDescribe("Kubelet", func() {
}, time.Minute, time.Second*4).Should(Equal("Hello World\n"))
})
})
Context("when scheduling a busybox command that always fails in a pod", func() {
var podName string
BeforeEach(func() {
podName = "bin-false" + string(uuid.NewUUID())
podClient.Create(&api.Pod{
ObjectMeta: api.ObjectMeta{
Name: podName,
},
Spec: api.PodSpec{
// Don't restart the Pod since it is expected to exit
RestartPolicy: api.RestartPolicyNever,
Containers: []api.Container{
{
Image: ImageRegistry[busyBoxImage],
Name: podName,
Command: []string{"/bin/false"},
},
},
},
})
})
It("should have an error terminated reason", func() {
Eventually(func() error {
podData, err := podClient.Get(podName)
if err != nil {
return err
}
if len(podData.Status.ContainerStatuses) != 1 {
return fmt.Errorf("expected only one container in the pod %q", podName)
}
contTerminatedState := podData.Status.ContainerStatuses[0].State.Terminated
if contTerminatedState == nil {
return fmt.Errorf("expected state to be terminated. Got pod status: %+v", podData.Status)
}
if contTerminatedState.Reason != "Error" {
return fmt.Errorf("expected terminated state reason to be error. Got %+v", contTerminatedState)
}
return nil
}, time.Minute, time.Second*4).Should(BeNil())
})
It("should be possible to delete", func() {
err := podClient.Delete(podName, &api.DeleteOptions{})
Expect(err).To(BeNil(), fmt.Sprintf("Error deleting Pod %v", err))
})
})
Context("when scheduling a read only busybox container", func() {
podName := "busybox-readonly-fs" + string(uuid.NewUUID())
It("it should not write to root filesystem", func() {


@@ -24,7 +24,6 @@ import (
"io/ioutil"
"log"
"os"
"os/exec"
"sort"
"strconv"
"strings"
@@ -38,6 +37,7 @@ import (
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/kubelet/api/v1alpha1/stats"
"k8s.io/kubernetes/pkg/labels"
"k8s.io/kubernetes/pkg/util/procfs"
"k8s.io/kubernetes/pkg/util/runtime"
"k8s.io/kubernetes/pkg/util/uuid"
"k8s.io/kubernetes/pkg/util/wait"
@@ -450,19 +450,16 @@ const (
containerdPidFile = "/run/docker/libcontainerd/docker-containerd.pid"
)
func getContainerNameForProcess(name, pidFile string) (string, error) {
pids, err := getPidsForProcess(name, pidFile)
if err != nil {
return "", fmt.Errorf("failed to detect process id for %q - %v", name, err)
func getPidsForProcess(name, pidFile string) ([]int, error) {
if len(pidFile) > 0 {
if pid, err := getPidFromPidFile(pidFile); err == nil {
return []int{pid}, nil
} else {
// log the error and fall back to pidof
runtime.HandleError(err)
}
}
if len(pids) == 0 {
return "", nil
}
cont, err := getContainer(pids[0])
if err != nil {
return "", err
}
return cont, nil
return procfs.PidOf(name)
}
func getPidFromPidFile(pidFile string) (int, error) {
@@ -485,31 +482,19 @@ func getPidFromPidFile(pidFile string) (int, error) {
return pid, nil
}
func getPidsForProcess(name, pidFile string) ([]int, error) {
if len(pidFile) > 0 {
if pid, err := getPidFromPidFile(pidFile); err == nil {
return []int{pid}, nil
} else {
// log the error and fall back to pidof
runtime.HandleError(err)
}
}
out, err := exec.Command("pidof", name).Output()
func getContainerNameForProcess(name, pidFile string) (string, error) {
pids, err := getPidsForProcess(name, pidFile)
if err != nil {
return []int{}, fmt.Errorf("failed to find pid of %q: %v", name, err)
return "", fmt.Errorf("failed to detect process id for %q - %v", name, err)
}
// The output of pidof is a list of pids.
pids := []int{}
for _, pidStr := range strings.Split(strings.TrimSpace(string(out)), " ") {
pid, err := strconv.Atoi(pidStr)
if err != nil {
continue
}
pids = append(pids, pid)
if len(pids) == 0 {
return "", nil
}
return pids, nil
cont, err := getContainer(pids[0])
if err != nil {
return "", err
}
return cont, nil
}
// getContainer returns the cgroup associated with the specified pid.