Merge pull request #23894 from vishh/oom-score-adj-error

Automatic merge from submit-queue

Do not throw creation errors for containers that fail immediately after being started

Fixes (hopefully) #23607 

cc @dchen1107
pull/6/head
k8s-merge-robot 2016-04-14 22:29:14 -07:00
commit 2382fec956
4 changed files with 123 additions and 35 deletions

View File

@ -20,7 +20,6 @@ package oom
import (
"fmt"
"io/ioutil"
"os"
"path"
"strconv"
@ -49,16 +48,6 @@ func getPids(cgroupName string) ([]int, error) {
return fsManager.GetPids()
}
func syscallNotExists(err error) bool {
if err == nil {
return false
}
if e, ok := err.(*os.SyscallError); ok && os.IsNotExist(e) {
return true
}
return false
}
// Writes 'value' to /proc/<pid>/oom_score_adj. PID = 0 means self
// Returns os.ErrNotExist if the `pid` does not exist.
func applyOOMScoreAdj(pid int, oomScoreAdj int) error {
@ -78,12 +67,19 @@ func applyOOMScoreAdj(pid int, oomScoreAdj int) error {
value := strconv.Itoa(oomScoreAdj)
var err error
for i := 0; i < maxTries; i++ {
if err = ioutil.WriteFile(oomScoreAdjPath, []byte(value), 0700); err != nil {
if syscallNotExists(err) {
f, err := os.Open(oomScoreAdjPath)
if err != nil {
if os.IsNotExist(err) {
return os.ErrNotExist
}
err = fmt.Errorf("failed to apply oom-score-adj to pid %d (%v)", pid, err)
continue
}
if _, err := f.Write([]byte(value)); err != nil {
err = fmt.Errorf("failed to apply oom-score-adj to pid %d (%v)", pid, err)
continue
}
return nil
}
return err
}
@ -96,20 +92,26 @@ func (oomAdjuster *OOMAdjuster) applyOOMScoreAdjContainer(cgroupName string, oom
continueAdjusting := false
pidList, err := oomAdjuster.pidLister(cgroupName)
if err != nil {
if syscallNotExists(err) {
if os.IsNotExist(err) {
// Nothing to do since the container doesn't exist anymore.
return os.ErrNotExist
}
continueAdjusting = true
glog.Errorf("Error getting process list for cgroup %s: %+v", cgroupName, err)
glog.V(10).Infof("Error getting process list for cgroup %s: %+v", cgroupName, err)
} else if len(pidList) == 0 {
glog.V(10).Infof("Pid list is empty")
continueAdjusting = true
} else {
for _, pid := range pidList {
if !adjustedProcessSet[pid] {
continueAdjusting = true
glog.V(10).Infof("pid %d needs to be set", pid)
if err = oomAdjuster.ApplyOOMScoreAdj(pid, oomScoreAdj); err == nil {
adjustedProcessSet[pid] = true
} else if err == os.ErrNotExist {
continue
} else {
glog.V(10).Infof("cannot adjust oom score for pid %d - %v", pid, err)
continueAdjusting = true
}
// Processes can come and go while we try to apply oom score adjust value. So ignore errors here.
}

View File

@ -19,7 +19,10 @@ limitations under the License.
package oom
import (
"os"
"testing"
"github.com/stretchr/testify/assert"
)
// Converts a sequence of PID lists into a PID lister.
@ -62,10 +65,9 @@ func applyOOMScoreAdjContainerTester(pidListSequence [][]int, maxTries int, appl
} else if err != nil {
return
}
// Check that OOM scores were applied to the right processes.
if len(appliedPids) != len(pidOOMs) {
t.Errorf("Applied OOM scores to incorrect number of processes")
t.Errorf("Applied OOM scores to incorrect number of processes - %+v vs %v", appliedPids, pidOOMs)
return
}
for _, pid := range appliedPids {
@ -82,29 +84,21 @@ func TestOOMScoreAdjContainer(t *testing.T) {
pidListSequence1 := [][]int{
{1, 2},
}
applyOOMScoreAdjContainerTester(pidListSequence1, 1, nil, true, t)
applyOOMScoreAdjContainerTester(pidListSequence1, 2, []int{1, 2}, false, t)
applyOOMScoreAdjContainerTester(pidListSequence1, 3, []int{1, 2}, false, t)
pidListSequence3 := [][]int{
{1, 2},
{1, 2, 4, 5},
{2, 1, 4, 5, 3},
}
applyOOMScoreAdjContainerTester(pidListSequence3, 1, nil, true, t)
applyOOMScoreAdjContainerTester(pidListSequence3, 2, nil, true, t)
applyOOMScoreAdjContainerTester(pidListSequence3, 3, nil, true, t)
applyOOMScoreAdjContainerTester(pidListSequence3, 4, []int{1, 2, 3, 4, 5}, false, t)
applyOOMScoreAdjContainerTester(pidListSequence1, 1, []int{1, 2}, false, t)
pidListSequenceLag := [][]int{
{},
{},
{},
{1, 2, 4},
{1, 2, 4, 5},
}
for i := 1; i < 5; i++ {
for i := 1; i < 4; i++ {
applyOOMScoreAdjContainerTester(pidListSequenceLag, i, nil, true, t)
}
applyOOMScoreAdjContainerTester(pidListSequenceLag, 6, []int{1, 2, 4, 5}, false, t)
applyOOMScoreAdjContainerTester(pidListSequenceLag, 4, []int{1, 2, 4}, false, t)
}
func TestPidListerFailure(t *testing.T) {
_, err := getPids("/does/not/exist")
assert.True(t, os.IsNotExist(err), "expected getPids to return not exists error. Got %v", err)
}

View File

@ -49,7 +49,7 @@ func (pfs *ProcFS) GetFullContainerName(pid int) (string, error) {
filePath := path.Join("/proc", strconv.Itoa(pid), "cgroup")
content, err := ioutil.ReadFile(filePath)
if err != nil {
if e, ok := err.(*os.SyscallError); ok && os.IsNotExist(e) {
if os.IsNotExist(err) {
return "", os.ErrNotExist
}
return "", err

View File

@ -0,0 +1,92 @@
/*
Copyright 2016 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package e2e_node
import (
"fmt"
"time"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/client/restclient"
client "k8s.io/kubernetes/pkg/client/unversioned"
. "github.com/onsi/ginkgo"
. "github.com/onsi/gomega"
)
var _ = Describe("Kubelet Container Manager", func() {
var cl *client.Client
BeforeEach(func() {
// Setup the apiserver client
cl = client.NewOrDie(&restclient.Config{Host: *apiServerAddress})
})
Describe("oom score adjusting", func() {
namespace := "oom-adj"
Context("when scheduling a busybox command that always fails in a pod", func() {
podName := "bin-false"
It("it should return succes", func() {
pod := &api.Pod{
ObjectMeta: api.ObjectMeta{
Name: podName,
Namespace: namespace,
},
Spec: api.PodSpec{
// Force the Pod to schedule to the node without a scheduler running
NodeName: *nodeName,
// Don't restart the Pod since it is expected to exit
RestartPolicy: api.RestartPolicyNever,
Containers: []api.Container{
{
Image: "gcr.io/google_containers/busybox:1.24",
Name: podName,
Command: []string{"/bin/false"},
},
},
},
}
_, err := cl.Pods(namespace).Create(pod)
Expect(err).To(BeNil(), fmt.Sprintf("Error creating Pod %v", err))
})
It("it should have an error terminated reason", func() {
Eventually(func() error {
podData, err := cl.Pods(namespace).Get(podName)
if err != nil {
return err
}
if len(podData.Status.ContainerStatuses) != 1 {
return fmt.Errorf("expected only one container in the pod %q", podName)
}
contTerminatedState := podData.Status.ContainerStatuses[0].State.Terminated
if contTerminatedState == nil {
return fmt.Errorf("expected state to be terminated. Got pod status: %+v", podData.Status)
}
if contTerminatedState.Reason != "Error" {
return fmt.Errorf("expected terminated state reason to be error. Got %+v", contTerminatedState)
}
return nil
}, time.Minute, time.Second*4).Should(BeNil())
})
It("it should be possible to delete", func() {
err := cl.Pods(namespace).Delete(podName, &api.DeleteOptions{})
Expect(err).To(BeNil(), fmt.Sprintf("Error deleting Pod %v", err))
})
})
})
})