mirror of https://github.com/k3s-io/k3s
Do not count failed pods as unready in HPA controller
Currently, when performing a scale-up, any failed pods (which can be present, for example, after evictions performed by the kubelet) are treated as unready. Unready pods are treated as if they had 0% utilization, which slows down or can even block scale-up. After this change, failed pods are ignored in all calculations, so they influence neither scale-up nor scale-down replica calculations.
parent 875f8c308f
commit e5f8bfa023
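The mechanism is easiest to see in the HPA proportionality rule, desired = ceil(currentReplicas * currentUtilization / targetUtilization). Below is a minimal, self-contained sketch of that rule (the desired helper and its flattened inputs are illustrative only, not the controller's API), using the numbers from the scale-up test in this commit:

package main

import (
	"fmt"
	"math"
)

// desired applies the HPA proportionality rule. Each entry in
// utilizations is one pod's CPU usage as a percentage of its request.
func desired(currentReplicas int, utilizations []float64, target float64) int {
	sum := 0.0
	for _, u := range utilizations {
		sum += u
	}
	avg := sum / float64(len(utilizations))
	return int(math.Ceil(float64(currentReplicas) * avg / target))
}

func main() {
	// After this change: only the two running pods (50% and 70% CPU)
	// are averaged, so desired = ceil(2 * 60 / 30) = 4.
	fmt.Println(desired(2, []float64{50, 70}, 30))

	// Before: two failed pods counted as unready contribute 0% each,
	// holding the average at exactly the 30% target, so desired = 2
	// and the scale-up is blocked.
	fmt.Println(desired(2, []float64{50, 70, 0, 0}, 30))
}

The first call prints 4 while the second prints 2: counting failed pods as 0%-utilization unready pods suppresses the scale-up that the two busy pods actually justify.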
pkg/controller/podautoscaler/horizontal_test.go
@@ -103,6 +103,7 @@ type testCase struct {
 	reportedLevels       []uint64
 	reportedCPURequests  []resource.Quantity
 	reportedPodReadiness []v1.ConditionStatus
+	reportedPodPhase     []v1.PodPhase
 	scaleUpdated         bool
 	statusUpdated        bool
 	eventCreated         bool
@@ -250,10 +251,14 @@ func (tc *testCase) prepareTestClient(t *testing.T) (*fake.Clientset, *metricsfa
 		if tc.reportedPodReadiness != nil {
 			podReadiness = tc.reportedPodReadiness[i]
 		}
+		podPhase := v1.PodRunning
+		if tc.reportedPodPhase != nil {
+			podPhase = tc.reportedPodPhase[i]
+		}
 		podName := fmt.Sprintf("%s-%d", podNamePrefix, i)
 		pod := v1.Pod{
 			Status: v1.PodStatus{
-				Phase: v1.PodRunning,
+				Phase: podPhase,
 				Conditions: []v1.PodCondition{
 					{
 						Type: v1.PodReady,
@@ -713,6 +718,24 @@ func TestScaleUpUnreadyNoScale(t *testing.T) {
 	tc.runTest(t)
 }
 
+func TestScaleUpIgnoresFailedPods(t *testing.T) {
+	tc := testCase{
+		minReplicas:          2,
+		maxReplicas:          6,
+		initialReplicas:      2,
+		desiredReplicas:      4,
+		CPUTarget:            30,
+		CPUCurrent:           60,
+		verifyCPUCurrent:     true,
+		reportedLevels:       []uint64{500, 700},
+		reportedCPURequests:  []resource.Quantity{resource.MustParse("1.0"), resource.MustParse("1.0"), resource.MustParse("1.0"), resource.MustParse("1.0")},
+		reportedPodReadiness: []v1.ConditionStatus{v1.ConditionTrue, v1.ConditionTrue, v1.ConditionFalse, v1.ConditionFalse},
+		reportedPodPhase:     []v1.PodPhase{v1.PodRunning, v1.PodRunning, v1.PodFailed, v1.PodFailed},
+		useMetricsAPI:        true,
+	}
+	tc.runTest(t)
+}
+
 func TestScaleUpDeployment(t *testing.T) {
 	tc := testCase{
 		minReplicas: 2,
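With the failed pods ignored, TestScaleUpIgnoresFailedPods sees only the two running pods: 500m + 700m of usage against 2000m of requests is 60% versus the 30% target, so the controller asks for ceil(2 * 60/30) = 4 replicas. Previously the two failed pods would have entered the average at 0%, dragging it to exactly the 30% target and blocking the scale-up.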
@@ -1017,6 +1040,24 @@ func TestScaleDownIgnoresUnreadyPods(t *testing.T) {
 	tc.runTest(t)
 }
 
+func TestScaleDownIgnoresFailedPods(t *testing.T) {
+	tc := testCase{
+		minReplicas:          2,
+		maxReplicas:          6,
+		initialReplicas:      5,
+		desiredReplicas:      3,
+		CPUTarget:            50,
+		CPUCurrent:           28,
+		verifyCPUCurrent:     true,
+		reportedLevels:       []uint64{100, 300, 500, 250, 250},
+		reportedCPURequests:  []resource.Quantity{resource.MustParse("1.0"), resource.MustParse("1.0"), resource.MustParse("1.0"), resource.MustParse("1.0"), resource.MustParse("1.0"), resource.MustParse("1.0"), resource.MustParse("1.0")},
+		useMetricsAPI:        true,
+		reportedPodReadiness: []v1.ConditionStatus{v1.ConditionTrue, v1.ConditionTrue, v1.ConditionTrue, v1.ConditionTrue, v1.ConditionTrue, v1.ConditionFalse, v1.ConditionFalse},
+		reportedPodPhase:     []v1.PodPhase{v1.PodRunning, v1.PodRunning, v1.PodRunning, v1.PodRunning, v1.PodRunning, v1.PodFailed, v1.PodFailed},
+	}
+	tc.runTest(t)
+}
+
 func TestTolerance(t *testing.T) {
 	tc := testCase{
 		minReplicas: 1,
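The scale-down case is symmetric: only the five running pods are averaged, (100+300+500+250+250)m against 5000m of requests is 28% versus a 50% target, giving ceil(5 * 28/50) = 3 replicas; the two failed pods influence neither direction.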
pkg/controller/podautoscaler/replica_calculator.go
@@ -88,7 +88,11 @@ func (c *ReplicaCalculator) GetResourceReplicas(currentReplicas int32, targetUti
 
 		if pod.Status.Phase != v1.PodRunning || !podutil.IsPodReady(&pod) {
 			// save this pod name for later, but pretend it doesn't exist for now
-			unreadyPods.Insert(pod.Name)
+			if pod.Status.Phase != v1.PodFailed {
+				// Failed pods should not be counted as unready pods as they will
+				// not become running anymore.
+				unreadyPods.Insert(pod.Name)
+			}
 			delete(metrics, pod.Name)
 			continue
 		}
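The behavioral core of the commit is the filter in GetResourceReplicas above: failed pods now vanish from the calculation entirely, while other not-running or not-ready pods are still remembered as unready. A standalone sketch of that classification (the pod type and splitPods helper are hypothetical stand-ins, not the controller's types):

package main

import "fmt"

type phase string

const (
	running phase = "Running"
	pending phase = "Pending"
	failed  phase = "Failed"
)

type pod struct {
	name  string
	phase phase
	ready bool
}

// splitPods mirrors the new filtering: pods that are not running-and-ready
// are skipped, but only the non-failed ones are tracked as unready.
func splitPods(pods []pod) (ready, unready []string) {
	for _, p := range pods {
		if p.phase != running || !p.ready {
			if p.phase != failed {
				// Still has a chance to become ready, so remember it.
				unready = append(unready, p.name)
			}
			// Failed pods fall through without being recorded anywhere:
			// they no longer influence any calculation.
			continue
		}
		ready = append(ready, p.name)
	}
	return
}

func main() {
	ready, unready := splitPods([]pod{
		{"web-0", running, true},
		{"web-1", pending, false},
		{"web-2", failed, false},
	})
	fmt.Println(ready, unready) // [web-0] [web-1]; web-2 is ignored
}

ready feeds the utilization average, unready gets the conservative 0%-utilization treatment during scale-up, and a failed pod appears in neither set.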
pkg/controller/podautoscaler/replica_calculator_test.go
@@ -77,6 +77,7 @@ type replicaCalcTestCase struct {
 	metric *metricInfo
 
 	podReadiness []v1.ConditionStatus
+	podPhase     []v1.PodPhase
 }
 
 const (
@@ -90,15 +91,24 @@ func (tc *replicaCalcTestCase) prepareTestClient(t *testing.T) (*fake.Clientset,
 	fakeClient := &fake.Clientset{}
 	fakeClient.AddReactor("list", "pods", func(action core.Action) (handled bool, ret runtime.Object, err error) {
 		obj := &v1.PodList{}
-		for i := 0; i < int(tc.currentReplicas); i++ {
+		podsCount := int(tc.currentReplicas)
+		// Failed pods are not included in tc.currentReplicas
+		if tc.podPhase != nil && len(tc.podPhase) > podsCount {
+			podsCount = len(tc.podPhase)
+		}
+		for i := 0; i < podsCount; i++ {
 			podReadiness := v1.ConditionTrue
-			if tc.podReadiness != nil {
+			if tc.podReadiness != nil && i < len(tc.podReadiness) {
 				podReadiness = tc.podReadiness[i]
 			}
+			podPhase := v1.PodRunning
+			if tc.podPhase != nil {
+				podPhase = tc.podPhase[i]
+			}
 			podName := fmt.Sprintf("%s-%d", podNamePrefix, i)
 			pod := v1.Pod{
 				Status: v1.PodStatus{
-					Phase: v1.PodRunning,
+					Phase: podPhase,
 					Conditions: []v1.PodCondition{
 						{
 							Type: v1.PodReady,
@@ -405,6 +415,25 @@ func TestReplicaCalcScaleUpUnreadyNoScale(t *testing.T) {
 	tc.runTest(t)
 }
 
+func TestReplicaCalcScaleUpIgnoresFailedPods(t *testing.T) {
+	tc := replicaCalcTestCase{
+		currentReplicas:  2,
+		expectedReplicas: 4,
+		podReadiness:     []v1.ConditionStatus{v1.ConditionTrue, v1.ConditionTrue, v1.ConditionFalse, v1.ConditionFalse},
+		podPhase:         []v1.PodPhase{v1.PodRunning, v1.PodRunning, v1.PodFailed, v1.PodFailed},
+		resource: &resourceInfo{
+			name:     v1.ResourceCPU,
+			requests: []resource.Quantity{resource.MustParse("1.0"), resource.MustParse("1.0"), resource.MustParse("1.0"), resource.MustParse("1.0")},
+			levels:   []int64{500, 700},
+
+			targetUtilization:   30,
+			expectedUtilization: 60,
+			expectedValue:       numContainersPerPod * 600,
+		},
+	}
+	tc.runTest(t)
+}
+
 func TestReplicaCalcScaleUpCM(t *testing.T) {
 	tc := replicaCalcTestCase{
 		currentReplicas: 3,
@@ -610,6 +639,25 @@ func TestReplicaCalcScaleDownIgnoresUnreadyPods(t *testing.T) {
 	tc.runTest(t)
 }
 
+func TestReplicaCalcScaleDownIgnoresFailedPods(t *testing.T) {
+	tc := replicaCalcTestCase{
+		currentReplicas:  5,
+		expectedReplicas: 3,
+		podReadiness:     []v1.ConditionStatus{v1.ConditionTrue, v1.ConditionTrue, v1.ConditionTrue, v1.ConditionTrue, v1.ConditionTrue, v1.ConditionFalse, v1.ConditionFalse},
+		podPhase:         []v1.PodPhase{v1.PodRunning, v1.PodRunning, v1.PodRunning, v1.PodRunning, v1.PodRunning, v1.PodFailed, v1.PodFailed},
+		resource: &resourceInfo{
+			name:     v1.ResourceCPU,
+			requests: []resource.Quantity{resource.MustParse("1.0"), resource.MustParse("1.0"), resource.MustParse("1.0"), resource.MustParse("1.0"), resource.MustParse("1.0"), resource.MustParse("1.0"), resource.MustParse("1.0")},
+			levels:   []int64{100, 300, 500, 250, 250},
+
+			targetUtilization:   50,
+			expectedUtilization: 28,
+			expectedValue:       numContainersPerPod * 280,
+		},
+	}
+	tc.runTest(t)
+}
+
 func TestReplicaCalcTolerance(t *testing.T) {
	tc := replicaCalcTestCase{
 		currentReplicas: 3,