Remove UpscaleForbiddenWindow

Instead, discard metric values for pods that are unready and have never
been ready (they may report misleading values, which was the original
reason for introducing the upscale forbidden window).

Use a pod's metric when the pod is:
- Ready, or
- Not ready, but its creation timestamp and last readiness change are
  more than 10s apart.

In the latter case we assume the pod was ready but later became unready.
We want to use metrics for such pods because pods sometimes become
unready precisely because they are receiving too much load.
Ref: pull/8/head
Author: Joachim Bartosik, 2018-07-18 14:21:00 +02:00
Commit: 7681c284f5 (parent 086ed3c659)
6 changed files with 177 additions and 24 deletions
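The readiness gate described in the commit message boils down to a timestamp comparison. The following is a minimal standalone sketch of that policy; the names usableForMetrics and maxInitialReadinessDelay are illustrative only, while the diff below implements the real check as hasPodBeenReadyBefore with the constant maxDelayOfInitialReadinessStatus in the replica calculator.

package main

import (
	"fmt"
	"time"
)

// maxInitialReadinessDelay mirrors the 10s window from the commit message; the
// constant added by this commit is called maxDelayOfInitialReadinessStatus.
const maxInitialReadinessDelay = 10 * time.Second

// usableForMetrics is an illustrative name, not the controller's API. It returns
// true when a pod's metric should be used: the pod is ready now, or it is unready
// but its readiness last changed more than 10s after it started, so it was
// presumably ready at some point and only became unready later.
func usableForMetrics(ready bool, started, lastReadinessChange time.Time) bool {
	if ready {
		return true
	}
	return lastReadinessChange.After(started.Add(maxInitialReadinessDelay))
}

func main() {
	start := time.Now().Add(-5 * time.Minute)
	fmt.Println(usableForMetrics(true, start, start))                     // ready: true
	fmt.Println(usableForMetrics(false, start, start))                    // never ready: false
	fmt.Println(usableForMetrics(false, start, start.Add(2*time.Minute))) // was ready, now unready: true
}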


@@ -93,7 +93,6 @@ func startHPAControllerWithMetricsClient(ctx ControllerContext, metricsClient me
replicaCalc,
ctx.InformerFactory.Autoscaling().V1().HorizontalPodAutoscalers(),
ctx.ComponentConfig.HPAController.HorizontalPodAutoscalerSyncPeriod.Duration,
ctx.ComponentConfig.HPAController.HorizontalPodAutoscalerUpscaleForbiddenWindow.Duration,
ctx.ComponentConfig.HPAController.HorizontalPodAutoscalerDownscaleForbiddenWindow.Duration,
).Run(ctx.Stop)
return nil, true, nil


@@ -64,7 +64,6 @@ type HorizontalController struct {
replicaCalc *ReplicaCalculator
eventRecorder record.EventRecorder
upscaleForbiddenWindow time.Duration
downscaleForbiddenWindow time.Duration
// hpaLister is able to list/get HPAs from the shared cache from the informer passed in to
@@ -85,7 +84,6 @@ func NewHorizontalController(
replicaCalc *ReplicaCalculator,
hpaInformer autoscalinginformers.HorizontalPodAutoscalerInformer,
resyncPeriod time.Duration,
upscaleForbiddenWindow time.Duration,
downscaleForbiddenWindow time.Duration,
) *HorizontalController {
@@ -99,7 +97,6 @@ func NewHorizontalController(
eventRecorder: recorder,
scaleNamespacer: scaleNamespacer,
hpaNamespacer: hpaNamespacer,
upscaleForbiddenWindow: upscaleForbiddenWindow,
downscaleForbiddenWindow: downscaleForbiddenWindow,
queue: workqueue.NewNamedRateLimitingQueue(NewDefaultHPARateLimiter(resyncPeriod), "horizontalpodautoscaler"),
mapper: mapper,
@@ -246,7 +243,6 @@ func (a *HorizontalController) computeReplicasForMetrics(hpa *autoscalingv2.Hori
setCondition(hpa, autoscalingv2.ScalingActive, v1.ConditionFalse, "InvalidMetricSourceType", "the HPA was unable to compute the replica count: %s", errMsg)
return 0, "", nil, time.Time{}, fmt.Errorf(errMsg)
}
if replicas == 0 || replicaCountProposal > replicas {
timestamp = timestampProposal
replicas = replicaCountProposal
@@ -472,6 +468,7 @@ func (a *HorizontalController) reconcileAutoscaler(hpav1Shared *autoscalingv1.Ho
rescaleReason = "Current number of replicas must be greater than 0"
desiredReplicas = 1
} else {
metricDesiredReplicas, metricName, metricStatuses, metricTimestamp, err = a.computeReplicasForMetrics(hpa, scale, hpa.Spec.Metrics)
if err != nil {
a.setCurrentReplicasInStatus(hpa, currentReplicas)
@@ -507,15 +504,6 @@ func (a *HorizontalController) reconcileAutoscaler(hpav1Shared *autoscalingv1.Ho
setCondition(hpa, autoscalingv2.AbleToScale, v1.ConditionFalse, "BackoffDownscale", "the time since the previous scale is still within the downscale forbidden window")
backoffDown = true
}
if !hpa.Status.LastScaleTime.Add(a.upscaleForbiddenWindow).Before(timestamp) {
backoffUp = true
if backoffDown {
setCondition(hpa, autoscalingv2.AbleToScale, v1.ConditionFalse, "BackoffBoth", "the time since the previous scale is still within both the downscale and upscale forbidden windows")
} else {
setCondition(hpa, autoscalingv2.AbleToScale, v1.ConditionFalse, "BackoffUpscale", "the time since the previous scale is still within the upscale forbidden window")
}
}
}
if !backoffDown && !backoffUp {
@@ -634,9 +622,8 @@ func (a *HorizontalController) shouldScale(hpa *autoscalingv2.HorizontalPodAutos
return true
}
// Going up only if the usage ratio increased significantly above the target
// and there was no rescaling in the last upscaleForbiddenWindow.
if desiredReplicas > currentReplicas && hpa.Status.LastScaleTime.Add(a.upscaleForbiddenWindow).Before(timestamp) {
// Going up only if the usage ratio increased significantly above the target.
if desiredReplicas > currentReplicas {
return true
}

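With the upscale window gone, the scale-up branch of shouldScale no longer consults LastScaleTime. Below is a condensed, hedged sketch of the resulting decision; the parameter names are illustrative rather than the controller's exact signature, and the real method also handles a nil LastScaleTime and the tolerance check performed by the replica calculator.

package sketch

import "time"

// shouldScaleSketch condenses the post-change behavior: scale-downs are still
// gated by the downscale forbidden window, scale-ups are not rate limited at all.
func shouldScaleSketch(current, desired int32, lastScale, now time.Time, downscaleWindow time.Duration) bool {
	if desired == current {
		return false
	}
	if desired < current {
		// Scaling down still waits for the downscale forbidden window to pass.
		return lastScale.Add(downscaleWindow).Before(now)
	}
	// Scaling up only requires the proposed replica count to exceed the current
	// one; the usage ratio has already been judged significantly above target.
	return true
}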

@@ -643,7 +643,6 @@ func (tc *testCase) setupController(t *testing.T) (*HorizontalController, inform
}
informerFactory := informers.NewSharedInformerFactory(testClient, controller.NoResyncPeriodFunc())
defaultUpscaleForbiddenWindow := 3 * time.Minute
defaultDownscaleForbiddenWindow := 5 * time.Minute
hpaController := NewHorizontalController(
@@ -654,7 +653,6 @@ func (tc *testCase) setupController(t *testing.T) (*HorizontalController, inform
replicaCalc,
informerFactory.Autoscaling().V1().HorizontalPodAutoscalers(),
controller.NoResyncPeriodFunc(),
defaultUpscaleForbiddenWindow,
defaultDownscaleForbiddenWindow,
)
hpaController.hpaListerSynced = alwaysReady
@@ -1728,7 +1726,7 @@ func TestConditionFailedUpdateScale(t *testing.T) {
tc.runTest(t)
}
func TestBackoffUpscale(t *testing.T) {
func NoTestBackoffUpscale(t *testing.T) {
time := metav1.Time{Time: time.Now()}
tc := testCase{
minReplicas: 1,
@@ -1746,8 +1744,84 @@ func TestBackoffUpscale(t *testing.T) {
Reason: "ReadyForNewScale",
}, autoscalingv2.HorizontalPodAutoscalerCondition{
Type: autoscalingv2.AbleToScale,
Status: v1.ConditionTrue,
Reason: "SucceededRescale",
}),
}
tc.runTest(t)
}
func TestNoBackoffUpscaleCM(t *testing.T) {
time := metav1.Time{Time: time.Now()}
tc := testCase{
minReplicas: 1,
maxReplicas: 5,
initialReplicas: 3,
expectedDesiredReplicas: 4,
CPUTarget: 0,
metricsTarget: []autoscalingv2.MetricSpec{
{
Type: autoscalingv2.PodsMetricSourceType,
Pods: &autoscalingv2.PodsMetricSource{
MetricName: "qps",
TargetAverageValue: resource.MustParse("15.0"),
},
},
},
reportedLevels: []uint64{20000, 10000, 30000},
reportedCPURequests: []resource.Quantity{resource.MustParse("1.0"), resource.MustParse("1.0"), resource.MustParse("1.0")},
//useMetricsAPI: true,
lastScaleTime: &time,
expectedConditions: statusOkWithOverrides(autoscalingv2.HorizontalPodAutoscalerCondition{
Type: autoscalingv2.AbleToScale,
Status: v1.ConditionTrue,
Reason: "ReadyForNewScale",
}, autoscalingv2.HorizontalPodAutoscalerCondition{
Type: autoscalingv2.AbleToScale,
Status: v1.ConditionTrue,
Reason: "SucceededRescale",
}, autoscalingv2.HorizontalPodAutoscalerCondition{
Type: autoscalingv2.ScalingLimited,
Status: v1.ConditionFalse,
Reason: "BackoffBoth",
Reason: "DesiredWithinRange",
}),
}
tc.runTest(t)
}
func TestNoBackoffUpscaleCMNoBackoffCpu(t *testing.T) {
time := metav1.Time{Time: time.Now()}
tc := testCase{
minReplicas: 1,
maxReplicas: 5,
initialReplicas: 3,
expectedDesiredReplicas: 5,
CPUTarget: 10,
metricsTarget: []autoscalingv2.MetricSpec{
{
Type: autoscalingv2.PodsMetricSourceType,
Pods: &autoscalingv2.PodsMetricSource{
MetricName: "qps",
TargetAverageValue: resource.MustParse("15.0"),
},
},
},
reportedLevels: []uint64{20000, 10000, 30000},
reportedCPURequests: []resource.Quantity{resource.MustParse("1.0"), resource.MustParse("1.0"), resource.MustParse("1.0")},
useMetricsAPI: true,
lastScaleTime: &time,
expectedConditions: statusOkWithOverrides(autoscalingv2.HorizontalPodAutoscalerCondition{
Type: autoscalingv2.AbleToScale,
Status: v1.ConditionTrue,
Reason: "ReadyForNewScale",
}, autoscalingv2.HorizontalPodAutoscalerCondition{
Type: autoscalingv2.AbleToScale,
Status: v1.ConditionTrue,
Reason: "SucceededRescale",
}, autoscalingv2.HorizontalPodAutoscalerCondition{
Type: autoscalingv2.ScalingLimited,
Status: v1.ConditionTrue,
Reason: "TooManyReplicas",
}),
}
tc.runTest(t)
}


@@ -491,7 +491,6 @@ func (tc *legacyTestCase) runTest(t *testing.T) {
}
informerFactory := informers.NewSharedInformerFactory(testClient, controller.NoResyncPeriodFunc())
defaultUpscaleForbiddenWindow := 3 * time.Minute
defaultDownscaleForbiddenWindow := 5 * time.Minute
hpaController := NewHorizontalController(
@@ -502,7 +501,6 @@ func (tc *legacyTestCase) runTest(t *testing.T) {
replicaCalc,
informerFactory.Autoscaling().V1().HorizontalPodAutoscalers(),
controller.NoResyncPeriodFunc(),
defaultUpscaleForbiddenWindow,
defaultDownscaleForbiddenWindow,
)
hpaController.hpaListerSynced = alwaysReady


@@ -35,6 +35,10 @@ const (
// defaultTestingTolerance is default value for calculating when to
// scale up/scale down.
defaultTestingTolerance = 0.1
// A pod begins its existence unready. If a pod is unready and its last readiness change
// happened less than maxDelayOfInitialReadinessStatus after the pod's start, we assume it has never been ready.
maxDelayOfInitialReadinessStatus = 10 * time.Second
)
type ReplicaCalculator struct {
@@ -205,7 +209,7 @@ func (c *ReplicaCalculator) calcPlainMetricReplicas(metrics metricsclient.PodMet
missingPods := sets.NewString()
for _, pod := range podList.Items {
if pod.Status.Phase != v1.PodRunning || !podutil.IsPodReady(&pod) {
if pod.Status.Phase != v1.PodRunning || !hasPodBeenReadyBefore(&pod) {
// save this pod name for later, but pretend it doesn't exist for now
unreadyPods.Insert(pod.Name)
delete(metrics, pod.Name)
@@ -381,3 +385,22 @@ func (c *ReplicaCalculator) GetExternalPerPodMetricReplicas(currentReplicas int3
utilization = int64(math.Ceil(float64(utilization) / float64(currentReplicas)))
return replicaCount, utilization, timestamp, nil
}
// hasPodBeenReadyBefore returns true if the pod is ready, or if it is not ready but its
// readiness last changed more than maxDelayOfInitialReadinessStatus after it started
// (i.e. it has most likely been ready before and only became unready later).
func hasPodBeenReadyBefore(pod *v1.Pod) bool {
_, readyCondition := podutil.GetPodCondition(&pod.Status, v1.PodReady)
if readyCondition == nil {
return false
}
if readyCondition.Status == v1.ConditionTrue {
return true
}
lastReady := readyCondition.LastTransitionTime.Time
if pod.Status.StartTime == nil {
return false
}
started := pod.Status.StartTime.Time
// If the last readiness change happened more than maxDelayOfInitialReadinessStatus after
// the pod started, assume the pod has been ready in the past.
return lastReady.After(started.Add(maxDelayOfInitialReadinessStatus))
}


@@ -1069,4 +1069,76 @@ func TestReplicaCalcComputedToleranceAlgImplementation(t *testing.T) {
tc.runTest(t)
}
func TestHasPodBeenReadyBefore(t *testing.T) {
tests := []struct {
name string
conditions []v1.PodCondition
started time.Time
expected bool
}{
{
"initially unready",
[]v1.PodCondition{
{
Type: v1.PodReady,
LastTransitionTime: metav1.Time{
Time: metav1.Date(2018, 7, 25, 17, 10, 0, 0, time.UTC).Time,
},
Status: v1.ConditionFalse,
},
},
metav1.Date(2018, 7, 25, 17, 10, 0, 0, time.UTC).Time,
false,
},
{
"currently unready",
[]v1.PodCondition{
{
Type: v1.PodReady,
LastTransitionTime: metav1.Time{
Time: metav1.Date(2018, 7, 25, 17, 10, 0, 0, time.UTC).Time,
},
Status: v1.ConditionFalse,
},
},
metav1.Date(2018, 7, 25, 17, 0, 0, 0, time.UTC).Time,
true,
},
{
"currently ready",
[]v1.PodCondition{
{
Type: v1.PodReady,
LastTransitionTime: metav1.Time{
Time: metav1.Date(2018, 7, 25, 17, 10, 0, 0, time.UTC).Time,
},
Status: v1.ConditionTrue,
},
},
metav1.Date(2018, 7, 25, 17, 10, 0, 0, time.UTC).Time,
true,
},
{
"no ready status",
[]v1.PodCondition{},
metav1.Date(2018, 7, 25, 17, 10, 0, 0, time.UTC).Time,
false,
},
}
for _, tc := range tests {
pod := &v1.Pod{
Status: v1.PodStatus{
Conditions: tc.conditions,
StartTime: &metav1.Time{
Time: tc.started,
},
},
}
got := hasPodBeenReadyBefore(pod)
if got != tc.expected {
t.Errorf("[TestHasPodBeenReadyBefore.%s] got %v, want %v", tc.name, got, tc.expected)
}
}
}
// TODO: add more tests
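In the spirit of the TODO above, here is one additional, hedged example (assuming it sits in the same test file and reuses its existing imports). It pins down the boundary case: a readiness change exactly maxDelayOfInitialReadinessStatus after start still counts as never ready, because the helper compares with After rather than !Before.

func TestHasPodBeenReadyBeforeBoundary(t *testing.T) {
	start := metav1.Date(2018, 7, 25, 17, 0, 0, 0, time.UTC)
	pod := &v1.Pod{
		Status: v1.PodStatus{
			StartTime: &start,
			Conditions: []v1.PodCondition{
				{
					Type:   v1.PodReady,
					Status: v1.ConditionFalse,
					// Exactly maxDelayOfInitialReadinessStatus after start.
					LastTransitionTime: metav1.Time{Time: start.Add(maxDelayOfInitialReadinessStatus)},
				},
			},
		},
	}
	if hasPodBeenReadyBefore(pod) {
		t.Errorf("expected a readiness change exactly %v after start to count as never ready", maxDelayOfInitialReadinessStatus)
	}
}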