mirror of https://github.com/k3s-io/k3s
Remove UpscaleForbiddenWindow
Instead, discard metric values for pods that are unready and have never been ready (they may report misleading values, which was the original reason for introducing the scale-up forbidden window).

Use a pod's metric when the pod is:
- Ready, or
- Not ready, but its creation timestamp and last readiness change are more than 10s apart.

In the latter case we assume the pod was ready earlier and later became unready. We still want to use metrics for such pods, because pods are sometimes unready precisely because they are getting too much load.
parent 086ed3c659
commit 7681c284f5
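For orientation, a minimal sketch of the readiness heuristic the commit message describes. It mirrors the hasPodBeenReadyBefore helper and maxDelayOfInitialReadinessStatus constant added further down in this diff; the package name metricsfilter and the function name usePodMetric are invented for illustration only and are not part of the change.

// Hypothetical sketch, not the controller code itself.
package metricsfilter

import (
    "time"

    v1 "k8s.io/api/core/v1"
    podutil "k8s.io/kubernetes/pkg/api/v1/pod"
)

// A pod starts its life unready. If it is still unready and its last readiness
// transition happened within this delay of its start time, assume it has never
// been ready and discard its metric.
const maxDelayOfInitialReadinessStatus = 10 * time.Second

// usePodMetric reports whether the autoscaler should trust this pod's metric:
// true for ready pods, and for unready pods that appear to have been ready
// earlier (they may be unready simply because they are overloaded).
func usePodMetric(pod *v1.Pod) bool {
    _, ready := podutil.GetPodCondition(&pod.Status, v1.PodReady)
    if ready == nil {
        return false
    }
    if ready.Status == v1.ConditionTrue {
        return true
    }
    if pod.Status.StartTime == nil {
        return false
    }
    // Unready, but the readiness condition last changed well after the pod
    // started: it was probably ready once, so keep its metric.
    return ready.LastTransitionTime.Time.After(pod.Status.StartTime.Time.Add(maxDelayOfInitialReadinessStatus))
}

The 10s cut-off separates pods that are merely slow to become ready for the first time from pods that were serving and later flipped to unready, typically because of load.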
@@ -93,7 +93,6 @@ func startHPAControllerWithMetricsClient(ctx ControllerContext, metricsClient me
         replicaCalc,
         ctx.InformerFactory.Autoscaling().V1().HorizontalPodAutoscalers(),
         ctx.ComponentConfig.HPAController.HorizontalPodAutoscalerSyncPeriod.Duration,
-        ctx.ComponentConfig.HPAController.HorizontalPodAutoscalerUpscaleForbiddenWindow.Duration,
         ctx.ComponentConfig.HPAController.HorizontalPodAutoscalerDownscaleForbiddenWindow.Duration,
     ).Run(ctx.Stop)
     return nil, true, nil
@@ -64,7 +64,6 @@ type HorizontalController struct {
     replicaCalc *ReplicaCalculator
     eventRecorder record.EventRecorder

-    upscaleForbiddenWindow time.Duration
     downscaleForbiddenWindow time.Duration

     // hpaLister is able to list/get HPAs from the shared cache from the informer passed in to
@@ -85,7 +84,6 @@ func NewHorizontalController(
     replicaCalc *ReplicaCalculator,
     hpaInformer autoscalinginformers.HorizontalPodAutoscalerInformer,
     resyncPeriod time.Duration,
-    upscaleForbiddenWindow time.Duration,
     downscaleForbiddenWindow time.Duration,
 ) *HorizontalController {
@@ -99,7 +97,6 @@ func NewHorizontalController(
     eventRecorder: recorder,
     scaleNamespacer: scaleNamespacer,
     hpaNamespacer: hpaNamespacer,
-    upscaleForbiddenWindow: upscaleForbiddenWindow,
     downscaleForbiddenWindow: downscaleForbiddenWindow,
     queue: workqueue.NewNamedRateLimitingQueue(NewDefaultHPARateLimiter(resyncPeriod), "horizontalpodautoscaler"),
     mapper: mapper,
@@ -246,7 +243,6 @@ func (a *HorizontalController) computeReplicasForMetrics(hpa *autoscalingv2.Hori
     setCondition(hpa, autoscalingv2.ScalingActive, v1.ConditionFalse, "InvalidMetricSourceType", "the HPA was unable to compute the replica count: %s", errMsg)
     return 0, "", nil, time.Time{}, fmt.Errorf(errMsg)
 }
 if replicas == 0 || replicaCountProposal > replicas {
     timestamp = timestampProposal
     replicas = replicaCountProposal
@@ -472,6 +468,7 @@ func (a *HorizontalController) reconcileAutoscaler(hpav1Shared *autoscalingv1.Ho
     rescaleReason = "Current number of replicas must be greater than 0"
     desiredReplicas = 1
 } else {
     metricDesiredReplicas, metricName, metricStatuses, metricTimestamp, err = a.computeReplicasForMetrics(hpa, scale, hpa.Spec.Metrics)
     if err != nil {
         a.setCurrentReplicasInStatus(hpa, currentReplicas)
@@ -507,15 +504,6 @@ func (a *HorizontalController) reconcileAutoscaler(hpav1Shared *autoscalingv1.Ho
         setCondition(hpa, autoscalingv2.AbleToScale, v1.ConditionFalse, "BackoffDownscale", "the time since the previous scale is still within the downscale forbidden window")
         backoffDown = true
     }
-
-    if !hpa.Status.LastScaleTime.Add(a.upscaleForbiddenWindow).Before(timestamp) {
-        backoffUp = true
-        if backoffDown {
-            setCondition(hpa, autoscalingv2.AbleToScale, v1.ConditionFalse, "BackoffBoth", "the time since the previous scale is still within both the downscale and upscale forbidden windows")
-        } else {
-            setCondition(hpa, autoscalingv2.AbleToScale, v1.ConditionFalse, "BackoffUpscale", "the time since the previous scale is still within the upscale forbidden window")
-        }
-    }
 }

 if !backoffDown && !backoffUp {
@@ -634,9 +622,8 @@ func (a *HorizontalController) shouldScale(hpa *autoscalingv2.HorizontalPodAutos
     return true
 }

-// Going up only if the usage ratio increased significantly above the target
-// and there was no rescaling in the last upscaleForbiddenWindow.
-if desiredReplicas > currentReplicas && hpa.Status.LastScaleTime.Add(a.upscaleForbiddenWindow).Before(timestamp) {
+// Going up only if the usage ratio increased significantly above the target.
+if desiredReplicas > currentReplicas {
     return true
 }
@@ -643,7 +643,6 @@ func (tc *testCase) setupController(t *testing.T) (*HorizontalController, inform
 }

 informerFactory := informers.NewSharedInformerFactory(testClient, controller.NoResyncPeriodFunc())
-defaultUpscaleForbiddenWindow := 3 * time.Minute
 defaultDownscaleForbiddenWindow := 5 * time.Minute

 hpaController := NewHorizontalController(
@@ -654,7 +653,6 @@ func (tc *testCase) setupController(t *testing.T) (*HorizontalController, inform
     replicaCalc,
     informerFactory.Autoscaling().V1().HorizontalPodAutoscalers(),
     controller.NoResyncPeriodFunc(),
-    defaultUpscaleForbiddenWindow,
     defaultDownscaleForbiddenWindow,
 )
 hpaController.hpaListerSynced = alwaysReady
@@ -1728,7 +1726,7 @@ func TestConditionFailedUpdateScale(t *testing.T) {
     tc.runTest(t)
 }

-func TestBackoffUpscale(t *testing.T) {
+func NoTestBackoffUpscale(t *testing.T) {
     time := metav1.Time{Time: time.Now()}
     tc := testCase{
         minReplicas: 1,
@@ -1746,8 +1744,84 @@ func TestBackoffUpscale(t *testing.T) {
             Reason: "ReadyForNewScale",
         }, autoscalingv2.HorizontalPodAutoscalerCondition{
             Type: autoscalingv2.AbleToScale,
             Status: v1.ConditionTrue,
             Reason: "SucceededRescale",
         }),
     }
     tc.runTest(t)
 }

 func TestNoBackoffUpscaleCM(t *testing.T) {
     time := metav1.Time{Time: time.Now()}
     tc := testCase{
         minReplicas: 1,
         maxReplicas: 5,
         initialReplicas: 3,
         expectedDesiredReplicas: 4,
         CPUTarget: 0,
         metricsTarget: []autoscalingv2.MetricSpec{
             {
                 Type: autoscalingv2.PodsMetricSourceType,
                 Pods: &autoscalingv2.PodsMetricSource{
                     MetricName: "qps",
                     TargetAverageValue: resource.MustParse("15.0"),
                 },
             },
         },
         reportedLevels: []uint64{20000, 10000, 30000},
         reportedCPURequests: []resource.Quantity{resource.MustParse("1.0"), resource.MustParse("1.0"), resource.MustParse("1.0")},
         //useMetricsAPI: true,
         lastScaleTime: &time,
         expectedConditions: statusOkWithOverrides(autoscalingv2.HorizontalPodAutoscalerCondition{
             Type: autoscalingv2.AbleToScale,
             Status: v1.ConditionTrue,
             Reason: "ReadyForNewScale",
         }, autoscalingv2.HorizontalPodAutoscalerCondition{
             Type: autoscalingv2.AbleToScale,
             Status: v1.ConditionTrue,
             Reason: "SucceededRescale",
         }, autoscalingv2.HorizontalPodAutoscalerCondition{
             Type: autoscalingv2.ScalingLimited,
             Status: v1.ConditionFalse,
-            Reason: "BackoffBoth",
+            Reason: "DesiredWithinRange",
         }),
     }
     tc.runTest(t)
 }

 func TestNoBackoffUpscaleCMNoBackoffCpu(t *testing.T) {
     time := metav1.Time{Time: time.Now()}
     tc := testCase{
         minReplicas: 1,
         maxReplicas: 5,
         initialReplicas: 3,
         expectedDesiredReplicas: 5,
         CPUTarget: 10,
         metricsTarget: []autoscalingv2.MetricSpec{
             {
                 Type: autoscalingv2.PodsMetricSourceType,
                 Pods: &autoscalingv2.PodsMetricSource{
                     MetricName: "qps",
                     TargetAverageValue: resource.MustParse("15.0"),
                 },
             },
         },
         reportedLevels: []uint64{20000, 10000, 30000},
         reportedCPURequests: []resource.Quantity{resource.MustParse("1.0"), resource.MustParse("1.0"), resource.MustParse("1.0")},
         useMetricsAPI: true,
         lastScaleTime: &time,
         expectedConditions: statusOkWithOverrides(autoscalingv2.HorizontalPodAutoscalerCondition{
             Type: autoscalingv2.AbleToScale,
             Status: v1.ConditionTrue,
             Reason: "ReadyForNewScale",
         }, autoscalingv2.HorizontalPodAutoscalerCondition{
             Type: autoscalingv2.AbleToScale,
             Status: v1.ConditionTrue,
             Reason: "SucceededRescale",
         }, autoscalingv2.HorizontalPodAutoscalerCondition{
             Type: autoscalingv2.ScalingLimited,
             Status: v1.ConditionTrue,
             Reason: "TooManyReplicas",
         }),
     }
     tc.runTest(t)
@@ -491,7 +491,6 @@ func (tc *legacyTestCase) runTest(t *testing.T) {
 }

 informerFactory := informers.NewSharedInformerFactory(testClient, controller.NoResyncPeriodFunc())
-defaultUpscaleForbiddenWindow := 3 * time.Minute
 defaultDownscaleForbiddenWindow := 5 * time.Minute

 hpaController := NewHorizontalController(
@@ -502,7 +501,6 @@ func (tc *legacyTestCase) runTest(t *testing.T) {
     replicaCalc,
     informerFactory.Autoscaling().V1().HorizontalPodAutoscalers(),
     controller.NoResyncPeriodFunc(),
-    defaultUpscaleForbiddenWindow,
     defaultDownscaleForbiddenWindow,
 )
 hpaController.hpaListerSynced = alwaysReady
@@ -35,6 +35,10 @@ const (
     // defaultTestingTolerance is default value for calculating when to
     // scale up/scale down.
     defaultTestingTolerance = 0.1
+
+    // Pod begins existence as unready. If pod is unready and timestamp of last pod readiness change is
+    // less than maxDelayOfInitialReadinessStatus after pod start we assume it has never been ready.
+    maxDelayOfInitialReadinessStatus = 10 * time.Second
 )

 type ReplicaCalculator struct {
@@ -205,7 +209,7 @@ func (c *ReplicaCalculator) calcPlainMetricReplicas(metrics metricsclient.PodMet
 missingPods := sets.NewString()

 for _, pod := range podList.Items {
-    if pod.Status.Phase != v1.PodRunning || !podutil.IsPodReady(&pod) {
+    if pod.Status.Phase != v1.PodRunning || !hasPodBeenReadyBefore(&pod) {
         // save this pod name for later, but pretend it doesn't exist for now
         unreadyPods.Insert(pod.Name)
         delete(metrics, pod.Name)
@@ -381,3 +385,22 @@ func (c *ReplicaCalculator) GetExternalPerPodMetricReplicas(currentReplicas int3
     utilization = int64(math.Ceil(float64(utilization) / float64(currentReplicas)))
     return replicaCount, utilization, timestamp, nil
 }
+
+// hasPodBeenReadyBefore returns true if the pod is ready, or if it is not ready but appears to have been ready in the past.
+func hasPodBeenReadyBefore(pod *v1.Pod) bool {
+    _, readyCondition := podutil.GetPodCondition(&pod.Status, v1.PodReady)
+    if readyCondition == nil {
+        return false
+    }
+    if readyCondition.Status == v1.ConditionTrue {
+        return true
+    }
+    lastReady := readyCondition.LastTransitionTime.Time
+    if pod.Status.StartTime == nil {
+        return false
+    }
+    started := pod.Status.StartTime.Time
+    // If last status change was longer than maxDelayOfInitialReadinessStatus after the pod was
+    // created assume it was ready in the past.
+    return lastReady.After(started.Add(maxDelayOfInitialReadinessStatus))
+}
@@ -1069,4 +1069,76 @@ func TestReplicaCalcComputedToleranceAlgImplementation(t *testing.T) {
     tc.runTest(t)
 }
+
+func TestHasPodBeenReadyBefore(t *testing.T) {
+    tests := []struct {
+        name string
+        conditions []v1.PodCondition
+        started time.Time
+        expected bool
+    }{
+        {
+            "initially unready",
+            []v1.PodCondition{
+                {
+                    Type: v1.PodReady,
+                    LastTransitionTime: metav1.Time{
+                        Time: metav1.Date(2018, 7, 25, 17, 10, 0, 0, time.UTC).Time,
+                    },
+                    Status: v1.ConditionFalse,
+                },
+            },
+            metav1.Date(2018, 7, 25, 17, 10, 0, 0, time.UTC).Time,
+            false,
+        },
+        {
+            "currently unready",
+            []v1.PodCondition{
+                {
+                    Type: v1.PodReady,
+                    LastTransitionTime: metav1.Time{
+                        Time: metav1.Date(2018, 7, 25, 17, 10, 0, 0, time.UTC).Time,
+                    },
+                    Status: v1.ConditionFalse,
+                },
+            },
+            metav1.Date(2018, 7, 25, 17, 0, 0, 0, time.UTC).Time,
+            true,
+        },
+        {
+            "currently ready",
+            []v1.PodCondition{
+                {
+                    Type: v1.PodReady,
+                    LastTransitionTime: metav1.Time{
+                        Time: metav1.Date(2018, 7, 25, 17, 10, 0, 0, time.UTC).Time,
+                    },
+                    Status: v1.ConditionTrue,
+                },
+            },
+            metav1.Date(2018, 7, 25, 17, 10, 0, 0, time.UTC).Time,
+            true,
+        },
+        {
+            "no ready status",
+            []v1.PodCondition{},
+            metav1.Date(2018, 7, 25, 17, 10, 0, 0, time.UTC).Time,
+            false,
+        },
+    }
+    for _, tc := range tests {
+        pod := &v1.Pod{
+            Status: v1.PodStatus{
+                Conditions: tc.conditions,
+                StartTime: &metav1.Time{
+                    Time: tc.started,
+                },
+            },
+        }
+        got := hasPodBeenReadyBefore(pod)
+        if got != tc.expected {
+            t.Errorf("[TestHasPodBeenReadyBefore.%s] got %v, want %v", tc.name, got, tc.expected)
+        }
+    }
+}
+
+// TODO: add more tests