From abd46684d451bb3ef2b542971ea4e48ab9c221c9 Mon Sep 17 00:00:00 2001
From: mattjmcnaughton
Date: Mon, 11 Sep 2017 09:59:53 -0400
Subject: [PATCH] Make HPA tolerance a flag

Fix #18155

Make HPA tolerance configurable as a flag. This change allows us to use
different tolerance values in production/testing.

Signed-off-by: mattjmcnaughton
---
 .../app/autoscaling.go                       |  6 +++++-
 .../app/options/options.go                   |  2 ++
 .../app/options/options_test.go              |  1 +
 pkg/apis/componentconfig/types.go            |  3 +++
 pkg/controller/podautoscaler/horizontal.go   |  4 ----
 .../podautoscaler/horizontal_test.go         |  5 +++--
 .../podautoscaler/legacy_horizontal_test.go  |  5 +++--
 .../legacy_replica_calculator_test.go        |  5 +++--
 .../podautoscaler/replica_calculator.go      | 20 +++++++++++++------
 .../podautoscaler/replica_calculator_test.go |  5 +++--
 10 files changed, 37 insertions(+), 19 deletions(-)

diff --git a/cmd/kube-controller-manager/app/autoscaling.go b/cmd/kube-controller-manager/app/autoscaling.go
index 2a7d83f1db..5af911d7ad 100644
--- a/cmd/kube-controller-manager/app/autoscaling.go
+++ b/cmd/kube-controller-manager/app/autoscaling.go
@@ -64,7 +64,11 @@ func startHPAControllerWithLegacyClient(ctx ControllerContext) (bool, error) {
 
 func startHPAControllerWithMetricsClient(ctx ControllerContext, metricsClient metrics.MetricsClient) (bool, error) {
 	hpaClient := ctx.ClientBuilder.ClientOrDie("horizontal-pod-autoscaler")
-	replicaCalc := podautoscaler.NewReplicaCalculator(metricsClient, hpaClient.Core())
+	replicaCalc := podautoscaler.NewReplicaCalculator(
+		metricsClient,
+		hpaClient.Core(),
+		ctx.Options.HorizontalPodAutoscalerTolerance,
+	)
 	go podautoscaler.NewHorizontalController(
 		ctx.ClientBuilder.ClientGoClientOrDie("horizontal-pod-autoscaler").Core(),
 		hpaClient.Extensions(),
diff --git a/cmd/kube-controller-manager/app/options/options.go b/cmd/kube-controller-manager/app/options/options.go
index 12f6264dfb..2712102c55 100644
--- a/cmd/kube-controller-manager/app/options/options.go
+++ b/cmd/kube-controller-manager/app/options/options.go
@@ -79,6 +79,7 @@ func NewCMServer() *CMServer {
 			HorizontalPodAutoscalerSyncPeriod:               metav1.Duration{Duration: 30 * time.Second},
 			HorizontalPodAutoscalerUpscaleForbiddenWindow:   metav1.Duration{Duration: 3 * time.Minute},
 			HorizontalPodAutoscalerDownscaleForbiddenWindow: metav1.Duration{Duration: 5 * time.Minute},
+			HorizontalPodAutoscalerTolerance:                0.1,
 			DeploymentControllerSyncPeriod:                  metav1.Duration{Duration: 30 * time.Second},
 			MinResyncPeriod:                                 metav1.Duration{Duration: 12 * time.Hour},
 			RegisterRetryCount:                              10,
@@ -166,6 +167,7 @@ func (s *CMServer) AddFlags(fs *pflag.FlagSet, allControllers []string, disabled
 	fs.DurationVar(&s.HorizontalPodAutoscalerSyncPeriod.Duration, "horizontal-pod-autoscaler-sync-period", s.HorizontalPodAutoscalerSyncPeriod.Duration, "The period for syncing the number of pods in horizontal pod autoscaler.")
 	fs.DurationVar(&s.HorizontalPodAutoscalerUpscaleForbiddenWindow.Duration, "horizontal-pod-autoscaler-upscale-delay", s.HorizontalPodAutoscalerUpscaleForbiddenWindow.Duration, "The period since last upscale, before another upscale can be performed in horizontal pod autoscaler.")
 	fs.DurationVar(&s.HorizontalPodAutoscalerDownscaleForbiddenWindow.Duration, "horizontal-pod-autoscaler-downscale-delay", s.HorizontalPodAutoscalerDownscaleForbiddenWindow.Duration, "The period since last downscale, before another downscale can be performed in horizontal pod autoscaler.")
+	fs.Float64Var(&s.HorizontalPodAutoscalerTolerance, "horizontal-pod-autoscaler-tolerance", s.HorizontalPodAutoscalerTolerance, "The minimum change (from 1.0) in the desired-to-actual metrics ratio for the horizontal pod autoscaler to consider scaling.")
 	fs.DurationVar(&s.DeploymentControllerSyncPeriod.Duration, "deployment-controller-sync-period", s.DeploymentControllerSyncPeriod.Duration, "Period for syncing the deployments.")
 	fs.DurationVar(&s.PodEvictionTimeout.Duration, "pod-eviction-timeout", s.PodEvictionTimeout.Duration, "The grace period for deleting pods on failed nodes.")
 	fs.Float32Var(&s.DeletingPodsQps, "deleting-pods-qps", 0.1, "Number of nodes per second on which pods are deleted in case of node failure.")
diff --git a/cmd/kube-controller-manager/app/options/options_test.go b/cmd/kube-controller-manager/app/options/options_test.go
index ce5beff0ef..da4630f0bd 100644
--- a/cmd/kube-controller-manager/app/options/options_test.go
+++ b/cmd/kube-controller-manager/app/options/options_test.go
@@ -151,6 +151,7 @@ func TestAddFlags(t *testing.T) {
 			NodeMonitorPeriod:    metav1.Duration{Duration: 10 * time.Second},
 			HorizontalPodAutoscalerUpscaleForbiddenWindow:   metav1.Duration{Duration: 1 * time.Minute},
 			HorizontalPodAutoscalerDownscaleForbiddenWindow: metav1.Duration{Duration: 2 * time.Minute},
+			HorizontalPodAutoscalerTolerance:                0.1,
 			TerminatedPodGCThreshold:                        12000,
 			VolumeConfiguration: componentconfig.VolumeConfiguration{
 				EnableDynamicProvisioning: false,
diff --git a/pkg/apis/componentconfig/types.go b/pkg/apis/componentconfig/types.go
index 078fa11d00..6aa64e436d 100644
--- a/pkg/apis/componentconfig/types.go
+++ b/pkg/apis/componentconfig/types.go
@@ -346,6 +346,9 @@ type KubeControllerManagerConfiguration struct {
 	HorizontalPodAutoscalerUpscaleForbiddenWindow metav1.Duration
 	// horizontalPodAutoscalerDownscaleForbiddenWindow is a period after which next downscale allowed.
 	HorizontalPodAutoscalerDownscaleForbiddenWindow metav1.Duration
+	// horizontalPodAutoscalerTolerance is the tolerance for when
+	// resource usage suggests upscaling/downscaling
+	HorizontalPodAutoscalerTolerance float64
 	// deploymentControllerSyncPeriod is the period for syncing the deployments.
 	DeploymentControllerSyncPeriod metav1.Duration
 	// podEvictionTimeout is the grace period for deleting pods on failed nodes.
diff --git a/pkg/controller/podautoscaler/horizontal.go b/pkg/controller/podautoscaler/horizontal.go
index 6c02a72c8e..e532309f5c 100644
--- a/pkg/controller/podautoscaler/horizontal.go
+++ b/pkg/controller/podautoscaler/horizontal.go
@@ -49,10 +49,6 @@ import (
 )
 
 var (
-	// Usage shoud exceed the tolerance before we start downscale or upscale the pods.
-	// TODO: make it a flag or HPA spec element.
-	tolerance = 0.1
-
 	scaleUpLimitFactor  = 2.0
 	scaleUpLimitMinimum = 4.0
 )
diff --git a/pkg/controller/podautoscaler/horizontal_test.go b/pkg/controller/podautoscaler/horizontal_test.go
index 98a48ea4f2..8f6d3d92ea 100644
--- a/pkg/controller/podautoscaler/horizontal_test.go
+++ b/pkg/controller/podautoscaler/horizontal_test.go
@@ -579,6 +579,7 @@ func (tc *testCase) setupController(t *testing.T) (*HorizontalController, inform
 	replicaCalc := &ReplicaCalculator{
 		metricsClient: metricsClient,
 		podsGetter:    testClient.Core(),
+		tolerance:     defaultTestingTolerance,
 	}
 
 	informerFactory := informers.NewSharedInformerFactory(testClient, controller.NoResyncPeriodFunc())
@@ -1525,7 +1526,7 @@ func TestComputedToleranceAlgImplementation(t *testing.T) {
 	perPodRequested := totalRequestedCPUOfAllPods / startPods
 
 	// Force a minimal scaling event by satisfying (tolerance < 1 - resourcesUsedRatio).
-	target := math.Abs(1/(requestedToUsed*(1-tolerance))) + .01
+	target := math.Abs(1/(requestedToUsed*(1-defaultTestingTolerance))) + .01
 	finalCPUPercentTarget := int32(target * 100)
 	resourcesUsedRatio := float64(totalUsedCPUOfAllPods) / float64(float64(totalRequestedCPUOfAllPods)*target)
 
@@ -1570,7 +1571,7 @@ func TestComputedToleranceAlgImplementation(t *testing.T) {
 
 	// Reuse the data structure above, now testing "unscaling".
 	// Now, we test that no scaling happens if we are in a very close margin to the tolerance
-	target = math.Abs(1/(requestedToUsed*(1-tolerance))) + .004
+	target = math.Abs(1/(requestedToUsed*(1-defaultTestingTolerance))) + .004
 	finalCPUPercentTarget = int32(target * 100)
 	tc.CPUTarget = finalCPUPercentTarget
 	tc.initialReplicas = startPods
diff --git a/pkg/controller/podautoscaler/legacy_horizontal_test.go b/pkg/controller/podautoscaler/legacy_horizontal_test.go
index d1fb896f70..e8efb5d1fa 100644
--- a/pkg/controller/podautoscaler/legacy_horizontal_test.go
+++ b/pkg/controller/podautoscaler/legacy_horizontal_test.go
@@ -484,6 +484,7 @@ func (tc *legacyTestCase) runTest(t *testing.T) {
 	replicaCalc := &ReplicaCalculator{
 		metricsClient: metricsClient,
 		podsGetter:    testClient.Core(),
+		tolerance:     defaultTestingTolerance,
 	}
 
 	informerFactory := informers.NewSharedInformerFactory(testClient, controller.NoResyncPeriodFunc())
@@ -965,7 +966,7 @@ func LegacyTestComputedToleranceAlgImplementation(t *testing.T) {
 	perPodRequested := totalRequestedCPUOfAllPods / startPods
 
 	// Force a minimal scaling event by satisfying (tolerance < 1 - resourcesUsedRatio).
-	target := math.Abs(1/(requestedToUsed*(1-tolerance))) + .01
+	target := math.Abs(1/(requestedToUsed*(1-defaultTestingTolerance))) + .01
 	finalCPUPercentTarget := int32(target * 100)
 	resourcesUsedRatio := float64(totalUsedCPUOfAllPods) / float64(float64(totalRequestedCPUOfAllPods)*target)
 
@@ -1010,7 +1011,7 @@ func LegacyTestComputedToleranceAlgImplementation(t *testing.T) {
 
 	// Reuse the data structure above, now testing "unscaling".
 	// Now, we test that no scaling happens if we are in a very close margin to the tolerance
-	target = math.Abs(1/(requestedToUsed*(1-tolerance))) + .004
+	target = math.Abs(1/(requestedToUsed*(1-defaultTestingTolerance))) + .004
 	finalCPUPercentTarget = int32(target * 100)
 	tc.CPUTarget = finalCPUPercentTarget
 	tc.initialReplicas = startPods
diff --git a/pkg/controller/podautoscaler/legacy_replica_calculator_test.go b/pkg/controller/podautoscaler/legacy_replica_calculator_test.go
index dfec6f9649..85cc30d35a 100644
--- a/pkg/controller/podautoscaler/legacy_replica_calculator_test.go
+++ b/pkg/controller/podautoscaler/legacy_replica_calculator_test.go
@@ -188,6 +188,7 @@ func (tc *legacyReplicaCalcTestCase) runTest(t *testing.T) {
 	replicaCalc := &ReplicaCalculator{
 		metricsClient: metricsClient,
 		podsGetter:    testClient.Core(),
+		tolerance:     defaultTestingTolerance,
 	}
 
 	selector, err := metav1.LabelSelectorAsSelector(&metav1.LabelSelector{
@@ -603,7 +604,7 @@ func LegacyTestReplicaCalcComputedToleranceAlgImplementation(t *testing.T) {
 	perPodRequested := totalRequestedCPUOfAllPods / startPods
 
 	// Force a minimal scaling event by satisfying (tolerance < 1 - resourcesUsedRatio).
-	target := math.Abs(1/(requestedToUsed*(1-tolerance))) + .01
+	target := math.Abs(1/(requestedToUsed*(1-defaultTestingTolerance))) + .01
 	finalCPUPercentTarget := int32(target * 100)
 	resourcesUsedRatio := float64(totalUsedCPUOfAllPods) / float64(float64(totalRequestedCPUOfAllPods)*target)
 
@@ -651,7 +652,7 @@ func LegacyTestReplicaCalcComputedToleranceAlgImplementation(t *testing.T) {
 
 	// Reuse the data structure above, now testing "unscaling".
 	// Now, we test that no scaling happens if we are in a very close margin to the tolerance
-	target = math.Abs(1/(requestedToUsed*(1-tolerance))) + .004
+	target = math.Abs(1/(requestedToUsed*(1-defaultTestingTolerance))) + .004
 	finalCPUPercentTarget = int32(target * 100)
 	tc.resource.targetUtilization = finalCPUPercentTarget
 	tc.currentReplicas = startPods
diff --git a/pkg/controller/podautoscaler/replica_calculator.go b/pkg/controller/podautoscaler/replica_calculator.go
index ee239198aa..62f863c3b1 100644
--- a/pkg/controller/podautoscaler/replica_calculator.go
+++ b/pkg/controller/podautoscaler/replica_calculator.go
@@ -31,15 +31,23 @@ import (
 	metricsclient "k8s.io/kubernetes/pkg/controller/podautoscaler/metrics"
 )
 
+const (
+	// defaultTestingTolerance is the default value for calculating when to
+	// scale up/scale down.
+	defaultTestingTolerance = 0.1
+)
+
 type ReplicaCalculator struct {
 	metricsClient metricsclient.MetricsClient
 	podsGetter    v1coreclient.PodsGetter
+	tolerance     float64
 }
 
-func NewReplicaCalculator(metricsClient metricsclient.MetricsClient, podsGetter v1coreclient.PodsGetter) *ReplicaCalculator {
+func NewReplicaCalculator(metricsClient metricsclient.MetricsClient, podsGetter v1coreclient.PodsGetter, tolerance float64) *ReplicaCalculator {
 	return &ReplicaCalculator{
 		metricsClient: metricsClient,
 		podsGetter:    podsGetter,
+		tolerance:     tolerance,
 	}
 }
 
@@ -105,7 +113,7 @@ func (c *ReplicaCalculator) GetResourceReplicas(currentReplicas int32, targetUti
 	rebalanceUnready := len(unreadyPods) > 0 && usageRatio > 1.0
 
 	if !rebalanceUnready && len(missingPods) == 0 {
-		if math.Abs(1.0-usageRatio) <= tolerance {
+		if math.Abs(1.0-usageRatio) <= c.tolerance {
 			// return the current replicas if the change would be too small
 			return currentReplicas, utilization, rawUtilization, timestamp, nil
 		}
@@ -141,7 +149,7 @@ func (c *ReplicaCalculator) GetResourceReplicas(currentReplicas int32, targetUti
 		return 0, utilization, rawUtilization, time.Time{}, err
 	}
 
-	if math.Abs(1.0-newUsageRatio) <= tolerance || (usageRatio < 1.0 && newUsageRatio > 1.0) || (usageRatio > 1.0 && newUsageRatio < 1.0) {
+	if math.Abs(1.0-newUsageRatio) <= c.tolerance || (usageRatio < 1.0 && newUsageRatio > 1.0) || (usageRatio > 1.0 && newUsageRatio < 1.0) {
 		// return the current replicas if the change would be too small,
 		// or if the new usage ratio would cause a change in scale direction
 		return currentReplicas, utilization, rawUtilization, timestamp, nil
@@ -218,7 +226,7 @@ func (c *ReplicaCalculator) calcPlainMetricReplicas(metrics metricsclient.PodMet
 	rebalanceUnready := len(unreadyPods) > 0 && usageRatio > 1.0
 
 	if !rebalanceUnready && len(missingPods) == 0 {
-		if math.Abs(1.0-usageRatio) <= tolerance {
+		if math.Abs(1.0-usageRatio) <= c.tolerance {
 			// return the current replicas if the change would be too small
 			return currentReplicas, utilization, nil
 		}
@@ -251,7 +259,7 @@ func (c *ReplicaCalculator) calcPlainMetricReplicas(metrics metricsclient.PodMet
 	// re-run the utilization calculation with our new numbers
 	newUsageRatio, _ := metricsclient.GetMetricUtilizationRatio(metrics, targetUtilization)
 
-	if math.Abs(1.0-newUsageRatio) <= tolerance || (usageRatio < 1.0 && newUsageRatio > 1.0) || (usageRatio > 1.0 && newUsageRatio < 1.0) {
+	if math.Abs(1.0-newUsageRatio) <= c.tolerance || (usageRatio < 1.0 && newUsageRatio > 1.0) || (usageRatio > 1.0 && newUsageRatio < 1.0) {
 		// return the current replicas if the change would be too small,
 		// or if the new usage ratio would cause a change in scale direction
 		return currentReplicas, utilization, nil
@@ -271,7 +279,7 @@ func (c *ReplicaCalculator) GetObjectMetricReplicas(currentReplicas int32, targe
 	}
 
 	usageRatio := float64(utilization) / float64(targetUtilization)
-	if math.Abs(1.0-usageRatio) <= tolerance {
+	if math.Abs(1.0-usageRatio) <= c.tolerance {
 		// return the current replicas if the change would be too small
 		return currentReplicas, utilization, timestamp, nil
 	}
diff --git a/pkg/controller/podautoscaler/replica_calculator_test.go b/pkg/controller/podautoscaler/replica_calculator_test.go
index 764ae6fe0d..3353adbe57 100644
--- a/pkg/controller/podautoscaler/replica_calculator_test.go
+++ b/pkg/controller/podautoscaler/replica_calculator_test.go
@@ -247,6 +247,7 @@ func (tc *replicaCalcTestCase) runTest(t *testing.T) {
 	replicaCalc := &ReplicaCalculator{
 		metricsClient: metricsClient,
 		podsGetter:    testClient.Core(),
+		tolerance:     defaultTestingTolerance,
 	}
 
 	selector, err := metav1.LabelSelectorAsSelector(&metav1.LabelSelector{
@@ -727,7 +728,7 @@ func TestReplicaCalcComputedToleranceAlgImplementation(t *testing.T) {
 	perPodRequested := totalRequestedCPUOfAllPods / startPods
 
 	// Force a minimal scaling event by satisfying (tolerance < 1 - resourcesUsedRatio).
-	target := math.Abs(1/(requestedToUsed*(1-tolerance))) + .01
+	target := math.Abs(1/(requestedToUsed*(1-defaultTestingTolerance))) + .01
 	finalCPUPercentTarget := int32(target * 100)
 	resourcesUsedRatio := float64(totalUsedCPUOfAllPods) / float64(float64(totalRequestedCPUOfAllPods)*target)
 
@@ -775,7 +776,7 @@ func TestReplicaCalcComputedToleranceAlgImplementation(t *testing.T) {
 
 	// Reuse the data structure above, now testing "unscaling".
 	// Now, we test that no scaling happens if we are in a very close margin to the tolerance
-	target = math.Abs(1/(requestedToUsed*(1-tolerance))) + .004
+	target = math.Abs(1/(requestedToUsed*(1-defaultTestingTolerance))) + .004
 	finalCPUPercentTarget = int32(target * 100)
 	tc.resource.targetUtilization = finalCPUPercentTarget
 	tc.currentReplicas = startPods
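
Note for reviewers (not part of the patch): the sketch below illustrates how the
tolerance gates a scaling decision. The tolerance check mirrors
math.Abs(1.0-usageRatio) <= c.tolerance from replica_calculator.go above; the
desiredReplicas helper, the ceil-based replica formula, and the example numbers
are illustrative assumptions, not the controller's exact code, which also
handles unready and missing pods.

package main

import (
	"fmt"
	"math"
)

// desiredReplicas is a simplified sketch: usageRatio is
// currentUtilization / targetUtilization. Ratios within the tolerance band
// around 1.0 keep the current replica count; outside the band the count is
// scaled roughly in proportion to the ratio and rounded up.
func desiredReplicas(currentReplicas int32, usageRatio, tolerance float64) int32 {
	if math.Abs(1.0-usageRatio) <= tolerance {
		// change would be too small; keep the current scale
		return currentReplicas
	}
	return int32(math.Ceil(usageRatio * float64(currentReplicas)))
}

func main() {
	fmt.Println(desiredReplicas(4, 1.08, 0.1))  // 4: within the default 0.1 tolerance
	fmt.Println(desiredReplicas(4, 1.08, 0.05)) // 5: a tighter tolerance triggers a scale-up
	fmt.Println(desiredReplicas(4, 1.50, 0.1))  // 6: well outside the tolerance band
}

With this patch the band becomes configurable per cluster via the new
kube-controller-manager flag, e.g. --horizontal-pod-autoscaler-tolerance=0.05,
while the default stays 0.1, matching the previously hard-coded value.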