mirror of https://github.com/k3s-io/k3s
Make HPA tolerance a flag
Fix #18155. Make HPA tolerance configurable as a flag. This change allows us to use different tolerance values in production/testing.

Signed-off-by: mattjmcnaughton <mattjmcnaughton@gmail.com>
parent ca712680c7
commit abd46684d4
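The tolerance is a dead band around a desired-to-actual metrics ratio of 1.0: while the ratio stays inside the band, the HPA keeps the current replica count. A minimal illustrative sketch (not part of the commit), mirroring the math.Abs(1.0-usageRatio) <= tolerance comparisons in the hunks below:

    package main

    import (
        "fmt"
        "math"
    )

    // withinTolerance reports whether the desired-to-actual metrics ratio is
    // close enough to 1.0 that the autoscaler should keep the current replicas.
    func withinTolerance(usageRatio, tolerance float64) bool {
        return math.Abs(1.0-usageRatio) <= tolerance
    }

    func main() {
        const tolerance = 0.1                         // the default this commit keeps
        fmt.Println(withinTolerance(1.08, tolerance)) // true: inside the band, no scaling
        fmt.Println(withinTolerance(1.25, tolerance)) // false: far enough from 1.0 to scale
    }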
@@ -64,7 +64,11 @@ func startHPAControllerWithLegacyClient(ctx ControllerContext) (bool, error) {
 func startHPAControllerWithMetricsClient(ctx ControllerContext, metricsClient metrics.MetricsClient) (bool, error) {
     hpaClient := ctx.ClientBuilder.ClientOrDie("horizontal-pod-autoscaler")
-    replicaCalc := podautoscaler.NewReplicaCalculator(metricsClient, hpaClient.Core())
+    replicaCalc := podautoscaler.NewReplicaCalculator(
+        metricsClient,
+        hpaClient.Core(),
+        ctx.Options.HorizontalPodAutoscalerTolerance,
+    )
     go podautoscaler.NewHorizontalController(
         ctx.ClientBuilder.ClientGoClientOrDie("horizontal-pod-autoscaler").Core(),
         hpaClient.Extensions(),
@@ -79,6 +79,7 @@ func NewCMServer() *CMServer {
         HorizontalPodAutoscalerSyncPeriod:               metav1.Duration{Duration: 30 * time.Second},
         HorizontalPodAutoscalerUpscaleForbiddenWindow:   metav1.Duration{Duration: 3 * time.Minute},
         HorizontalPodAutoscalerDownscaleForbiddenWindow: metav1.Duration{Duration: 5 * time.Minute},
+        HorizontalPodAutoscalerTolerance:                0.1,
         DeploymentControllerSyncPeriod:                  metav1.Duration{Duration: 30 * time.Second},
         MinResyncPeriod:                                 metav1.Duration{Duration: 12 * time.Hour},
         RegisterRetryCount:                              10,

@@ -166,6 +167,7 @@ func (s *CMServer) AddFlags(fs *pflag.FlagSet, allControllers []string, disabled
     fs.DurationVar(&s.HorizontalPodAutoscalerSyncPeriod.Duration, "horizontal-pod-autoscaler-sync-period", s.HorizontalPodAutoscalerSyncPeriod.Duration, "The period for syncing the number of pods in horizontal pod autoscaler.")
     fs.DurationVar(&s.HorizontalPodAutoscalerUpscaleForbiddenWindow.Duration, "horizontal-pod-autoscaler-upscale-delay", s.HorizontalPodAutoscalerUpscaleForbiddenWindow.Duration, "The period since last upscale, before another upscale can be performed in horizontal pod autoscaler.")
     fs.DurationVar(&s.HorizontalPodAutoscalerDownscaleForbiddenWindow.Duration, "horizontal-pod-autoscaler-downscale-delay", s.HorizontalPodAutoscalerDownscaleForbiddenWindow.Duration, "The period since last downscale, before another downscale can be performed in horizontal pod autoscaler.")
+    fs.Float64Var(&s.HorizontalPodAutoscalerTolerance, "horizontal-pod-autoscaler-tolerance", s.HorizontalPodAutoscalerTolerance, "The minimum change (from 1.0) in the desired-to-actual metrics ratio for the horizontal pod autoscaler to consider scaling.")
     fs.DurationVar(&s.DeploymentControllerSyncPeriod.Duration, "deployment-controller-sync-period", s.DeploymentControllerSyncPeriod.Duration, "Period for syncing the deployments.")
     fs.DurationVar(&s.PodEvictionTimeout.Duration, "pod-eviction-timeout", s.PodEvictionTimeout.Duration, "The grace period for deleting pods on failed nodes.")
     fs.Float32Var(&s.DeletingPodsQps, "deleting-pods-qps", 0.1, "Number of nodes per second on which pods are deleted in case of node failure.")
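With the flag registered, the 0.1 default can be overridden per environment, which is the point of the change. An illustrative invocation (the value is chosen arbitrarily; the flag name and default come from the hunk above):

    kube-controller-manager --horizontal-pod-autoscaler-tolerance=0.2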
@@ -151,6 +151,7 @@ func TestAddFlags(t *testing.T) {
             NodeMonitorPeriod: metav1.Duration{Duration: 10 * time.Second},
             HorizontalPodAutoscalerUpscaleForbiddenWindow:   metav1.Duration{Duration: 1 * time.Minute},
             HorizontalPodAutoscalerDownscaleForbiddenWindow: metav1.Duration{Duration: 2 * time.Minute},
+            HorizontalPodAutoscalerTolerance:                0.1,
             TerminatedPodGCThreshold: 12000,
             VolumeConfiguration: componentconfig.VolumeConfiguration{
                 EnableDynamicProvisioning: false,
@@ -346,6 +346,9 @@ type KubeControllerManagerConfiguration struct {
     HorizontalPodAutoscalerUpscaleForbiddenWindow metav1.Duration
     // horizontalPodAutoscalerDownscaleForbiddenWindow is a period after which next downscale allowed.
     HorizontalPodAutoscalerDownscaleForbiddenWindow metav1.Duration
+    // horizontalPodAutoscalerTolerance is the tolerance for when
+    // resource usage suggests upscaling/downscaling
+    HorizontalPodAutoscalerTolerance float64
     // deploymentControllerSyncPeriod is the period for syncing the deployments.
     DeploymentControllerSyncPeriod metav1.Duration
     // podEvictionTimeout is the grace period for deleting pods on failed nodes.
@@ -49,10 +49,6 @@ import (
 )

 var (
-    // Usage shoud exceed the tolerance before we start downscale or upscale the pods.
-    // TODO: make it a flag or HPA spec element.
-    tolerance = 0.1
-
     scaleUpLimitFactor  = 2.0
     scaleUpLimitMinimum = 4.0
 )
@@ -579,6 +579,7 @@ func (tc *testCase) setupController(t *testing.T) (*HorizontalController, inform
     replicaCalc := &ReplicaCalculator{
         metricsClient: metricsClient,
         podsGetter:    testClient.Core(),
+        tolerance:     defaultTestingTolerance,
     }

     informerFactory := informers.NewSharedInformerFactory(testClient, controller.NoResyncPeriodFunc())

@@ -1525,7 +1526,7 @@ func TestComputedToleranceAlgImplementation(t *testing.T) {
     perPodRequested := totalRequestedCPUOfAllPods / startPods

     // Force a minimal scaling event by satisfying (tolerance < 1 - resourcesUsedRatio).
-    target := math.Abs(1/(requestedToUsed*(1-tolerance))) + .01
+    target := math.Abs(1/(requestedToUsed*(1-defaultTestingTolerance))) + .01
     finalCPUPercentTarget := int32(target * 100)
     resourcesUsedRatio := float64(totalUsedCPUOfAllPods) / float64(float64(totalRequestedCPUOfAllPods)*target)

@@ -1570,7 +1571,7 @@ func TestComputedToleranceAlgImplementation(t *testing.T) {

     // Reuse the data structure above, now testing "unscaling".
     // Now, we test that no scaling happens if we are in a very close margin to the tolerance
-    target = math.Abs(1/(requestedToUsed*(1-tolerance))) + .004
+    target = math.Abs(1/(requestedToUsed*(1-defaultTestingTolerance))) + .004
     finalCPUPercentTarget = int32(target * 100)
     tc.CPUTarget = finalCPUPercentTarget
     tc.initialReplicas = startPods
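To see what the .01 and .004 offsets accomplish, take illustrative numbers (not from the test itself): suppose requestedToUsed = 2 and tolerance = 0.1. Then target = |1/(2*(1-0.1))| + .01 ≈ 0.5656, which int32(target * 100) truncates to a 56% CPU target; against that target the usage ratio works out to 1/(2*0.56) ≈ 0.893, and |1 - 0.893| ≈ 0.107 > 0.1 sits just outside the tolerance band, so a scaling event fires. The .004 offset instead truncates to 55%, giving a ratio of about 0.909 and a gap of about 0.091 < 0.1, just inside the band, so no scaling occurs.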
@@ -484,6 +484,7 @@ func (tc *legacyTestCase) runTest(t *testing.T) {
     replicaCalc := &ReplicaCalculator{
         metricsClient: metricsClient,
         podsGetter:    testClient.Core(),
+        tolerance:     defaultTestingTolerance,
     }

     informerFactory := informers.NewSharedInformerFactory(testClient, controller.NoResyncPeriodFunc())

@@ -965,7 +966,7 @@ func LegacyTestComputedToleranceAlgImplementation(t *testing.T) {
     perPodRequested := totalRequestedCPUOfAllPods / startPods

     // Force a minimal scaling event by satisfying (tolerance < 1 - resourcesUsedRatio).
-    target := math.Abs(1/(requestedToUsed*(1-tolerance))) + .01
+    target := math.Abs(1/(requestedToUsed*(1-defaultTestingTolerance))) + .01
     finalCPUPercentTarget := int32(target * 100)
     resourcesUsedRatio := float64(totalUsedCPUOfAllPods) / float64(float64(totalRequestedCPUOfAllPods)*target)

@@ -1010,7 +1011,7 @@ func LegacyTestComputedToleranceAlgImplementation(t *testing.T) {

     // Reuse the data structure above, now testing "unscaling".
     // Now, we test that no scaling happens if we are in a very close margin to the tolerance
-    target = math.Abs(1/(requestedToUsed*(1-tolerance))) + .004
+    target = math.Abs(1/(requestedToUsed*(1-defaultTestingTolerance))) + .004
     finalCPUPercentTarget = int32(target * 100)
     tc.CPUTarget = finalCPUPercentTarget
     tc.initialReplicas = startPods
@@ -188,6 +188,7 @@ func (tc *legacyReplicaCalcTestCase) runTest(t *testing.T) {
     replicaCalc := &ReplicaCalculator{
         metricsClient: metricsClient,
         podsGetter:    testClient.Core(),
+        tolerance:     defaultTestingTolerance,
     }

     selector, err := metav1.LabelSelectorAsSelector(&metav1.LabelSelector{

@@ -603,7 +604,7 @@ func LegacyTestReplicaCalcComputedToleranceAlgImplementation(t *testing.T) {
     perPodRequested := totalRequestedCPUOfAllPods / startPods

     // Force a minimal scaling event by satisfying (tolerance < 1 - resourcesUsedRatio).
-    target := math.Abs(1/(requestedToUsed*(1-tolerance))) + .01
+    target := math.Abs(1/(requestedToUsed*(1-defaultTestingTolerance))) + .01
     finalCPUPercentTarget := int32(target * 100)
     resourcesUsedRatio := float64(totalUsedCPUOfAllPods) / float64(float64(totalRequestedCPUOfAllPods)*target)

@@ -651,7 +652,7 @@ func LegacyTestReplicaCalcComputedToleranceAlgImplementation(t *testing.T) {

     // Reuse the data structure above, now testing "unscaling".
     // Now, we test that no scaling happens if we are in a very close margin to the tolerance
-    target = math.Abs(1/(requestedToUsed*(1-tolerance))) + .004
+    target = math.Abs(1/(requestedToUsed*(1-defaultTestingTolerance))) + .004
     finalCPUPercentTarget = int32(target * 100)
     tc.resource.targetUtilization = finalCPUPercentTarget
     tc.currentReplicas = startPods
@@ -31,15 +31,23 @@ import (
     metricsclient "k8s.io/kubernetes/pkg/controller/podautoscaler/metrics"
 )

+const (
+    // defaultTestingTolerance is default value for calculating when to
+    // scale up/scale down.
+    defaultTestingTolerance = 0.1
+)
+
 type ReplicaCalculator struct {
     metricsClient metricsclient.MetricsClient
     podsGetter    v1coreclient.PodsGetter
+    tolerance     float64
 }

-func NewReplicaCalculator(metricsClient metricsclient.MetricsClient, podsGetter v1coreclient.PodsGetter) *ReplicaCalculator {
+func NewReplicaCalculator(metricsClient metricsclient.MetricsClient, podsGetter v1coreclient.PodsGetter, tolerance float64) *ReplicaCalculator {
     return &ReplicaCalculator{
         metricsClient: metricsClient,
         podsGetter:    podsGetter,
+        tolerance:     tolerance,
     }
 }
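Since tolerance is now a required constructor argument, every caller must choose a value explicitly. A hypothetical wiring sketch (not from the commit; the environment switch and the 0.3 value are invented for illustration):

    // newCalculatorForEnv picks a dead band per environment and passes it
    // through the new NewReplicaCalculator parameter.
    func newCalculatorForEnv(metricsClient metricsclient.MetricsClient, podsGetter v1coreclient.PodsGetter, production bool) *ReplicaCalculator {
        tolerance := 0.1 // matches the HorizontalPodAutoscalerTolerance default above
        if !production {
            tolerance = 0.3 // hypothetical: a looser band for a noisy test cluster
        }
        return NewReplicaCalculator(metricsClient, podsGetter, tolerance)
    }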
@@ -105,7 +113,7 @@ func (c *ReplicaCalculator) GetResourceReplicas(currentReplicas int32, targetUti

     rebalanceUnready := len(unreadyPods) > 0 && usageRatio > 1.0
     if !rebalanceUnready && len(missingPods) == 0 {
-        if math.Abs(1.0-usageRatio) <= tolerance {
+        if math.Abs(1.0-usageRatio) <= c.tolerance {
             // return the current replicas if the change would be too small
             return currentReplicas, utilization, rawUtilization, timestamp, nil
         }

@@ -141,7 +149,7 @@ func (c *ReplicaCalculator) GetResourceReplicas(currentReplicas int32, targetUti
         return 0, utilization, rawUtilization, time.Time{}, err
     }

-    if math.Abs(1.0-newUsageRatio) <= tolerance || (usageRatio < 1.0 && newUsageRatio > 1.0) || (usageRatio > 1.0 && newUsageRatio < 1.0) {
+    if math.Abs(1.0-newUsageRatio) <= c.tolerance || (usageRatio < 1.0 && newUsageRatio > 1.0) || (usageRatio > 1.0 && newUsageRatio < 1.0) {
         // return the current replicas if the change would be too small,
         // or if the new usage ratio would cause a change in scale direction
         return currentReplicas, utilization, rawUtilization, timestamp, nil

@@ -218,7 +226,7 @@ func (c *ReplicaCalculator) calcPlainMetricReplicas(metrics metricsclient.PodMet
     rebalanceUnready := len(unreadyPods) > 0 && usageRatio > 1.0

     if !rebalanceUnready && len(missingPods) == 0 {
-        if math.Abs(1.0-usageRatio) <= tolerance {
+        if math.Abs(1.0-usageRatio) <= c.tolerance {
             // return the current replicas if the change would be too small
             return currentReplicas, utilization, nil
         }

@@ -251,7 +259,7 @@ func (c *ReplicaCalculator) calcPlainMetricReplicas(metrics metricsclient.PodMet
     // re-run the utilization calculation with our new numbers
     newUsageRatio, _ := metricsclient.GetMetricUtilizationRatio(metrics, targetUtilization)

-    if math.Abs(1.0-newUsageRatio) <= tolerance || (usageRatio < 1.0 && newUsageRatio > 1.0) || (usageRatio > 1.0 && newUsageRatio < 1.0) {
+    if math.Abs(1.0-newUsageRatio) <= c.tolerance || (usageRatio < 1.0 && newUsageRatio > 1.0) || (usageRatio > 1.0 && newUsageRatio < 1.0) {
         // return the current replicas if the change would be too small,
         // or if the new usage ratio would cause a change in scale direction
         return currentReplicas, utilization, nil

@@ -271,7 +279,7 @@ func (c *ReplicaCalculator) GetObjectMetricReplicas(currentReplicas int32, targe
     }

     usageRatio := float64(utilization) / float64(targetUtilization)
-    if math.Abs(1.0-usageRatio) <= tolerance {
+    if math.Abs(1.0-usageRatio) <= c.tolerance {
         // return the current replicas if the change would be too small
         return currentReplicas, utilization, timestamp, nil
     }
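Note the second gate in the rebalancing paths above: after unready/missing pods are accounted for, scaling is also skipped when the recomputed ratio would flip the scale direction. As a standalone predicate (an illustrative restatement of the condition in the diff, assuming a `math` import):

    // keepCurrentReplicas is true when the new ratio is back inside the dead
    // band, or when it points the opposite way from the original ratio.
    func keepCurrentReplicas(usageRatio, newUsageRatio, tolerance float64) bool {
        return math.Abs(1.0-newUsageRatio) <= tolerance ||
            (usageRatio < 1.0 && newUsageRatio > 1.0) ||
            (usageRatio > 1.0 && newUsageRatio < 1.0)
    }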
@@ -247,6 +247,7 @@ func (tc *replicaCalcTestCase) runTest(t *testing.T) {
     replicaCalc := &ReplicaCalculator{
         metricsClient: metricsClient,
         podsGetter:    testClient.Core(),
+        tolerance:     defaultTestingTolerance,
     }

     selector, err := metav1.LabelSelectorAsSelector(&metav1.LabelSelector{

@@ -727,7 +728,7 @@ func TestReplicaCalcComputedToleranceAlgImplementation(t *testing.T) {
     perPodRequested := totalRequestedCPUOfAllPods / startPods

     // Force a minimal scaling event by satisfying (tolerance < 1 - resourcesUsedRatio).
-    target := math.Abs(1/(requestedToUsed*(1-tolerance))) + .01
+    target := math.Abs(1/(requestedToUsed*(1-defaultTestingTolerance))) + .01
     finalCPUPercentTarget := int32(target * 100)
     resourcesUsedRatio := float64(totalUsedCPUOfAllPods) / float64(float64(totalRequestedCPUOfAllPods)*target)

@@ -775,7 +776,7 @@ func TestReplicaCalcComputedToleranceAlgImplementation(t *testing.T) {

     // Reuse the data structure above, now testing "unscaling".
     // Now, we test that no scaling happens if we are in a very close margin to the tolerance
-    target = math.Abs(1/(requestedToUsed*(1-tolerance))) + .004
+    target = math.Abs(1/(requestedToUsed*(1-defaultTestingTolerance))) + .004
     finalCPUPercentTarget = int32(target * 100)
     tc.resource.targetUtilization = finalCPUPercentTarget
     tc.currentReplicas = startPods