mirror of https://github.com/k3s-io/k3s
change sort function of scheduling queue to avoid starvation when unschedulable pods are in the queue
When starvation happens:
- a lot of unschedulable pods exist at the head of the queue
- because condition.LastTransitionTime is updated only when condition.Status changes
  (this means that once a pod is marked unschedulable, the field is never updated until the pod is successfully scheduled)

What was changed:
- condition.LastProbeTime is now updated every time a pod is determined unschedulable
- the sort function now uses LastProbeTime, which avoids the starvation described above

Consideration:
- This change increases load on the k8s API server, because the scheduler now updates Pod.status every time it determines a pod to be unschedulable.

Signed-off-by: Shingo Omura <everpeace@gmail.com>

pull/564/head
parent f83bb59047
commit 22079a79d4
@@ -230,7 +230,10 @@ func podTimestamp(pod *v1.Pod) *metav1.Time {
 	if condition == nil {
 		return &pod.CreationTimestamp
 	}
-	return &condition.LastTransitionTime
+	if condition.LastProbeTime.IsZero() {
+		return &condition.LastTransitionTime
+	}
+	return &condition.LastProbeTime
 }
 
 // activeQComp is the function used by the activeQ heap algorithm to sort pods.
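For context, activeQComp (referenced in the context line above) orders the active queue by pod priority and, for equal priorities, by the timestamp returned from podTimestamp. The following self-contained sketch mirrors that ordering rule with plain Go types rather than the real v1.Pod/metav1.Time API (the fakePod struct and its field names are illustrative only, not the upstream code); it shows why refreshing the probe time on every failed attempt pushes a repeatedly unschedulable pod behind newer, not-yet-tried pods.

package main

import (
	"fmt"
	"time"
)

// fakePod is an illustrative stand-in for *v1.Pod; it only carries the
// fields that matter for the ordering rule.
type fakePod struct {
	name               string
	priority           int32
	creationTimestamp  time.Time
	lastTransitionTime time.Time // set when the pod first becomes unschedulable
	lastProbeTime      time.Time // refreshed on every failed scheduling attempt
}

// podTimestamp mirrors the patched helper above: prefer lastProbeTime when
// set, otherwise fall back to lastTransitionTime, otherwise creationTimestamp.
func podTimestamp(p *fakePod) time.Time {
	if p.lastTransitionTime.IsZero() && p.lastProbeTime.IsZero() {
		return p.creationTimestamp
	}
	if p.lastProbeTime.IsZero() {
		return p.lastTransitionTime
	}
	return p.lastProbeTime
}

// less mirrors the shape of the activeQ comparator: higher priority first,
// then the pod with the older timestamp.
func less(a, b *fakePod) bool {
	if a.priority != b.priority {
		return a.priority > b.priority
	}
	return podTimestamp(a).Before(podTimestamp(b))
}

func main() {
	now := time.Now()
	stale := &fakePod{
		name:               "repeatedly-unschedulable",
		priority:           1,
		creationTimestamp:  now.Add(-10 * time.Minute),
		lastTransitionTime: now.Add(-10 * time.Minute),
		lastProbeTime:      now, // just failed another scheduling attempt
	}
	newer := &fakePod{
		name:              "newer-untried",
		priority:          1,
		creationTimestamp: now.Add(-1 * time.Minute),
	}
	// With lastProbeTime refreshed on each failure, the newer pod sorts ahead
	// of the pod that keeps failing; with only lastTransitionTime it would not.
	fmt.Printf("%s goes first: %v\n", newer.name, less(newer, stale))
}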
@@ -572,10 +572,11 @@ func TestRecentlyTriedPodsGoBack(t *testing.T) {
 	}
 	// Update pod condition to unschedulable.
 	podutil.UpdatePodCondition(&p1.Status, &v1.PodCondition{
 		Type:          v1.PodScheduled,
 		Status:        v1.ConditionFalse,
 		Reason:        v1.PodReasonUnschedulable,
 		Message:       "fake scheduling failure",
+		LastProbeTime: metav1.Now(),
 	})
 	// Put in the unschedulable queue.
 	q.AddUnschedulableIfNotPresent(p1)
@@ -594,6 +595,98 @@ func TestRecentlyTriedPodsGoBack(t *testing.T) {
 	}
 }
 
+// TestPodFailedSchedulingMultipleTimesDoesNotBlockNewerPod tests
+// that a pod determined as unschedulable multiple times doesn't block any newer pod.
+// This behavior ensures that an unschedulable pod does not block the head of the queue
+// when there are frequent events that move pods to the active queue.
+func TestPodFailedSchedulingMultipleTimesDoesNotBlockNewerPod(t *testing.T) {
+	q := NewPriorityQueue(nil)
+
+	// Add an unschedulable pod to a priority queue.
+	// This simulates a pod that has already been tried and determined unschedulable.
+	unschedulablePod := v1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      "test-pod-unscheduled",
+			Namespace: "ns1",
+			UID:       "tp001",
+		},
+		Spec: v1.PodSpec{
+			Priority: &highPriority,
+		},
+		Status: v1.PodStatus{
+			NominatedNodeName: "node1",
+		},
+	}
+	// Update pod condition to unschedulable.
+	podutil.UpdatePodCondition(&unschedulablePod.Status, &v1.PodCondition{
+		Type:          v1.PodScheduled,
+		Status:        v1.ConditionFalse,
+		Reason:        v1.PodReasonUnschedulable,
+		Message:       "fake scheduling failure",
+		LastProbeTime: metav1.Now(),
+	})
+	// Put it in the unschedulable queue.
+	q.AddUnschedulableIfNotPresent(&unschedulablePod)
+	// Clear its backoff to simulate backoff expiration.
+	q.clearPodBackoff(&unschedulablePod)
+	// Move all unschedulable pods to the active queue.
+	q.MoveAllToActiveQueue()
+
+	// Simulate a pod being popped by the scheduler.
+	// At this time, the unschedulable pod should be popped.
+	p1, err := q.Pop()
+	if err != nil {
+		t.Errorf("Error while popping the head of the queue: %v", err)
+	}
+	if p1 != &unschedulablePod {
+		t.Errorf("Expected that test-pod-unscheduled was popped, got %v", p1.Name)
+	}
+
+	// Assume a newer pod was added just after the unschedulable pod
+	// was popped and before it was pushed back to the queue.
+	newerPod := v1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:              "test-newer-pod",
+			Namespace:         "ns1",
+			UID:               "tp002",
+			CreationTimestamp: metav1.Now(),
+		},
+		Spec: v1.PodSpec{
+			Priority: &highPriority,
+		},
+		Status: v1.PodStatus{
+			NominatedNodeName: "node1",
+		},
+	}
+	q.Add(&newerPod)
+
+	// And then unschedulablePod is determined as unschedulable AGAIN.
+	podutil.UpdatePodCondition(&unschedulablePod.Status, &v1.PodCondition{
+		Type:          v1.PodScheduled,
+		Status:        v1.ConditionFalse,
+		Reason:        v1.PodReasonUnschedulable,
+		Message:       "fake scheduling failure",
+		LastProbeTime: metav1.Now(),
+	})
+	// And then, put the unschedulable pod back into the unschedulable queue.
+	q.AddUnschedulableIfNotPresent(&unschedulablePod)
+	// Clear its backoff to simulate backoff expiration.
+	q.clearPodBackoff(&unschedulablePod)
+	// Move all unschedulable pods to the active queue.
+	q.MoveAllToActiveQueue()
+
+	// At this time, newerPod should be popped
+	// because it is the oldest tried pod.
+	p2, err2 := q.Pop()
+	if err2 != nil {
+		t.Errorf("Error while popping the head of the queue: %v", err2)
+	}
+	if p2 != &newerPod {
+		t.Errorf("Expected that test-newer-pod was popped, got %v", p2.Name)
+	}
+}
+
 // TestHighPriorityBackoff tests that a high priority pod does not block
 // other pods if it is unschedulable
 func TestHighProirotyBackoff(t *testing.T) {
@@ -282,10 +282,11 @@ func (sched *Scheduler) recordSchedulingFailure(pod *v1.Pod, err error, reason s
 	sched.config.Error(pod, err)
 	sched.config.Recorder.Event(pod, v1.EventTypeWarning, "FailedScheduling", message)
 	sched.config.PodConditionUpdater.Update(pod, &v1.PodCondition{
 		Type:          v1.PodScheduled,
 		Status:        v1.ConditionFalse,
-		Reason:        reason,
-		Message:       err.Error(),
+		LastProbeTime: metav1.Now(),
+		Reason:        reason,
+		Message:       err.Error(),
 	})
 }
 