mirror of https://github.com/k3s-io/k3s
Merge pull request #58990 from bsalamat/nominated_node
Automatic merge from submit-queue. If you want to cherry-pick this change to another branch, please follow the instructions <a href="https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md">here</a>. Add NominatedNodeName field to PodStatus **What this PR does / why we need it**: Today, the scheduler uses an annotation called "nominated-node-name" to mark a preemptor Pod. This annotation helps the scheduler know about the Pods that are destined to run on the nodes so that the resources made available by preemption are not allocated to a different Pod. In a recent discussion with @bgrant0607, we learned that we should change the annotation to a field as this field can be used by multiple schedulers and other components that may make scheduling-related decisions (descheduler, auto-scaler, kube-arbitrator, ...). **Which issue(s) this PR fixes** *(optional, in `fixes #<issue number>(, fixes #<issue_number>, ...)` format, will close the issue(s) when PR gets merged)*: ref #57471 **Special notes for your reviewer**: **Release note**: ```release-note Add "nominatedNodeName" field to PodStatus. This field is set when a pod preempts other pods on the node. ``` /sig scheduling pull/6/head
commit
21387af0b4
|
@ -79086,6 +79086,10 @@
|
|||
"description": "A human readable message indicating details about why the pod is in this condition.",
|
||||
"type": "string"
|
||||
},
|
||||
"nominatedNodeName": {
|
||||
"description": "nominatedNodeName is set only when this pod preempts other pods on the node, but it cannot be scheduled right away as preemption victims receive their graceful termination periods. This field does not guarantee that the pod will be scheduled on this node. Scheduler may decide to place the pod elsewhere if other nodes become available sooner. Scheduler may also decide to give the resources on this node to a higher priority pod that is created after preemption. As a result, this field may be different than PodSpec.nodeName when the pod is scheduled.",
|
||||
"type": "string"
|
||||
},
|
||||
"phase": {
|
||||
"description": "Current condition of the pod. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#pod-phase",
|
||||
"type": "string"
|
||||
|
|
|
@ -23135,6 +23135,10 @@
|
|||
"type": "string",
|
||||
"description": "A brief CamelCase message indicating details about why the pod is in this state. e.g. 'Evicted'"
|
||||
},
|
||||
"nominatedNodeName": {
|
||||
"type": "string",
|
||||
"description": "nominatedNodeName is set only when this pod preempts other pods on the node, but it cannot be scheduled right away as preemption victims receive their graceful termination periods. This field does not guarantee that the pod will be scheduled on this node. Scheduler may decide to place the pod elsewhere if other nodes become available sooner. Scheduler may also decide to give the resources on this node to a higher priority pod that is created after preemption. As a result, this field may be different than PodSpec.nodeName when the pod is scheduled."
|
||||
},
|
||||
"hostIP": {
|
||||
"type": "string",
|
||||
"description": "IP address of the host to which the pod is assigned. Empty if not yet scheduled."
|
||||
|
|
|
@ -9046,6 +9046,13 @@ Examples:<br>
|
|||
<td class="tableblock halign-left valign-top"></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="tableblock halign-left valign-top"><p class="tableblock">nominatedNodeName</p></td>
|
||||
<td class="tableblock halign-left valign-top"><p class="tableblock">nominatedNodeName is set only when this pod preempts other pods on the node, but it cannot be scheduled right away as preemption victims receive their graceful termination periods. This field does not guarantee that the pod will be scheduled on this node. Scheduler may decide to place the pod elsewhere if other nodes become available sooner. Scheduler may also decide to give the resources on this node to a higher priority pod that is created after preemption. As a result, this field may be different than PodSpec.nodeName when the pod is scheduled.</p></td>
|
||||
<td class="tableblock halign-left valign-top"><p class="tableblock">false</p></td>
|
||||
<td class="tableblock halign-left valign-top"><p class="tableblock">string</p></td>
|
||||
<td class="tableblock halign-left valign-top"></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td class="tableblock halign-left valign-top"><p class="tableblock">hostIP</p></td>
|
||||
<td class="tableblock halign-left valign-top"><p class="tableblock">IP address of the host to which the pod is assigned. Empty if not yet scheduled.</p></td>
|
||||
<td class="tableblock halign-left valign-top"><p class="tableblock">false</p></td>
|
||||
|
|
|
@ -2724,6 +2724,13 @@ type PodStatus struct {
|
|||
// A brief CamelCase message indicating details about why the pod is in this state. e.g. 'Evicted'
|
||||
// +optional
|
||||
Reason string
|
||||
// nominatedNodeName is set when this pod preempts other pods on the node, but it cannot be
|
||||
// scheduled right away as preemption victims receive their graceful termination periods.
|
||||
// This field does not guarantee that the pod will be scheduled on this node. Scheduler may decide
|
||||
// to place the pod elsewhere if other nodes become available sooner. Scheduler may also decide to
|
||||
// give the resources on this node to a higher priority pod that is created after preemption.
|
||||
// +optional
|
||||
NominatedNodeName string
|
||||
|
||||
// +optional
|
||||
HostIP string
|
||||
|
|
|
@ -163,7 +163,8 @@ func addConversionFuncs(scheme *runtime.Scheme) error {
|
|||
"spec.restartPolicy",
|
||||
"spec.schedulerName",
|
||||
"status.phase",
|
||||
"status.podIP":
|
||||
"status.podIP",
|
||||
"status.nominatedNodeName":
|
||||
return label, value, nil
|
||||
// This is for backwards compatibility with old v1 clients which send spec.host
|
||||
case "spec.host":
|
||||
|
|
|
@ -3942,6 +3942,7 @@ func autoConvert_v1_PodStatus_To_core_PodStatus(in *v1.PodStatus, out *core.PodS
|
|||
out.Conditions = *(*[]core.PodCondition)(unsafe.Pointer(&in.Conditions))
|
||||
out.Message = in.Message
|
||||
out.Reason = in.Reason
|
||||
out.NominatedNodeName = in.NominatedNodeName
|
||||
out.HostIP = in.HostIP
|
||||
out.PodIP = in.PodIP
|
||||
out.StartTime = (*meta_v1.Time)(unsafe.Pointer(in.StartTime))
|
||||
|
@ -3961,6 +3962,7 @@ func autoConvert_core_PodStatus_To_v1_PodStatus(in *core.PodStatus, out *v1.PodS
|
|||
out.Conditions = *(*[]v1.PodCondition)(unsafe.Pointer(&in.Conditions))
|
||||
out.Message = in.Message
|
||||
out.Reason = in.Reason
|
||||
out.NominatedNodeName = in.NominatedNodeName
|
||||
out.HostIP = in.HostIP
|
||||
out.PodIP = in.PodIP
|
||||
out.StartTime = (*meta_v1.Time)(unsafe.Pointer(in.StartTime))
|
||||
|
|
|
@ -3361,6 +3361,12 @@ func ValidatePodStatusUpdate(newPod, oldPod *core.Pod) field.ErrorList {
|
|||
allErrs = append(allErrs, field.Forbidden(fldPath.Child("nodeName"), "may not be changed directly"))
|
||||
}
|
||||
|
||||
if newPod.Status.NominatedNodeName != oldPod.Status.NominatedNodeName && len(newPod.Status.NominatedNodeName) > 0 {
|
||||
for _, msg := range ValidateNodeName(newPod.Status.NominatedNodeName, false) {
|
||||
allErrs = append(allErrs, field.Invalid(fldPath.Child("nominatedNodeName"), newPod.Status.NominatedNodeName, msg))
|
||||
}
|
||||
}
|
||||
|
||||
// If pod should not restart, make sure the status update does not transition
|
||||
// any terminated containers to a non-terminated state.
|
||||
allErrs = append(allErrs, ValidateContainerStateTransition(newPod.Status.ContainerStatuses, oldPod.Status.ContainerStatuses, fldPath.Child("containerStatuses"), oldPod.Spec.RestartPolicy)...)
|
||||
|
|
|
@ -7756,6 +7756,129 @@ func TestValidatePodUpdate(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestValidatePodStatusUpdate(t *testing.T) {
|
||||
tests := []struct {
|
||||
new core.Pod
|
||||
old core.Pod
|
||||
err string
|
||||
test string
|
||||
}{
|
||||
{
|
||||
core.Pod{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "foo",
|
||||
},
|
||||
Spec: core.PodSpec{
|
||||
NodeName: "node1",
|
||||
},
|
||||
Status: core.PodStatus{
|
||||
NominatedNodeName: "node1",
|
||||
},
|
||||
},
|
||||
core.Pod{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "foo",
|
||||
},
|
||||
Spec: core.PodSpec{
|
||||
NodeName: "node1",
|
||||
},
|
||||
Status: core.PodStatus{},
|
||||
},
|
||||
"",
|
||||
"removed nominatedNodeName",
|
||||
},
|
||||
{
|
||||
core.Pod{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "foo",
|
||||
},
|
||||
Spec: core.PodSpec{
|
||||
NodeName: "node1",
|
||||
},
|
||||
},
|
||||
core.Pod{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "foo",
|
||||
},
|
||||
Spec: core.PodSpec{
|
||||
NodeName: "node1",
|
||||
},
|
||||
Status: core.PodStatus{
|
||||
NominatedNodeName: "node1",
|
||||
},
|
||||
},
|
||||
"",
|
||||
"add valid nominatedNodeName",
|
||||
},
|
||||
{
|
||||
core.Pod{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "foo",
|
||||
},
|
||||
Spec: core.PodSpec{
|
||||
NodeName: "node1",
|
||||
},
|
||||
Status: core.PodStatus{
|
||||
NominatedNodeName: "Node1",
|
||||
},
|
||||
},
|
||||
core.Pod{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "foo",
|
||||
},
|
||||
Spec: core.PodSpec{
|
||||
NodeName: "node1",
|
||||
},
|
||||
},
|
||||
"nominatedNodeName",
|
||||
"Add invalid nominatedNodeName",
|
||||
},
|
||||
{
|
||||
core.Pod{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "foo",
|
||||
},
|
||||
Spec: core.PodSpec{
|
||||
NodeName: "node1",
|
||||
},
|
||||
Status: core.PodStatus{
|
||||
NominatedNodeName: "node1",
|
||||
},
|
||||
},
|
||||
core.Pod{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "foo",
|
||||
},
|
||||
Spec: core.PodSpec{
|
||||
NodeName: "node1",
|
||||
},
|
||||
Status: core.PodStatus{
|
||||
NominatedNodeName: "node2",
|
||||
},
|
||||
},
|
||||
"",
|
||||
"Update nominatedNodeName",
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
test.new.ObjectMeta.ResourceVersion = "1"
|
||||
test.old.ObjectMeta.ResourceVersion = "1"
|
||||
errs := ValidatePodStatusUpdate(&test.new, &test.old)
|
||||
if test.err == "" {
|
||||
if len(errs) != 0 {
|
||||
t.Errorf("unexpected invalid: %s (%+v)\nA: %+v\nB: %+v", test.test, errs, test.new, test.old)
|
||||
}
|
||||
} else {
|
||||
if len(errs) == 0 {
|
||||
t.Errorf("unexpected valid: %s\nA: %+v\nB: %+v", test.test, test.new, test.old)
|
||||
} else if actualErr := errs.ToAggregate().Error(); !strings.Contains(actualErr, test.err) {
|
||||
t.Errorf("unexpected error message: %s\nExpected error: %s\nActual error: %s", test.test, test.err, actualErr)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func makeValidService() core.Service {
|
||||
return core.Service{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
|
|
|
@ -651,6 +651,9 @@ func describePod(pod *api.Pod, events *api.EventList) (string, error) {
|
|||
if controlledBy := printController(pod); len(controlledBy) > 0 {
|
||||
w.Write(LEVEL_0, "Controlled By:\t%s\n", controlledBy)
|
||||
}
|
||||
if len(pod.Status.NominatedNodeName) > 0 {
|
||||
w.Write(LEVEL_0, "NominatedNodeName:\t%s\n", pod.Status.NominatedNodeName)
|
||||
}
|
||||
|
||||
if len(pod.Spec.InitContainers) > 0 {
|
||||
describeContainers("Init Containers", pod.Spec.InitContainers, pod.Status.InitContainerStatuses, EnvValueRetriever(pod), w, "")
|
||||
|
|
|
@ -80,7 +80,8 @@ func TestDescribePodNode(t *testing.T) {
|
|||
NodeName: "all-in-one",
|
||||
},
|
||||
Status: api.PodStatus{
|
||||
HostIP: "127.0.0.1",
|
||||
HostIP: "127.0.0.1",
|
||||
NominatedNodeName: "nodeA",
|
||||
},
|
||||
})
|
||||
c := &describeClient{T: t, Namespace: "foo", Interface: fake}
|
||||
|
@ -92,6 +93,9 @@ func TestDescribePodNode(t *testing.T) {
|
|||
if !strings.Contains(out, "all-in-one/127.0.0.1") {
|
||||
t.Errorf("unexpected out: %s", out)
|
||||
}
|
||||
if !strings.Contains(out, "nodeA") {
|
||||
t.Errorf("unexpected out: %s", out)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDescribePodTolerations(t *testing.T) {
|
||||
|
|
|
@ -609,6 +609,9 @@ func printPod(pod *api.Pod, options printers.PrintOptions) ([]metav1alpha1.Table
|
|||
nodeName = "<none>"
|
||||
}
|
||||
row.Cells = append(row.Cells, podIP, nodeName)
|
||||
if len(pod.Status.NominatedNodeName) > 0 {
|
||||
row.Cells = append(row.Cells, pod.Status.NominatedNodeName)
|
||||
}
|
||||
}
|
||||
|
||||
return []metav1alpha1.TableRow{row}, nil
|
||||
|
|
|
@ -1704,9 +1704,10 @@ func TestPrintPodwide(t *testing.T) {
|
|||
{Ready: true, RestartCount: 3, State: api.ContainerState{Running: &api.ContainerStateRunning{}}},
|
||||
{RestartCount: 3},
|
||||
},
|
||||
NominatedNodeName: "node1",
|
||||
},
|
||||
},
|
||||
[]metav1alpha1.TableRow{{Cells: []interface{}{"test1", "1/2", "podPhase", 6, "<unknown>", "1.1.1.1", "test1"}}},
|
||||
[]metav1alpha1.TableRow{{Cells: []interface{}{"test1", "1/2", "podPhase", 6, "<unknown>", "1.1.1.1", "test1", "node1"}}},
|
||||
},
|
||||
{
|
||||
// Test when the NodeName and PodIP are none
|
||||
|
|
|
@ -243,6 +243,7 @@ func PodToSelectableFields(pod *api.Pod) fields.Set {
|
|||
podSpecificFieldsSet["spec.schedulerName"] = string(pod.Spec.SchedulerName)
|
||||
podSpecificFieldsSet["status.phase"] = string(pod.Status.Phase)
|
||||
podSpecificFieldsSet["status.podIP"] = string(pod.Status.PodIP)
|
||||
podSpecificFieldsSet["status.nominatedNodeName"] = string(pod.Status.NominatedNodeName)
|
||||
return generic.AddObjectMetaFieldsSet(podSpecificFieldsSet, &pod.ObjectMeta, true)
|
||||
}
|
||||
|
||||
|
|
|
@ -114,7 +114,20 @@ func TestMatchPod(t *testing.T) {
|
|||
fieldSelector: fields.ParseSelectorOrDie("status.podIP=4.3.2.1"),
|
||||
expectMatch: false,
|
||||
},
|
||||
}
|
||||
{
|
||||
in: &api.Pod{
|
||||
Status: api.PodStatus{NominatedNodeName: "node1"},
|
||||
},
|
||||
fieldSelector: fields.ParseSelectorOrDie("status.nominatedNodeName=node1"),
|
||||
expectMatch: true,
|
||||
},
|
||||
{
|
||||
in: &api.Pod{
|
||||
Status: api.PodStatus{NominatedNodeName: "node1"},
|
||||
},
|
||||
fieldSelector: fields.ParseSelectorOrDie("status.nominatedNodeName=node2"),
|
||||
expectMatch: false,
|
||||
}}
|
||||
for _, testCase := range testCases {
|
||||
m := MatchPod(labels.Everything(), testCase.fieldSelector)
|
||||
result, err := m.Matches(testCase.in)
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -3170,6 +3170,16 @@ message PodStatus {
|
|||
// +optional
|
||||
optional string reason = 4;
|
||||
|
||||
// nominatedNodeName is set only when this pod preempts other pods on the node, but it cannot be
|
||||
// scheduled right away as preemption victims receive their graceful termination periods.
|
||||
// This field does not guarantee that the pod will be scheduled on this node. Scheduler may decide
|
||||
// to place the pod elsewhere if other nodes become available sooner. Scheduler may also decide to
|
||||
// give the resources on this node to a higher priority pod that is created after preemption.
|
||||
// As a result, this field may be different than PodSpec.nodeName when the pod is
|
||||
// scheduled.
|
||||
// +optional
|
||||
optional string nominatedNodeName = 11;
|
||||
|
||||
// IP address of the host to which the pod is assigned. Empty if not yet scheduled.
|
||||
// +optional
|
||||
optional string hostIP = 5;
|
||||
|
|
|
@ -3021,6 +3021,15 @@ type PodStatus struct {
|
|||
// e.g. 'Evicted'
|
||||
// +optional
|
||||
Reason string `json:"reason,omitempty" protobuf:"bytes,4,opt,name=reason"`
|
||||
// nominatedNodeName is set only when this pod preempts other pods on the node, but it cannot be
|
||||
// scheduled right away as preemption victims receive their graceful termination periods.
|
||||
// This field does not guarantee that the pod will be scheduled on this node. Scheduler may decide
|
||||
// to place the pod elsewhere if other nodes become available sooner. Scheduler may also decide to
|
||||
// give the resources on this node to a higher priority pod that is created after preemption.
|
||||
// As a result, this field may be different than PodSpec.nodeName when the pod is
|
||||
// scheduled.
|
||||
// +optional
|
||||
NominatedNodeName string `json:"nominatedNodeName,omitempty" protobuf:"bytes,11,opt,name=nominatedNodeName"`
|
||||
|
||||
// IP address of the host to which the pod is assigned. Empty if not yet scheduled.
|
||||
// +optional
|
||||
|
|
|
@ -1528,6 +1528,7 @@ var map_PodStatus = map[string]string{
|
|||
"conditions": "Current service state of pod. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#pod-conditions",
|
||||
"message": "A human readable message indicating details about why the pod is in this condition.",
|
||||
"reason": "A brief CamelCase message indicating details about why the pod is in this state. e.g. 'Evicted'",
|
||||
"nominatedNodeName": "nominatedNodeName is set only when this pod preempts other pods on the node, but it cannot be scheduled right away as preemption victims receive their graceful termination periods. This field does not guarantee that the pod will be scheduled on this node. Scheduler may decide to place the pod elsewhere if other nodes become available sooner. Scheduler may also decide to give the resources on this node to a higher priority pod that is created after preemption. As a result, this field may be different than PodSpec.nodeName when the pod is scheduled.",
|
||||
"hostIP": "IP address of the host to which the pod is assigned. Empty if not yet scheduled.",
|
||||
"podIP": "IP address allocated to the pod. Routable at least within the cluster. Empty if not yet allocated.",
|
||||
"startTime": "RFC 3339 date and time at which the object was acknowledged by the Kubelet. This is before the Kubelet pulled the container image(s) for the pod.",
|
||||
|
|
Loading…
Reference in New Issue