mirror of https://github.com/k3s-io/k3s
Merge pull request #17915 from justinsb/multizone_spread_rcs
Auto commit by PR queue bot
commit ae9c61b943
@@ -47,7 +47,7 @@ will filter out nodes that don't have at least that much resources available (computed
 as the capacity of the node minus the sum of the resource requests of the containers that
 are already running on the node). Second, it applies a set of "priority functions"
 that rank the nodes that weren't filtered out by the predicate check. For example,
-it tries to spread Pods across nodes while at the same time favoring the least-loaded
+it tries to spread Pods across nodes and zones while at the same time favoring the least-loaded
 nodes (where "load" here is sum of the resource requests of the containers running on the node,
 divided by the node's capacity).
 Finally, the node with the highest priority is chosen
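To make the "load" notion above concrete, here is a minimal stand-alone sketch; the numbers are invented for illustration and this is not scheduler code.

    package main

    import "fmt"

    // "Load" as used above: sum of the resource requests of the containers already
    // running on the node, divided by the node's capacity. Numbers are illustrative.
    func main() {
    	requestedMilliCPU := 1500.0 // requests already on the node
    	capacityMilliCPU := 4000.0  // node capacity
    	fmt.Printf("load = %.2f\n", requestedMilliCPU/capacityMilliCPU) // load = 0.38
    }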
@@ -61,7 +61,7 @@ Currently, Kubernetes scheduler provides some practical priority functions, including:
 - `LeastRequestedPriority`: The node is prioritized based on the fraction of the node that would be free if the new Pod were scheduled onto the node. (In other words, (capacity - sum of requests of all Pods already on the node - request of Pod that is being scheduled) / capacity). CPU and memory are equally weighted. The node with the highest free fraction is the most preferred. Note that this priority function has the effect of spreading Pods across the nodes with respect to resource consumption.
 - `CalculateNodeLabelPriority`: Prefer nodes that have the specified label.
 - `BalancedResourceAllocation`: This priority function tries to put the Pod on a node such that the CPU and Memory utilization rate is balanced after the Pod is deployed.
-- `CalculateSpreadPriority`: Spread Pods by minimizing the number of Pods belonging to the same service on the same node.
+- `CalculateSpreadPriority`: Spread Pods by minimizing the number of Pods belonging to the same service on the same node. If zone information is present on the nodes, the priority will be adjusted so that pods are spread across zones and nodes.
 - `CalculateAntiAffinityPriority`: Spread Pods by minimizing the number of Pods belonging to the same service on nodes with the same value for a particular label.
 
 The details of the above priority functions can be found in [plugin/pkg/scheduler/algorithm/priorities](http://releases.k8s.io/HEAD/plugin/pkg/scheduler/algorithm/priorities/). Kubernetes uses some, but not all, of these priority functions by default. You can see which ones are used by default in [plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go](http://releases.k8s.io/HEAD/plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go). Similarly to predicates, you can combine the above priority functions and assign weight factors (positive number) to them as you want (check [scheduler.md](scheduler.md) for how to customize).
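For illustration, the `LeastRequestedPriority` formula quoted above works out as follows. This is a stand-alone sketch with made-up node and pod sizes, not the scheduler's implementation.

    package main

    import "fmt"

    // LeastRequestedPriority as described above:
    // ((capacity - sum of requests already on the node - request of the new pod) / capacity),
    // computed for CPU and memory, equally weighted, and scaled to the 0-10 range.
    func leastRequestedScore(cpuCap, cpuUsed, memCap, memUsed int64) float64 {
    	cpuFree := float64(cpuCap-cpuUsed) / float64(cpuCap)
    	memFree := float64(memCap-memUsed) / float64(memCap)
    	return 10 * (cpuFree + memFree) / 2
    }

    func main() {
    	// Node: 4000m CPU / 8Gi memory; 1000m / 2Gi already requested;
    	// the pod being scheduled requests 500m / 1Gi.
    	fmt.Println(leastRequestedScore(4000, 1000+500, 8192, 2048+1024)) // 6.25
    }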
@@ -19,11 +19,20 @@ package priorities
 import (
 	"github.com/golang/glog"
 	"k8s.io/kubernetes/pkg/api"
+	"k8s.io/kubernetes/pkg/api/unversioned"
 	"k8s.io/kubernetes/pkg/labels"
 	"k8s.io/kubernetes/plugin/pkg/scheduler/algorithm"
 	schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api"
 )
 
+// The maximum priority value to give to a node
+// Priority values range from 0-maxPriority
+const maxPriority = 10
+
+// When zone information is present, give 2/3 of the weighting to zone spreading, 1/3 to node spreading
+// TODO: Any way to justify this weighting?
+const zoneWeighting = 2.0 / 3.0
+
 type SelectorSpread struct {
 	serviceLister    algorithm.ServiceLister
 	controllerLister algorithm.ControllerLister
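To see what the 2/3-zone / 1/3-node split does in practice, here is a small stand-alone sketch of the blend applied later in CalculateSpreadPriority; the input scores are invented for illustration.

    package main

    import "fmt"

    // How zoneWeighting blends the per-node and per-zone spreading scores.
    // The scenarios mirror the expected values in the new tests further down.
    func main() {
    	const maxPriority = 10.0
    	const zoneWeighting = 2.0 / 3.0

    	combine := func(nodeScore, zoneScore float64) float64 {
    		return nodeScore*(1-zoneWeighting) + zoneWeighting*zoneScore
    	}

    	// Empty node, but its zone already holds as many matching pods as the busiest zone:
    	fmt.Println(int(combine(maxPriority, 0))) // 3
    	// Empty node in a zone with half as many matching pods as the busiest zone:
    	fmt.Println(int(combine(maxPriority, maxPriority/2))) // 6
    	// Node that itself hosts the matching pod (both scores are zero):
    	fmt.Println(int(combine(0, 0))) // 0
    }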
@@ -37,11 +46,34 @@ func NewSelectorSpreadPriority(serviceLister algorithm.ServiceLister, controller
 	return selectorSpread.CalculateSpreadPriority
 }
 
-// CalculateSpreadPriority spreads pods by minimizing the number of pods belonging to the same service or replication controller. It counts number of pods that run under
-// Services or RCs as the pod being scheduled and tries to minimize the number of conflicts. I.e. pushes scheduler towards a Node where there's a smallest number of
-// pods which match the same selectors of Services and RCs as current pod.
+// Helper function that builds a string identifier that is unique per failure-zone
+// Returns empty-string for no zone
+func getZoneKey(node *api.Node) string {
+	labels := node.Labels
+	if labels == nil {
+		return ""
+	}
+
+	region, _ := labels[unversioned.LabelZoneRegion]
+	failureDomain, _ := labels[unversioned.LabelZoneFailureDomain]
+
+	if region == "" && failureDomain == "" {
+		return ""
+	}
+
+	// We include the null character just in case region or failureDomain has a colon
+	// (We do assume there's no null characters in a region or failureDomain)
+	// As a nice side-benefit, the null character is not printed by fmt.Print or glog
+	return region + ":\x00:" + failureDomain
+}
+
+// CalculateSpreadPriority spreads pods across hosts and zones, considering pods belonging to the same service or replication controller.
+// When a pod is scheduled, it looks for services or RCs that match the pod, then finds existing pods that match those selectors.
+// It favors nodes that have fewer existing matching pods.
+// i.e. it pushes the scheduler towards a node where there's the smallest number of
+// pods which match the same service selectors or RC selectors as the pod being scheduled.
+// Where zone information is included on the nodes, it favors nodes in zones with fewer existing matching pods.
 func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, podLister algorithm.PodLister, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) {
-	var maxCount int
 	var nsPods []*api.Pod
 
 	selectors := make([]labels.Selector, 0)
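As a quick illustration of the key format produced by getZoneKey, here is a stand-alone sketch. The region and failure-domain values are invented; only the ":\x00:" separator and the empty-result behavior come from the code above.

    package main

    import "fmt"

    // Stand-alone sketch of the zone-key format built by getZoneKey above.
    func zoneKey(region, failureDomain string) string {
    	if region == "" && failureDomain == "" {
    		return ""
    	}
    	// Null character as a separator guard, exactly as in the diff above.
    	return region + ":\x00:" + failureDomain
    }

    func main() {
    	fmt.Printf("%q\n", zoneKey("us-central1", "us-central1-a")) // "us-central1:\x00:us-central1-a"
    	fmt.Printf("%q\n", zoneKey("", ""))                         // "" - a node without zone labels contributes no zone key
    }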
@@ -76,35 +108,87 @@ func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, podLister algorithm.PodLister, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) {
 		return nil, err
 	}
 
-	counts := map[string]int{}
-	if len(nsPods) > 0 {
-		for _, pod := range nsPods {
-			matches := false
-			for _, selector := range selectors {
-				if selector.Matches(labels.Set(pod.ObjectMeta.Labels)) {
-					matches = true
-					break
-				}
-			}
-			if matches {
-				counts[pod.Spec.NodeName]++
-				// Compute the maximum number of pods hosted on any node
-				if counts[pod.Spec.NodeName] > maxCount {
-					maxCount = counts[pod.Spec.NodeName]
-				}
-			}
-		}
-	}
-
+	// Count similar pods by node
+	countsByNodeName := map[string]int{}
+	for _, pod := range nsPods {
+		// When we are replacing a failed pod, we often see the previous deleted version
+		// while scheduling the replacement. Ignore the previous deleted version for spreading
+		// purposes (it can still be considered for resource restrictions etc.)
+		if pod.DeletionTimestamp != nil {
+			glog.V(2).Infof("skipping pending-deleted pod: %s/%s", pod.Namespace, pod.Name)
+			continue
+		}
+		matches := false
+		for _, selector := range selectors {
+			if selector.Matches(labels.Set(pod.ObjectMeta.Labels)) {
+				matches = true
+				break
+			}
+		}
+		if !matches {
+			continue
+		}
+
+		countsByNodeName[pod.Spec.NodeName]++
+	}
+
+	// Aggregate by-node information
+	// Compute the maximum number of pods hosted on any node
+	maxCountByNodeName := 0
+	for _, count := range countsByNodeName {
+		if count > maxCountByNodeName {
+			maxCountByNodeName = count
+		}
+	}
+
+	// Count similar pods by zone, if zone information is present
+	countsByZone := map[string]int{}
+	for i := range nodes.Items {
+		node := &nodes.Items[i]
+
+		count, found := countsByNodeName[node.Name]
+		if !found {
+			continue
+		}
+
+		zoneId := getZoneKey(node)
+		if zoneId == "" {
+			continue
+		}
+
+		countsByZone[zoneId] += count
+	}
+
+	// Aggregate by-zone information
+	// Compute the maximum number of pods hosted in any zone
+	haveZones := len(countsByZone) != 0
+	maxCountByZone := 0
+	for _, count := range countsByZone {
+		if count > maxCountByZone {
+			maxCountByZone = count
+		}
+	}
+
 	result := []schedulerapi.HostPriority{}
-	//score int - scale of 0-10
-	// 0 being the lowest priority and 10 being the highest
-	for _, node := range nodes.Items {
-		// initializing to the default/max node score of 10
-		fScore := float32(10)
-		if maxCount > 0 {
-			fScore = 10 * (float32(maxCount-counts[node.Name]) / float32(maxCount))
+	//score int - scale of 0-maxPriority
+	// 0 being the lowest priority and maxPriority being the highest
+	for i := range nodes.Items {
+		node := &nodes.Items[i]
+		// initializing to the default/max node score of maxPriority
+		fScore := float32(maxPriority)
+		if maxCountByNodeName > 0 {
+			fScore = maxPriority * (float32(maxCountByNodeName-countsByNodeName[node.Name]) / float32(maxCountByNodeName))
 		}
+
+		// If there is zone information present, incorporate it
+		if haveZones {
+			zoneId := getZoneKey(node)
+			if zoneId != "" {
+				zoneScore := maxPriority * (float32(maxCountByZone-countsByZone[zoneId]) / float32(maxCountByZone))
+				fScore = (fScore * (1.0 - zoneWeighting)) + (zoneWeighting * zoneScore)
+			}
+		}
+
 		result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: int(fScore)})
 		glog.V(10).Infof(
 			"%v -> %v: SelectorSpreadPriority, Score: (%d)", pod.Name, node.Name, int(fScore),
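To check the scoring end to end, here is a worked sketch of the "two pods, 1 matching (in z2)" case from the new tests below: the only matching pod sits on machine1.zone2, so maxCountByNodeName and maxCountByZone are both 1. The helper simplifies the haveZones/zoneId checks into a maxByZone > 0 guard; it is an illustration, not the production code path.

    package main

    import "fmt"

    // Worked example of the node/zone scoring combination above.
    func main() {
    	const maxPriority = 10.0
    	const zoneWeighting = 2.0 / 3.0

    	score := func(podsOnNode, maxByNode, podsInZone, maxByZone float64) int {
    		fScore := maxPriority
    		if maxByNode > 0 {
    			fScore = maxPriority * (maxByNode - podsOnNode) / maxByNode
    		}
    		if maxByZone > 0 {
    			zoneScore := maxPriority * (maxByZone - podsInZone) / maxByZone
    			fScore = fScore*(1-zoneWeighting) + zoneWeighting*zoneScore
    		}
    		return int(fScore)
    	}

    	fmt.Println(score(1, 1, 1, 1)) // machine1.zone2: matching pod on the node and in its zone -> 0
    	fmt.Println(score(0, 1, 1, 1)) // machine2.zone2: empty node, matching pod elsewhere in the zone -> 3
    	fmt.Println(score(0, 1, 0, 1)) // machine1.zone1 and the zone3 nodes: no matching pods nearby -> 10
    }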
@@ -177,13 +261,13 @@ func (s *ServiceAntiAffinity) CalculateAntiAffinityPriority(pod *api.Pod, podLis
 
 	numServicePods := len(nsServicePods)
 	result := []schedulerapi.HostPriority{}
-	//score int - scale of 0-10
-	// 0 being the lowest priority and 10 being the highest
+	//score int - scale of 0-maxPriority
+	// 0 being the lowest priority and maxPriority being the highest
 	for node := range labeledNodes {
-		// initializing to the default/max node score of 10
-		fScore := float32(10)
+		// initializing to the default/max node score of maxPriority
+		fScore := float32(maxPriority)
 		if numServicePods > 0 {
-			fScore = 10 * (float32(numServicePods-podCounts[labeledNodes[node]]) / float32(numServicePods))
+			fScore = maxPriority * (float32(numServicePods-podCounts[labeledNodes[node]]) / float32(numServicePods))
 		}
 		result = append(result, schedulerapi.HostPriority{Host: node, Score: int(fScore)})
 	}
@@ -22,6 +22,7 @@ import (
 	"testing"
 
 	"k8s.io/kubernetes/pkg/api"
+	wellknownlabels "k8s.io/kubernetes/pkg/api/unversioned"
 	"k8s.io/kubernetes/plugin/pkg/scheduler/algorithm"
 	schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api"
 )
@@ -228,6 +229,208 @@ func TestSelectorSpreadPriority(t *testing.T) {
 	}
 }
 
+func TestZoneSelectorSpreadPriority(t *testing.T) {
+	labels1 := map[string]string{
+		"label1": "l1",
+		"baz":    "blah",
+	}
+	labels2 := map[string]string{
+		"label2": "l2",
+		"baz":    "blah",
+	}
+
+	const nodeMachine1Zone1 = "machine1.zone1"
+	const nodeMachine1Zone2 = "machine1.zone2"
+	const nodeMachine2Zone2 = "machine2.zone2"
+	const nodeMachine1Zone3 = "machine1.zone3"
+	const nodeMachine2Zone3 = "machine2.zone3"
+	const nodeMachine3Zone3 = "machine3.zone3"
+
+	buildNodeLabels := func(failureDomain string) map[string]string {
+		labels := map[string]string{
+			wellknownlabels.LabelZoneFailureDomain: failureDomain,
+		}
+		return labels
+	}
+	labeledNodes := map[string]map[string]string{
+		nodeMachine1Zone1: buildNodeLabels("zone1"),
+		nodeMachine1Zone2: buildNodeLabels("zone2"),
+		nodeMachine2Zone2: buildNodeLabels("zone2"),
+		nodeMachine1Zone3: buildNodeLabels("zone3"),
+		nodeMachine2Zone3: buildNodeLabels("zone3"),
+		nodeMachine3Zone3: buildNodeLabels("zone3"),
+	}
+
+	buildPod := func(nodeName string, labels map[string]string) *api.Pod {
+		pod := &api.Pod{Spec: api.PodSpec{NodeName: nodeName}, ObjectMeta: api.ObjectMeta{Labels: labels}}
+		return pod
+	}
+
+	tests := []struct {
+		pod          *api.Pod
+		pods         []*api.Pod
+		nodes        []string
+		rcs          []api.ReplicationController
+		services     []api.Service
+		expectedList schedulerapi.HostPriorityList
+		test         string
+	}{
+		{
+			pod: new(api.Pod),
+			expectedList: []schedulerapi.HostPriority{
+				{nodeMachine1Zone1, 10},
+				{nodeMachine1Zone2, 10},
+				{nodeMachine2Zone2, 10},
+				{nodeMachine1Zone3, 10},
+				{nodeMachine2Zone3, 10},
+				{nodeMachine3Zone3, 10},
+			},
+			test: "nothing scheduled",
+		},
+		{
+			pod:  buildPod("", labels1),
+			pods: []*api.Pod{buildPod(nodeMachine1Zone1, nil)},
+			expectedList: []schedulerapi.HostPriority{
+				{nodeMachine1Zone1, 10},
+				{nodeMachine1Zone2, 10},
+				{nodeMachine2Zone2, 10},
+				{nodeMachine1Zone3, 10},
+				{nodeMachine2Zone3, 10},
+				{nodeMachine3Zone3, 10},
+			},
+			test: "no services",
+		},
+		{
+			pod:      buildPod("", labels1),
+			pods:     []*api.Pod{buildPod(nodeMachine1Zone1, labels2)},
+			services: []api.Service{{Spec: api.ServiceSpec{Selector: map[string]string{"key": "value"}}}},
+			expectedList: []schedulerapi.HostPriority{
+				{nodeMachine1Zone1, 10},
+				{nodeMachine1Zone2, 10},
+				{nodeMachine2Zone2, 10},
+				{nodeMachine1Zone3, 10},
+				{nodeMachine2Zone3, 10},
+				{nodeMachine3Zone3, 10},
+			},
+			test: "different services",
+		},
+		{
+			pod: buildPod("", labels1),
+			pods: []*api.Pod{
+				buildPod(nodeMachine1Zone1, labels2),
+				buildPod(nodeMachine1Zone2, labels1),
+			},
+			services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}},
+			expectedList: []schedulerapi.HostPriority{
+				{nodeMachine1Zone1, 10},
+				{nodeMachine1Zone2, 0}, // Already have pod on machine
+				{nodeMachine2Zone2, 3}, // Already have pod in zone
+				{nodeMachine1Zone3, 10},
+				{nodeMachine2Zone3, 10},
+				{nodeMachine3Zone3, 10},
+			},
+			test: "two pods, 1 matching (in z2)",
+		},
+		{
+			pod: buildPod("", labels1),
+			pods: []*api.Pod{
+				buildPod(nodeMachine1Zone1, labels2),
+				buildPod(nodeMachine1Zone2, labels1),
+				buildPod(nodeMachine2Zone2, labels1),
+				buildPod(nodeMachine1Zone3, labels2),
+				buildPod(nodeMachine2Zone3, labels1),
+			},
+			services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}},
+			expectedList: []schedulerapi.HostPriority{
+				{nodeMachine1Zone1, 10},
+				{nodeMachine1Zone2, 0}, // Pod on node
+				{nodeMachine2Zone2, 0}, // Pod on node
+				{nodeMachine1Zone3, 6}, // Pod in zone
+				{nodeMachine2Zone3, 3}, // Pod on node
+				{nodeMachine3Zone3, 6}, // Pod in zone
+			},
+			test: "five pods, 3 matching (z2=2, z3=1)",
+		},
+		{
+			pod: buildPod("", labels1),
+			pods: []*api.Pod{
+				buildPod(nodeMachine1Zone1, labels1),
+				buildPod(nodeMachine1Zone2, labels1),
+				buildPod(nodeMachine2Zone2, labels2),
+				buildPod(nodeMachine1Zone3, labels1),
+			},
+			services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}},
+			expectedList: []schedulerapi.HostPriority{
+				{nodeMachine1Zone1, 0}, // Pod on node
+				{nodeMachine1Zone2, 0}, // Pod on node
+				{nodeMachine2Zone2, 3}, // Pod in zone
+				{nodeMachine1Zone3, 0}, // Pod on node
+				{nodeMachine2Zone3, 3}, // Pod in zone
+				{nodeMachine3Zone3, 3}, // Pod in zone
+			},
+			test: "four pods, 3 matching (z1=1, z2=1, z3=1)",
+		},
+		{
+			pod: buildPod("", labels1),
+			pods: []*api.Pod{
+				buildPod(nodeMachine1Zone1, labels1),
+				buildPod(nodeMachine1Zone2, labels1),
+				buildPod(nodeMachine1Zone3, labels1),
+				buildPod(nodeMachine2Zone2, labels2),
+			},
+			services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}},
+			expectedList: []schedulerapi.HostPriority{
+				{nodeMachine1Zone1, 0}, // Pod on node
+				{nodeMachine1Zone2, 0}, // Pod on node
+				{nodeMachine2Zone2, 3}, // Pod in zone
+				{nodeMachine1Zone3, 0}, // Pod on node
+				{nodeMachine2Zone3, 3}, // Pod in zone
+				{nodeMachine3Zone3, 3}, // Pod in zone
+			},
+			test: "four pods, 3 matching (z1=1, z2=1, z3=1)",
+		},
+		{
+			pod: buildPod("", labels1),
+			pods: []*api.Pod{
+				buildPod(nodeMachine1Zone3, labels1),
+				buildPod(nodeMachine1Zone2, labels1),
+				buildPod(nodeMachine1Zone3, labels1),
+			},
+			rcs: []api.ReplicationController{{Spec: api.ReplicationControllerSpec{Selector: labels1}}},
+			expectedList: []schedulerapi.HostPriority{
+				// Note that because we put two pods on the same node (nodeMachine1Zone3),
+				// the values here are questionable for zone2, in particular for nodeMachine1Zone2.
+				// However they kind of make sense; zone1 is still most-highly favored.
+				// zone3 is in general least favored, and m1.z3 particularly low priority.
+				// We would probably prefer to see a bigger gap between putting a second
+				// pod on m1.z2 and putting a pod on m2.z2, but the ordering is correct.
+				// This is also consistent with what we have already.
+				{nodeMachine1Zone1, 10}, // No pods in zone
+				{nodeMachine1Zone2, 5},  // Pod on node
+				{nodeMachine2Zone2, 6},  // Pod in zone
+				{nodeMachine1Zone3, 0},  // Two pods on node
+				{nodeMachine2Zone3, 3},  // Pod in zone
+				{nodeMachine3Zone3, 3},  // Pod in zone
+			},
+			test: "Replication controller spreading (z1=0, z2=1, z3=2)",
+		},
+	}
+
+	for _, test := range tests {
+		selectorSpread := SelectorSpread{serviceLister: algorithm.FakeServiceLister(test.services), controllerLister: algorithm.FakeControllerLister(test.rcs)}
+		list, err := selectorSpread.CalculateSpreadPriority(test.pod, algorithm.FakePodLister(test.pods), algorithm.FakeNodeLister(makeLabeledNodeList(labeledNodes)))
+		if err != nil {
+			t.Errorf("unexpected error: %v", err)
+		}
+		// sort the two lists to avoid failures on account of different ordering
+		sort.Sort(test.expectedList)
+		sort.Sort(list)
+		if !reflect.DeepEqual(test.expectedList, list) {
+			t.Errorf("%s: expected %#v, got %#v", test.test, test.expectedList, list)
+		}
+	}
+}
+
 func TestZoneSpreadPriority(t *testing.T) {
 	labels1 := map[string]string{
 		"foo": "bar",
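For the "Replication controller spreading (z1=0, z2=1, z3=2)" case above, the expected scores follow directly from the formulas in selector_spreading.go: two matching pods sit on machine1.zone3 and one on machine1.zone2, so maxCountByNodeName = 2 and maxCountByZone = 2 (zone3). With maxPriority = 10 and zoneWeighting = 2/3, the arithmetic works out as:

    machine1.zone1: node 10*(2-0)/2 = 10, zone 10*(2-0)/2 = 10, blended (1/3)*10 + (2/3)*10 = 10
    machine1.zone2: node 10*(2-1)/2 = 5,  zone 10*(2-1)/2 = 5,  blended (1/3)*5  + (2/3)*5  = 5
    machine2.zone2: node 10*(2-0)/2 = 10, zone 10*(2-1)/2 = 5,  blended (1/3)*10 + (2/3)*5 ≈ 6.7 -> 6
    machine1.zone3: node 10*(2-2)/2 = 0,  zone 10*(2-2)/2 = 0,  blended 0
    machine2.zone3, machine3.zone3: node 10, zone 0, blended (1/3)*10 ≈ 3.3 -> 3

which matches the expected list {10, 5, 6, 0, 3, 3}.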