Merge pull request #17915 from justinsb/multizone_spread_rcs

Auto commit by PR queue bot
pull/6/head
k8s-merge-robot 2015-12-16 19:04:07 -08:00
commit ae9c61b943
4 changed files with 321 additions and 34 deletions

View File

@ -47,7 +47,7 @@ will filter out nodes that don't have at least that much resources available (co
as the capacity of the node minus the sum of the resource requests of the containers that
are already running on the node). Second, it applies a set of "priority functions"
that rank the nodes that weren't filtered out by the predicate check. For example,
it tries to spread Pods across nodes while at the same time favoring the least-loaded
it tries to spread Pods across nodes and zones while at the same time favoring the least-loaded
nodes (where "load" here is sum of the resource requests of the containers running on the node,
divided by the node's capacity).
Finally, the node with the highest priority is chosen

View File

@ -61,7 +61,7 @@ Currently, Kubernetes scheduler provides some practical priority functions, incl
- `LeastRequestedPriority`: The node is prioritized based on the fraction of the node that would be free if the new Pod were scheduled onto the node. (In other words, (capacity - sum of requests of all Pods already on the node - request of Pod that is being scheduled) / capacity). CPU and memory are equally weighted. The node with the highest free fraction is the most preferred. Note that this priority function has the effect of spreading Pods across the nodes with respect to resource consumption.
- `CalculateNodeLabelPriority`: Prefer nodes that have the specified label.
- `BalancedResourceAllocation`: This priority function tries to put the Pod on a node such that the CPU and Memory utilization rate is balanced after the Pod is deployed.
- `CalculateSpreadPriority`: Spread Pods by minimizing the number of Pods belonging to the same service on the same node.
- `CalculateSpreadPriority`: Spread Pods by minimizing the number of Pods belonging to the same service on the same node. If zone information is present on the nodes, the priority will be adjusted so that pods are spread across zones and nodes.
- `CalculateAntiAffinityPriority`: Spread Pods by minimizing the number of Pods belonging to the same service on nodes with the same value for a particular label.
The details of the above priority functions can be found in [plugin/pkg/scheduler/algorithm/priorities](http://releases.k8s.io/HEAD/plugin/pkg/scheduler/algorithm/priorities/). Kubernetes uses some, but not all, of these priority functions by default. You can see which ones are used by default in [plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go](http://releases.k8s.io/HEAD/plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go). Similar as predicates, you can combine the above priority functions and assign weight factors (positive number) to them as you want (check [scheduler.md](scheduler.md) for how to customize).

View File

@ -19,11 +19,20 @@ package priorities
import (
"github.com/golang/glog"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/api/unversioned"
"k8s.io/kubernetes/pkg/labels"
"k8s.io/kubernetes/plugin/pkg/scheduler/algorithm"
schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api"
)
// The maximum priority value to give to a node
// Prioritiy values range from 0-maxPriority
const maxPriority = 10
// When zone information is present, give 2/3 of the weighting to zone spreading, 1/3 to node spreading
// TODO: Any way to justify this weighting?
const zoneWeighting = 2.0 / 3.0
type SelectorSpread struct {
serviceLister algorithm.ServiceLister
controllerLister algorithm.ControllerLister
@ -37,11 +46,34 @@ func NewSelectorSpreadPriority(serviceLister algorithm.ServiceLister, controller
return selectorSpread.CalculateSpreadPriority
}
// CalculateSpreadPriority spreads pods by minimizing the number of pods belonging to the same service or replication controller. It counts number of pods that run under
// Services or RCs as the pod being scheduled and tries to minimize the number of conflicts. I.e. pushes scheduler towards a Node where there's a smallest number of
// pods which match the same selectors of Services and RCs as current pod.
// Helper function that builds a string identifier that is unique per failure-zone
// Returns empty-string for no zone
func getZoneKey(node *api.Node) string {
labels := node.Labels
if labels == nil {
return ""
}
region, _ := labels[unversioned.LabelZoneRegion]
failureDomain, _ := labels[unversioned.LabelZoneFailureDomain]
if region == "" && failureDomain == "" {
return ""
}
// We include the null character just in case region or failureDomain has a colon
// (We do assume there's no null characters in a region or failureDomain)
// As a nice side-benefit, the null character is not printed by fmt.Print or glog
return region + ":\x00:" + failureDomain
}
// CalculateSpreadPriority spreads pods across hosts and zones, considering pods belonging to the same service or replication controller.
// When a pod is scheduled, it looks for services or RCs that match the pod, then finds existing pods that match those selectors.
// It favors nodes that have fewer existing matching pods.
// i.e. it pushes the scheduler towards a node where there's the smallest number of
// pods which match the same service selectors or RC selectors as the pod being scheduled.
// Where zone information is included on the nodes, it favors nodes in zones with fewer existing matching pods.
func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, podLister algorithm.PodLister, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) {
var maxCount int
var nsPods []*api.Pod
selectors := make([]labels.Selector, 0)
@ -76,35 +108,87 @@ func (s *SelectorSpread) CalculateSpreadPriority(pod *api.Pod, podLister algorit
return nil, err
}
counts := map[string]int{}
if len(nsPods) > 0 {
for _, pod := range nsPods {
matches := false
for _, selector := range selectors {
if selector.Matches(labels.Set(pod.ObjectMeta.Labels)) {
matches = true
break
}
}
if matches {
counts[pod.Spec.NodeName]++
// Compute the maximum number of pods hosted on any node
if counts[pod.Spec.NodeName] > maxCount {
maxCount = counts[pod.Spec.NodeName]
}
// Count similar pods by node
countsByNodeName := map[string]int{}
for _, pod := range nsPods {
// When we are replacing a failed pod, we often see the previous deleted version
// while scheduling the replacement. Ignore the previous deleted version for spreading
// purposes (it can still be considered for resource restrictions etc.)
if pod.DeletionTimestamp != nil {
glog.V(2).Infof("skipping pending-deleted pod: %s/%s", pod.Namespace, pod.Name)
continue
}
matches := false
for _, selector := range selectors {
if selector.Matches(labels.Set(pod.ObjectMeta.Labels)) {
matches = true
break
}
}
if !matches {
continue
}
countsByNodeName[pod.Spec.NodeName]++
}
// Aggregate by-node information
// Compute the maximum number of pods hosted on any node
maxCountByNodeName := 0
for _, count := range countsByNodeName {
if count > maxCountByNodeName {
maxCountByNodeName = count
}
}
// Count similar pods by zone, if zone information is present
countsByZone := map[string]int{}
for i := range nodes.Items {
node := &nodes.Items[i]
count, found := countsByNodeName[node.Name]
if !found {
continue
}
zoneId := getZoneKey(node)
if zoneId == "" {
continue
}
countsByZone[zoneId] += count
}
// Aggregate by-zone information
// Compute the maximum number of pods hosted in any zone
haveZones := len(countsByZone) != 0
maxCountByZone := 0
for _, count := range countsByZone {
if count > maxCountByZone {
maxCountByZone = count
}
}
result := []schedulerapi.HostPriority{}
//score int - scale of 0-10
// 0 being the lowest priority and 10 being the highest
for _, node := range nodes.Items {
// initializing to the default/max node score of 10
fScore := float32(10)
if maxCount > 0 {
fScore = 10 * (float32(maxCount-counts[node.Name]) / float32(maxCount))
//score int - scale of 0-maxPriority
// 0 being the lowest priority and maxPriority being the highest
for i := range nodes.Items {
node := &nodes.Items[i]
// initializing to the default/max node score of maxPriority
fScore := float32(maxPriority)
if maxCountByNodeName > 0 {
fScore = maxPriority * (float32(maxCountByNodeName-countsByNodeName[node.Name]) / float32(maxCountByNodeName))
}
// If there is zone information present, incorporate it
if haveZones {
zoneId := getZoneKey(node)
if zoneId != "" {
zoneScore := maxPriority * (float32(maxCountByZone-countsByZone[zoneId]) / float32(maxCountByZone))
fScore = (fScore * (1.0 - zoneWeighting)) + (zoneWeighting * zoneScore)
}
}
result = append(result, schedulerapi.HostPriority{Host: node.Name, Score: int(fScore)})
glog.V(10).Infof(
"%v -> %v: SelectorSpreadPriority, Score: (%d)", pod.Name, node.Name, int(fScore),
@ -177,13 +261,13 @@ func (s *ServiceAntiAffinity) CalculateAntiAffinityPriority(pod *api.Pod, podLis
numServicePods := len(nsServicePods)
result := []schedulerapi.HostPriority{}
//score int - scale of 0-10
// 0 being the lowest priority and 10 being the highest
//score int - scale of 0-maxPriority
// 0 being the lowest priority and maxPriority being the highest
for node := range labeledNodes {
// initializing to the default/max node score of 10
fScore := float32(10)
// initializing to the default/max node score of maxPriority
fScore := float32(maxPriority)
if numServicePods > 0 {
fScore = 10 * (float32(numServicePods-podCounts[labeledNodes[node]]) / float32(numServicePods))
fScore = maxPriority * (float32(numServicePods-podCounts[labeledNodes[node]]) / float32(numServicePods))
}
result = append(result, schedulerapi.HostPriority{Host: node, Score: int(fScore)})
}

View File

@ -22,6 +22,7 @@ import (
"testing"
"k8s.io/kubernetes/pkg/api"
wellknownlabels "k8s.io/kubernetes/pkg/api/unversioned"
"k8s.io/kubernetes/plugin/pkg/scheduler/algorithm"
schedulerapi "k8s.io/kubernetes/plugin/pkg/scheduler/api"
)
@ -228,6 +229,208 @@ func TestSelectorSpreadPriority(t *testing.T) {
}
}
func TestZoneSelectorSpreadPriority(t *testing.T) {
labels1 := map[string]string{
"label1": "l1",
"baz": "blah",
}
labels2 := map[string]string{
"label2": "l2",
"baz": "blah",
}
const nodeMachine1Zone1 = "machine1.zone1"
const nodeMachine1Zone2 = "machine1.zone2"
const nodeMachine2Zone2 = "machine2.zone2"
const nodeMachine1Zone3 = "machine1.zone3"
const nodeMachine2Zone3 = "machine2.zone3"
const nodeMachine3Zone3 = "machine3.zone3"
buildNodeLabels := func(failureDomain string) map[string]string {
labels := map[string]string{
wellknownlabels.LabelZoneFailureDomain: failureDomain,
}
return labels
}
labeledNodes := map[string]map[string]string{
nodeMachine1Zone1: buildNodeLabels("zone1"),
nodeMachine1Zone2: buildNodeLabels("zone2"),
nodeMachine2Zone2: buildNodeLabels("zone2"),
nodeMachine1Zone3: buildNodeLabels("zone3"),
nodeMachine2Zone3: buildNodeLabels("zone3"),
nodeMachine3Zone3: buildNodeLabels("zone3"),
}
buildPod := func(nodeName string, labels map[string]string) *api.Pod {
pod := &api.Pod{Spec: api.PodSpec{NodeName: nodeName}, ObjectMeta: api.ObjectMeta{Labels: labels}}
return pod
}
tests := []struct {
pod *api.Pod
pods []*api.Pod
nodes []string
rcs []api.ReplicationController
services []api.Service
expectedList schedulerapi.HostPriorityList
test string
}{
{
pod: new(api.Pod),
expectedList: []schedulerapi.HostPriority{
{nodeMachine1Zone1, 10},
{nodeMachine1Zone2, 10},
{nodeMachine2Zone2, 10},
{nodeMachine1Zone3, 10},
{nodeMachine2Zone3, 10},
{nodeMachine3Zone3, 10},
},
test: "nothing scheduled",
},
{
pod: buildPod("", labels1),
pods: []*api.Pod{buildPod(nodeMachine1Zone1, nil)},
expectedList: []schedulerapi.HostPriority{
{nodeMachine1Zone1, 10},
{nodeMachine1Zone2, 10},
{nodeMachine2Zone2, 10},
{nodeMachine1Zone3, 10},
{nodeMachine2Zone3, 10},
{nodeMachine3Zone3, 10},
},
test: "no services",
},
{
pod: buildPod("", labels1),
pods: []*api.Pod{buildPod(nodeMachine1Zone1, labels2)},
services: []api.Service{{Spec: api.ServiceSpec{Selector: map[string]string{"key": "value"}}}},
expectedList: []schedulerapi.HostPriority{
{nodeMachine1Zone1, 10},
{nodeMachine1Zone2, 10},
{nodeMachine2Zone2, 10},
{nodeMachine1Zone3, 10},
{nodeMachine2Zone3, 10},
{nodeMachine3Zone3, 10},
},
test: "different services",
},
{
pod: buildPod("", labels1),
pods: []*api.Pod{
buildPod(nodeMachine1Zone1, labels2),
buildPod(nodeMachine1Zone2, labels1),
},
services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}},
expectedList: []schedulerapi.HostPriority{
{nodeMachine1Zone1, 10},
{nodeMachine1Zone2, 0}, // Already have pod on machine
{nodeMachine2Zone2, 3}, // Already have pod in zone
{nodeMachine1Zone3, 10},
{nodeMachine2Zone3, 10},
{nodeMachine3Zone3, 10},
},
test: "two pods, 1 matching (in z2)",
},
{
pod: buildPod("", labels1),
pods: []*api.Pod{
buildPod(nodeMachine1Zone1, labels2),
buildPod(nodeMachine1Zone2, labels1),
buildPod(nodeMachine2Zone2, labels1),
buildPod(nodeMachine1Zone3, labels2),
buildPod(nodeMachine2Zone3, labels1),
},
services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}},
expectedList: []schedulerapi.HostPriority{
{nodeMachine1Zone1, 10},
{nodeMachine1Zone2, 0}, // Pod on node
{nodeMachine2Zone2, 0}, // Pod on node
{nodeMachine1Zone3, 6}, // Pod in zone
{nodeMachine2Zone3, 3}, // Pod on node
{nodeMachine3Zone3, 6}, // Pod in zone
},
test: "five pods, 3 matching (z2=2, z3=1)",
},
{
pod: buildPod("", labels1),
pods: []*api.Pod{
buildPod(nodeMachine1Zone1, labels1),
buildPod(nodeMachine1Zone2, labels1),
buildPod(nodeMachine2Zone2, labels2),
buildPod(nodeMachine1Zone3, labels1),
},
services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}},
expectedList: []schedulerapi.HostPriority{
{nodeMachine1Zone1, 0}, // Pod on node
{nodeMachine1Zone2, 0}, // Pod on node
{nodeMachine2Zone2, 3}, // Pod in zone
{nodeMachine1Zone3, 0}, // Pod on node
{nodeMachine2Zone3, 3}, // Pod in zone
{nodeMachine3Zone3, 3}, // Pod in zone
},
test: "four pods, 3 matching (z1=1, z2=1, z3=1)",
},
{
pod: buildPod("", labels1),
pods: []*api.Pod{
buildPod(nodeMachine1Zone1, labels1),
buildPod(nodeMachine1Zone2, labels1),
buildPod(nodeMachine1Zone3, labels1),
buildPod(nodeMachine2Zone2, labels2),
},
services: []api.Service{{Spec: api.ServiceSpec{Selector: labels1}}},
expectedList: []schedulerapi.HostPriority{
{nodeMachine1Zone1, 0}, // Pod on node
{nodeMachine1Zone2, 0}, // Pod on node
{nodeMachine2Zone2, 3}, // Pod in zone
{nodeMachine1Zone3, 0}, // Pod on node
{nodeMachine2Zone3, 3}, // Pod in zone
{nodeMachine3Zone3, 3}, // Pod in zone
},
test: "four pods, 3 matching (z1=1, z2=1, z3=1)",
},
{
pod: buildPod("", labels1),
pods: []*api.Pod{
buildPod(nodeMachine1Zone3, labels1),
buildPod(nodeMachine1Zone2, labels1),
buildPod(nodeMachine1Zone3, labels1),
},
rcs: []api.ReplicationController{{Spec: api.ReplicationControllerSpec{Selector: labels1}}},
expectedList: []schedulerapi.HostPriority{
// Note that because we put two pods on the same node (nodeMachine1Zone3),
// the values here are questionable for zone2, in particular for nodeMachine1Zone2.
// However they kind of make sense; zone1 is still most-highly favored.
// zone3 is in general least favored, and m1.z3 particularly low priority.
// We would probably prefer to see a bigger gap between putting a second
// pod on m1.z2 and putting a pod on m2.z2, but the ordering is correct.
// This is also consistent with what we have already.
{nodeMachine1Zone1, 10}, // No pods in zone
{nodeMachine1Zone2, 5}, // Pod on node
{nodeMachine2Zone2, 6}, // Pod in zone
{nodeMachine1Zone3, 0}, // Two pods on node
{nodeMachine2Zone3, 3}, // Pod in zone
{nodeMachine3Zone3, 3}, // Pod in zone
},
test: "Replication controller spreading (z1=0, z2=1, z3=2)",
},
}
for _, test := range tests {
selectorSpread := SelectorSpread{serviceLister: algorithm.FakeServiceLister(test.services), controllerLister: algorithm.FakeControllerLister(test.rcs)}
list, err := selectorSpread.CalculateSpreadPriority(test.pod, algorithm.FakePodLister(test.pods), algorithm.FakeNodeLister(makeLabeledNodeList(labeledNodes)))
if err != nil {
t.Errorf("unexpected error: %v", err)
}
// sort the two lists to avoid failures on account of different ordering
sort.Sort(test.expectedList)
sort.Sort(list)
if !reflect.DeepEqual(test.expectedList, list) {
t.Errorf("%s: expected %#v, got %#v", test.test, test.expectedList, list)
}
}
}
func TestZoneSpreadPriority(t *testing.T) {
labels1 := map[string]string{
"foo": "bar",