mirror of https://github.com/k3s-io/k3s
Merge pull request #68081 from silveryfu/image-locality-tests-new
Automatic merge from submit-queue (batch tested with PRs 63437, 68081). If you want to cherry-pick this change to another branch, please follow the instructions here: https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md.

Enable ImageLocalityPriority by default with integration tests

**What this PR does / why we need it**: This PR is a follow-up to [#63842](https://github.com/kubernetes/kubernetes/issues/63842). It moves the ImageLocalityPriority function into the default priority functions of the default algorithm provider and adds integration tests for the updated scheduling policy.

- Compared to [#64662](https://github.com/kubernetes/kubernetes/pull/64662), this PR does not provide an e2e test, due to concerns that a large image may add too much overhead to the testing infrastructure and pipeline. We should add e2e tests in follow-up PRs once suitably large image(s) are available.
- Compared to [#64662](https://github.com/kubernetes/kubernetes/pull/64662), this PR simplifies the code changes and keeps them under test/integration/scheduler/.
- The PR contains a bug fix for [#65745](https://github.com/kubernetes/kubernetes/pull/65745) - caught by the integration test - where the image states were not properly cloned into the scheduler's cachedNodeInfoMap. We might split this fix into a separate PR.

The integration test covers the following scenario: a pod requiring a large image (~3GB) is submitted to the cluster, and exactly one node in the cluster already has that large image; the pod should get scheduled to that node. We might also consider whether more scenarios are desired.

**Which issue(s) this PR fixes** *(optional, in `fixes #<issue number>(, fixes #<issue_number>, ...)` format, will close the issue(s) when PR gets merged)*:
Fixes #

**Special notes for your reviewer**: Kindly ping @resouer and @bsalamat

**Release note**:
```release-note
None
```
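For readers unfamiliar with what image locality scoring does, the sketch below illustrates the general idea behind a priority like `priorities.ImageLocalityPriorityMap`: a node that already holds more bytes of the pod's requested images scores higher, with the total clamped to a bounded range and mapped onto a 0-10 priority scale. The thresholds, names, and normalization here are illustrative assumptions, not the exact kube-scheduler implementation.

```go
package main

import "fmt"

// Illustrative only: a rough sketch of the image-locality idea, not the real
// scheduler code. The size thresholds and the linear normalization below are
// assumptions made for this example.
const (
	minImageSize = int64(23 * 1024 * 1024)   // assumed floor below which locality is ignored
	maxImageSize = int64(1000 * 1024 * 1024) // assumed cap above which the score saturates
	maxPriority  = int64(10)
)

// imageLocalityScore returns a 0..10 score for a node based on how many bytes
// of the pod's requested images are already present on that node.
func imageLocalityScore(requested []string, imageSizesOnNode map[string]int64) int64 {
	var sum int64
	for _, img := range requested {
		sum += imageSizesOnNode[img] // 0 if the image is not on the node
	}
	if sum < minImageSize {
		return 0
	}
	if sum > maxImageSize {
		return maxPriority
	}
	return maxPriority * (sum - minImageSize) / (maxImageSize - minImageSize)
}

func main() {
	requested := []string{"fake-large-image:v1"}
	nodeWithImage := map[string]int64{"fake-large-image:v1": 3000 * 1024 * 1024}
	nodeWithout := map[string]int64{}
	fmt.Println(imageLocalityScore(requested, nodeWithImage)) // 10: node already has the large image
	fmt.Println(imageLocalityScore(requested, nodeWithout))   // 0: nothing is cached locally
}
```

This is also why the integration test below uses a ~3GB fake image: the size difference between the node that has the image and the nodes that do not must be large enough to dominate the other default priorities.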
commit da25aaa39e
@@ -94,10 +94,6 @@ func init() {
 	// Register the priority function so that its available
 	// but do not include it as part of the default priorities
 	factory.RegisterPriorityFunction2("EqualPriority", core.EqualPriorityMap, nil, 1)
-	// ImageLocalityPriority prioritizes nodes based on locality of images requested by a pod. Nodes with larger size
-	// of already-installed packages required by the pod will be preferred over nodes with no already-installed
-	// packages required by the pod or a small total size of already-installed packages required by the pod.
-	factory.RegisterPriorityFunction2("ImageLocalityPriority", priorities.ImageLocalityPriorityMap, nil, 1)
 	// Optional, cluster-autoscaler friendly priority function - give used nodes higher priority.
 	factory.RegisterPriorityFunction2("MostRequestedPriority", priorities.MostRequestedPriorityMap, nil, 1)
 	factory.RegisterPriorityFunction2(
@@ -260,6 +256,9 @@ func defaultPriorities() sets.String {

 		// Prioritizes nodes that marked with taint which pod can tolerate.
 		factory.RegisterPriorityFunction2("TaintTolerationPriority", priorities.ComputeTaintTolerationPriorityMap, priorities.ComputeTaintTolerationPriorityReduce, 1),
+
+		// ImageLocalityPriority prioritizes nodes that have images requested by the pod present.
+		factory.RegisterPriorityFunction2("ImageLocalityPriority", priorities.ImageLocalityPriorityMap, nil, 1),
 	)
 }
@@ -59,7 +59,8 @@ func TestDefaultPriorities(t *testing.T) {
 		"BalancedResourceAllocation",
 		"NodePreferAvoidPodsPriority",
 		"NodeAffinityPriority",
-		"TaintTolerationPriority")
+		"TaintTolerationPriority",
+		"ImageLocalityPriority")
 	if expected := defaultPriorities(); !result.Equal(expected) {
 		t.Errorf("expected %v got %v", expected, result)
 	}
@@ -393,7 +393,7 @@ func (n *NodeInfo) Clone() *NodeInfo {
 		diskPressureCondition: n.diskPressureCondition,
 		pidPressureCondition:  n.pidPressureCondition,
 		usedPorts:             make(util.HostPortInfo),
-		imageStates:           make(map[string]*ImageStateSummary),
+		imageStates:           n.imageStates,
 		generation:            n.generation,
 	}
 	if len(n.pods) > 0 {
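The one-line change above is easy to miss, so here is a stripped-down sketch (with stand-in types, not the scheduler's real NodeInfo or ImageStateSummary) of why it matters: a clone built with a fresh, empty map hides the node's image state from anything that reads the clone, such as image locality scoring, while carrying the map over keeps that information visible.

```go
package main

import "fmt"

// imageState is a stand-in for the scheduler's per-node image summary; the
// real type lives in the scheduler cache package. This is an illustration only.
type imageState struct {
	Size     int64
	NumNodes int
}

type nodeInfo struct {
	imageStates map[string]*imageState
}

// cloneDroppingImages mirrors the pre-fix behavior: the clone gets a fresh,
// empty map, so the cloned node appears to have no images at all.
func (n *nodeInfo) cloneDroppingImages() *nodeInfo {
	return &nodeInfo{imageStates: make(map[string]*imageState)}
}

// cloneKeepingImages mirrors the fixed behavior: the clone carries the image
// states over, so priority functions reading the clone still see them.
func (n *nodeInfo) cloneKeepingImages() *nodeInfo {
	return &nodeInfo{imageStates: n.imageStates}
}

func main() {
	n := &nodeInfo{imageStates: map[string]*imageState{
		"fake-large-image:v1": {Size: 3000 * 1024 * 1024, NumNodes: 1},
	}}
	fmt.Println(len(n.cloneDroppingImages().imageStates)) // 0: image locality sees nothing
	fmt.Println(len(n.cloneKeepingImages().imageStates))  // 1: image locality sees the image
}
```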
@@ -22,6 +22,7 @@ import (
 	"k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	testutils "k8s.io/kubernetes/test/utils"
+	"strings"
 )

 // This file tests the scheduler priority functions.
@@ -172,3 +173,64 @@ func TestPodAffinity(t *testing.T) {
 	}
 	t.Errorf("Pod %v got scheduled on an unexpected node: %v.", podName, pod.Spec.NodeName)
 }
+
+// TestImageLocality verifies that the scheduler's image locality priority function
+// works correctly, i.e., the pod gets scheduled to the node where its container images are ready.
+func TestImageLocality(t *testing.T) {
+	context := initTest(t, "image-locality")
+	defer cleanupTest(t, context)
+
+	// Add a few nodes.
+	_, err := createNodes(context.clientSet, "testnode", nil, 10)
+	if err != nil {
+		t.Fatalf("cannot create nodes: %v", err)
+	}
+
+	// We use a fake large image as the test image used by the pod, which has relatively large image size.
+	image := v1.ContainerImage{
+		Names: []string{
+			"fake-large-image:v1",
+		},
+		SizeBytes: 3000 * 1024 * 1024,
+	}
+
+	// Create a node with the large image
+	nodeWithLargeImage, err := createNodeWithImages(context.clientSet, "testnode-large-image", nil, []v1.ContainerImage{image})
+	if err != nil {
+		t.Fatalf("cannot create node with a large image: %v", err)
+	}
+
+	// Create a pod with containers each having the specified image.
+	podName := "pod-using-large-image"
+	pod, err := runPodWithContainers(context.clientSet, initPodWithContainers(context.clientSet, &podWithContainersConfig{
+		Name:       podName,
+		Namespace:  context.ns.Name,
+		Containers: makeContainersWithImages(image.Names),
+	}))
+	if err != nil {
+		t.Fatalf("error running pod with images: %v", err)
+	}
+	if pod.Spec.NodeName != nodeWithLargeImage.Name {
+		t.Errorf("pod %v got scheduled on an unexpected node: %v. Expected node: %v.", podName, pod.Spec.NodeName, nodeWithLargeImage.Name)
+	} else {
+		t.Logf("pod %v got successfully scheduled on node %v.", podName, pod.Spec.NodeName)
+	}
+}
+
+// makeContainersWithImages returns a list of v1.Container objects for each given image. Duplicates of an image are ignored,
+// i.e., each image is used only once.
+func makeContainersWithImages(images []string) []v1.Container {
+	var containers []v1.Container
+	usedImages := make(map[string]struct{})
+
+	for _, image := range images {
+		if _, ok := usedImages[image]; !ok {
+			containers = append(containers, v1.Container{
+				Name:  strings.Replace(image, ":", "-", -1) + "-container",
+				Image: image,
+			})
+			usedImages[image] = struct{}{}
+		}
+	}
+	return containers
+}
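As a quick illustration of the deduplication behavior of `makeContainersWithImages`, the standalone sketch below (using a stand-in `container` type instead of `v1.Container`, so it can run without the Kubernetes API packages) shows how repeated image names collapse to a single container and how the container name is derived from the image name.

```go
package main

import (
	"fmt"
	"strings"
)

// container is a minimal stand-in for v1.Container for this illustration.
type container struct {
	Name  string
	Image string
}

// makeContainersWithImages mirrors the helper added in the diff above:
// one container per distinct image, with ":" replaced by "-" in the name.
func makeContainersWithImages(images []string) []container {
	var containers []container
	used := make(map[string]struct{})
	for _, image := range images {
		if _, ok := used[image]; !ok {
			containers = append(containers, container{
				Name:  strings.Replace(image, ":", "-", -1) + "-container",
				Image: image,
			})
			used[image] = struct{}{}
		}
	}
	return containers
}

func main() {
	cs := makeContainersWithImages([]string{"fake-large-image:v1", "fake-large-image:v1"})
	fmt.Println(len(cs), cs[0].Name) // 1 fake-large-image-v1-container
}
```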
@@ -154,6 +154,7 @@ func TestSchedulerCreationFromConfigMap(t *testing.T) {
 				"NodePreferAvoidPodsPriority",
 				"SelectorSpreadPriority",
 				"TaintTolerationPriority",
+				"ImageLocalityPriority",
 			),
 		},
 		{
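The ConfigMap-based test above exercises scheduler creation from a user-supplied policy. For context, a policy that explicitly lists ImageLocalityPriority (now also part of the defaults) would look roughly like the payload below; the ConfigMap wiring (name, namespace, data key) used by the test is omitted here, and the field layout should be read as the v1 Policy format of that era rather than a verified spec.

```go
package main

import "fmt"

// policyJSON sketches a scheduler policy that explicitly enables
// ImageLocalityPriority; treat the exact fields and weights as assumptions.
const policyJSON = `{
  "kind": "Policy",
  "apiVersion": "v1",
  "priorities": [
    {"name": "ImageLocalityPriority", "weight": 1},
    {"name": "TaintTolerationPriority", "weight": 1}
  ]
}`

func main() {
	fmt.Println(policyJSON)
}
```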
@@ -325,24 +325,35 @@ func waitForNodeLabels(cs clientset.Interface, nodeName string, labels map[strin
 	return wait.Poll(time.Millisecond*100, wait.ForeverTestTimeout, nodeHasLabels(cs, nodeName, labels))
 }

-// createNode creates a node with the given resource list and
-// returns a pointer and error status. If 'res' is nil, a predefined amount of
+// initNode returns a node with the given resource list and images. If 'res' is nil, a predefined amount of
 // resource will be used.
-func createNode(cs clientset.Interface, name string, res *v1.ResourceList) (*v1.Node, error) {
+func initNode(name string, res *v1.ResourceList, images []v1.ContainerImage) *v1.Node {
 	// if resource is nil, we use a default amount of resources for the node.
 	if res == nil {
 		res = &v1.ResourceList{
 			v1.ResourcePods: *resource.NewQuantity(32, resource.DecimalSI),
 		}
 	}

 	n := &v1.Node{
 		ObjectMeta: metav1.ObjectMeta{Name: name},
 		Spec:       v1.NodeSpec{Unschedulable: false},
 		Status: v1.NodeStatus{
 			Capacity: *res,
+			Images:   images,
 		},
 	}
-	return cs.CoreV1().Nodes().Create(n)
+	return n
 }

+// createNode creates a node with the given resource list.
+func createNode(cs clientset.Interface, name string, res *v1.ResourceList) (*v1.Node, error) {
+	return cs.CoreV1().Nodes().Create(initNode(name, res, nil))
+}
+
+// createNodeWithImages creates a node with the given resource list and images.
+func createNodeWithImages(cs clientset.Interface, name string, res *v1.ResourceList, images []v1.ContainerImage) (*v1.Node, error) {
+	return cs.CoreV1().Nodes().Create(initNode(name, res, images))
+}
+
 // updateNodeStatus updates the status of node.
@@ -492,6 +503,43 @@ func runPausePod(cs clientset.Interface, pod *v1.Pod) (*v1.Pod, error) {
 	return pod, nil
 }

+type podWithContainersConfig struct {
+	Name       string
+	Namespace  string
+	Containers []v1.Container
+}
+
+// initPodWithContainers initializes a pod API object from the given config. This is used primarily for generating
+// pods with containers each having a specific image.
+func initPodWithContainers(cs clientset.Interface, conf *podWithContainersConfig) *v1.Pod {
+	pod := &v1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      conf.Name,
+			Namespace: conf.Namespace,
+		},
+		Spec: v1.PodSpec{
+			Containers: conf.Containers,
+		},
+	}
+	return pod
+}
+
+// runPodWithContainers creates a pod with given config and containers and waits
+// until it is scheduled. It returns its pointer and error status.
+func runPodWithContainers(cs clientset.Interface, pod *v1.Pod) (*v1.Pod, error) {
+	pod, err := cs.CoreV1().Pods(pod.Namespace).Create(pod)
+	if err != nil {
+		return nil, fmt.Errorf("Error creating pod-with-containers: %v", err)
+	}
+	if err = waitForPodToSchedule(cs, pod); err != nil {
+		return pod, fmt.Errorf("Pod %v didn't schedule successfully. Error: %v", pod.Name, err)
+	}
+	if pod, err = cs.CoreV1().Pods(pod.Namespace).Get(pod.Name, metav1.GetOptions{}); err != nil {
+		return pod, fmt.Errorf("Error getting pod %v info: %v", pod.Name, err)
+	}
+	return pod, nil
+}
+
 // podDeleted returns true if a pod is not found in the given namespace.
 func podDeleted(c clientset.Interface, podNamespace, podName string) wait.ConditionFunc {
 	return func() (bool, error) {