diff --git a/pkg/scheduler/algorithmprovider/defaults/defaults.go b/pkg/scheduler/algorithmprovider/defaults/defaults.go
index 94d73b7e7f..475b7ea042 100644
--- a/pkg/scheduler/algorithmprovider/defaults/defaults.go
+++ b/pkg/scheduler/algorithmprovider/defaults/defaults.go
@@ -94,10 +94,6 @@ func init() {
     // Register the priority function so that its available
     // but do not include it as part of the default priorities
     factory.RegisterPriorityFunction2("EqualPriority", core.EqualPriorityMap, nil, 1)
-    // ImageLocalityPriority prioritizes nodes based on locality of images requested by a pod. Nodes with larger size
-    // of already-installed packages required by the pod will be preferred over nodes with no already-installed
-    // packages required by the pod or a small total size of already-installed packages required by the pod.
-    factory.RegisterPriorityFunction2("ImageLocalityPriority", priorities.ImageLocalityPriorityMap, nil, 1)
     // Optional, cluster-autoscaler friendly priority function - give used nodes higher priority.
     factory.RegisterPriorityFunction2("MostRequestedPriority", priorities.MostRequestedPriorityMap, nil, 1)
     factory.RegisterPriorityFunction2(
@@ -260,6 +256,9 @@ func defaultPriorities() sets.String {
         // Prioritizes nodes that marked with taint which pod can tolerate.
         factory.RegisterPriorityFunction2("TaintTolerationPriority", priorities.ComputeTaintTolerationPriorityMap, priorities.ComputeTaintTolerationPriorityReduce, 1),
+
+        // ImageLocalityPriority prioritizes nodes that have images requested by the pod present.
+        factory.RegisterPriorityFunction2("ImageLocalityPriority", priorities.ImageLocalityPriorityMap, nil, 1),
     )
 }
diff --git a/pkg/scheduler/algorithmprovider/defaults/defaults_test.go b/pkg/scheduler/algorithmprovider/defaults/defaults_test.go
index 8dbc2f1536..90ee134342 100644
--- a/pkg/scheduler/algorithmprovider/defaults/defaults_test.go
+++ b/pkg/scheduler/algorithmprovider/defaults/defaults_test.go
@@ -59,7 +59,8 @@ func TestDefaultPriorities(t *testing.T) {
         "BalancedResourceAllocation",
         "NodePreferAvoidPodsPriority",
         "NodeAffinityPriority",
-        "TaintTolerationPriority")
+        "TaintTolerationPriority",
+        "ImageLocalityPriority")
     if expected := defaultPriorities(); !result.Equal(expected) {
         t.Errorf("expected %v got %v", expected, result)
     }
diff --git a/pkg/scheduler/cache/node_info.go b/pkg/scheduler/cache/node_info.go
index 2cad6573fa..c6b5f96cd8 100644
--- a/pkg/scheduler/cache/node_info.go
+++ b/pkg/scheduler/cache/node_info.go
@@ -393,7 +393,7 @@ func (n *NodeInfo) Clone() *NodeInfo {
         diskPressureCondition: n.diskPressureCondition,
         pidPressureCondition:  n.pidPressureCondition,
         usedPorts:             make(util.HostPortInfo),
-        imageStates:           make(map[string]*ImageStateSummary),
+        imageStates:           n.imageStates,
         generation:            n.generation,
     }
     if len(n.pods) > 0 {
diff --git a/test/integration/scheduler/priorities_test.go b/test/integration/scheduler/priorities_test.go
index 6dd75ee670..5992dcffeb 100644
--- a/test/integration/scheduler/priorities_test.go
+++ b/test/integration/scheduler/priorities_test.go
@@ -22,6 +22,7 @@ import (
     "k8s.io/api/core/v1"
     metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
     testutils "k8s.io/kubernetes/test/utils"
+    "strings"
 )

 // This file tests the scheduler priority functions.
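Note on the hunks above: the ImageLocalityPriority registration is only moved from init() into defaultPriorities(); the priority function itself is unchanged. For readers unfamiliar with map-style priority functions, the following is a minimal, self-contained Go sketch of the general idea behind image-locality scoring: sum the sizes of the pod's requested images already present on a node and map that sum onto the scheduler's 0-10 priority range. This is not the upstream priorities.ImageLocalityPriorityMap implementation; the names and the size bounds below are illustrative assumptions only.

// Sketch (not the upstream implementation): score a node by the total size of the
// pod's requested images that the node has already pulled.
package main

import "fmt"

const (
    maxPriority = 10
    // Hypothetical bounds: sums at or below minImgSize score 0, sums at or above
    // maxImgSize score maxPriority; everything in between scales linearly.
    minImgSize int64 = 23 * 1024 * 1024
    maxImgSize int64 = 1000 * 1024 * 1024
)

// imageLocalityScore returns a 0..maxPriority score for one node, given the images
// the pod requests and a map of image name -> size (bytes) present on the node.
func imageLocalityScore(requestedImages []string, nodeImages map[string]int64) int64 {
    var sumBytes int64
    for _, img := range requestedImages {
        if size, ok := nodeImages[img]; ok {
            sumBytes += size
        }
    }
    switch {
    case sumBytes <= minImgSize:
        return 0
    case sumBytes >= maxImgSize:
        return maxPriority
    default:
        return maxPriority * (sumBytes - minImgSize) / (maxImgSize - minImgSize)
    }
}

func main() {
    node := map[string]int64{"fake-large-image:v1": 3000 * 1024 * 1024}
    fmt.Println(imageLocalityScore([]string{"fake-large-image:v1"}, node)) // 10
    fmt.Println(imageLocalityScore([]string{"missing:v1"}, node))          // 0
}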
@@ -172,3 +173,64 @@ func TestPodAffinity(t *testing.T) {
     }
     t.Errorf("Pod %v got scheduled on an unexpected node: %v.", podName, pod.Spec.NodeName)
 }
+
+// TestImageLocality verifies that the scheduler's image locality priority function
+// works correctly, i.e., the pod gets scheduled to the node where its container images are already present.
+func TestImageLocality(t *testing.T) {
+    context := initTest(t, "image-locality")
+    defer cleanupTest(t, context)
+
+    // Add a few nodes.
+    _, err := createNodes(context.clientSet, "testnode", nil, 10)
+    if err != nil {
+        t.Fatalf("cannot create nodes: %v", err)
+    }
+
+    // We use a fake image with a relatively large size as the test image for the pod.
+    image := v1.ContainerImage{
+        Names: []string{
+            "fake-large-image:v1",
+        },
+        SizeBytes: 3000 * 1024 * 1024,
+    }
+
+    // Create a node with the large image already present.
+    nodeWithLargeImage, err := createNodeWithImages(context.clientSet, "testnode-large-image", nil, []v1.ContainerImage{image})
+    if err != nil {
+        t.Fatalf("cannot create node with a large image: %v", err)
+    }
+
+    // Create a pod with containers each having the specified image.
+    podName := "pod-using-large-image"
+    pod, err := runPodWithContainers(context.clientSet, initPodWithContainers(context.clientSet, &podWithContainersConfig{
+        Name:       podName,
+        Namespace:  context.ns.Name,
+        Containers: makeContainersWithImages(image.Names),
+    }))
+    if err != nil {
+        t.Fatalf("error running pod with images: %v", err)
+    }
+    if pod.Spec.NodeName != nodeWithLargeImage.Name {
+        t.Errorf("pod %v got scheduled on an unexpected node: %v. Expected node: %v.", podName, pod.Spec.NodeName, nodeWithLargeImage.Name)
+    } else {
+        t.Logf("pod %v got successfully scheduled on node %v.", podName, pod.Spec.NodeName)
+    }
+}
+
+// makeContainersWithImages returns a list of v1.Container objects, one for each given image. Duplicates of an image are ignored,
+// i.e., each image is used only once.
+func makeContainersWithImages(images []string) []v1.Container {
+    var containers []v1.Container
+    usedImages := make(map[string]struct{})
+
+    for _, image := range images {
+        if _, ok := usedImages[image]; !ok {
+            containers = append(containers, v1.Container{
+                Name:  strings.Replace(image, ":", "-", -1) + "-container",
+                Image: image,
+            })
+            usedImages[image] = struct{}{}
+        }
+    }
+    return containers
+}
diff --git a/test/integration/scheduler/scheduler_test.go b/test/integration/scheduler/scheduler_test.go
index 2af1054e62..412c0c7cb2 100644
--- a/test/integration/scheduler/scheduler_test.go
+++ b/test/integration/scheduler/scheduler_test.go
@@ -154,6 +154,7 @@ func TestSchedulerCreationFromConfigMap(t *testing.T) {
                 "NodePreferAvoidPodsPriority",
                 "SelectorSpreadPriority",
                 "TaintTolerationPriority",
+                "ImageLocalityPriority",
             ),
         },
         {
diff --git a/test/integration/scheduler/util.go b/test/integration/scheduler/util.go
index c56b8b63d0..d801caa4a0 100644
--- a/test/integration/scheduler/util.go
+++ b/test/integration/scheduler/util.go
@@ -325,24 +325,35 @@ func waitForNodeLabels(cs clientset.Interface, nodeName string, labels map[strin
     return wait.Poll(time.Millisecond*100, wait.ForeverTestTimeout, nodeHasLabels(cs, nodeName, labels))
 }

-// createNode creates a node with the given resource list and
-// returns a pointer and error status. If 'res' is nil, a predefined amount of
+// initNode returns a node with the given resource list and images. If 'res' is nil, a predefined amount of
 // resource will be used.
-func createNode(cs clientset.Interface, name string, res *v1.ResourceList) (*v1.Node, error) {
+func initNode(name string, res *v1.ResourceList, images []v1.ContainerImage) *v1.Node {
     // if resource is nil, we use a default amount of resources for the node.
     if res == nil {
         res = &v1.ResourceList{
             v1.ResourcePods: *resource.NewQuantity(32, resource.DecimalSI),
         }
     }
+
     n := &v1.Node{
         ObjectMeta: metav1.ObjectMeta{Name: name},
         Spec:       v1.NodeSpec{Unschedulable: false},
         Status: v1.NodeStatus{
             Capacity: *res,
+            Images:   images,
         },
     }
-    return cs.CoreV1().Nodes().Create(n)
+    return n
+}
+
+// createNode creates a node with the given resource list.
+func createNode(cs clientset.Interface, name string, res *v1.ResourceList) (*v1.Node, error) {
+    return cs.CoreV1().Nodes().Create(initNode(name, res, nil))
+}
+
+// createNodeWithImages creates a node with the given resource list and images.
+func createNodeWithImages(cs clientset.Interface, name string, res *v1.ResourceList, images []v1.ContainerImage) (*v1.Node, error) {
+    return cs.CoreV1().Nodes().Create(initNode(name, res, images))
+}

 // updateNodeStatus updates the status of node.
@@ -492,6 +503,43 @@ func runPausePod(cs clientset.Interface, pod *v1.Pod) (*v1.Pod, error) {
     return pod, nil
 }

+type podWithContainersConfig struct {
+    Name       string
+    Namespace  string
+    Containers []v1.Container
+}
+
+// initPodWithContainers initializes a pod API object from the given config. This is used primarily for generating
+// pods with containers each having a specific image.
+func initPodWithContainers(cs clientset.Interface, conf *podWithContainersConfig) *v1.Pod {
+    pod := &v1.Pod{
+        ObjectMeta: metav1.ObjectMeta{
+            Name:      conf.Name,
+            Namespace: conf.Namespace,
+        },
+        Spec: v1.PodSpec{
+            Containers: conf.Containers,
+        },
+    }
+    return pod
+}
+
+// runPodWithContainers creates the given pod, typically built by initPodWithContainers, and waits
+// until it is scheduled. It returns the pod's pointer and error status.
+func runPodWithContainers(cs clientset.Interface, pod *v1.Pod) (*v1.Pod, error) {
+    pod, err := cs.CoreV1().Pods(pod.Namespace).Create(pod)
+    if err != nil {
+        return nil, fmt.Errorf("Error creating pod-with-containers: %v", err)
+    }
+    if err = waitForPodToSchedule(cs, pod); err != nil {
+        return pod, fmt.Errorf("Pod %v didn't schedule successfully. Error: %v", pod.Name, err)
+    }
+    if pod, err = cs.CoreV1().Pods(pod.Namespace).Get(pod.Name, metav1.GetOptions{}); err != nil {
+        return pod, fmt.Errorf("Error getting pod %v info: %v", pod.Name, err)
+    }
+    return pod, nil
+}
+
 // podDeleted returns true if a pod is not found in the given namespace.
 func podDeleted(c clientset.Interface, podNamespace, podName string) wait.ConditionFunc {
     return func() (bool, error) {
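A small usage note on the test helpers added above: makeContainersWithImages derives each container name from its image reference by replacing ':' with '-' and appending '-container', and duplicate image names collapse into a single container. For example, makeContainersWithImages([]string{"fake-large-image:v1", "fake-large-image:v1"}) returns one container with Name "fake-large-image-v1-container" and Image "fake-large-image:v1", which TestImageLocality then runs through initPodWithContainers and runPodWithContainers.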
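Finally, a note on the node_info.go change: Clone() now carries imageStates over by sharing the map reference instead of starting the clone with an empty map, so image state recorded on a NodeInfo stays visible through its clone. The sketch below only illustrates that shared-reference semantics with simplified, hypothetical types; it is not the scheduler's NodeInfo.

// Sketch: shared-reference vs. fresh-map semantics on clone (illustrative types only).
package main

import "fmt"

type imageState struct{ sizeBytes int64 }

type nodeInfo struct {
    imageStates map[string]*imageState
}

func (n *nodeInfo) clone() *nodeInfo {
    // Shares the map, mirroring the diff above; a deep copy would instead
    // allocate a new map and copy the entries one by one.
    return &nodeInfo{imageStates: n.imageStates}
}

func main() {
    orig := &nodeInfo{imageStates: map[string]*imageState{}}
    c := orig.clone()
    orig.imageStates["fake-large-image:v1"] = &imageState{sizeBytes: 3000 * 1024 * 1024}
    fmt.Println(len(c.imageStates)) // 1: both point at the same map, so the clone sees the new entry
}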