Merge pull request #18458 from hongchaodeng/perf

Auto commit by PR queue bot
k8s-merge-robot 2015-12-21 01:54:09 -08:00
commit 2efc738d5b
7 changed files with 420 additions and 131 deletions

View File

@@ -39,6 +39,7 @@ kube::test::find_dirs() {
-o -path './test/e2e/*' \
-o -path './test/e2e_node/*' \
-o -path './test/integration/*' \
-o -path './test/component/scheduler/perf/*' \
\) -prune \
\) -name '*_test.go' -print0 | xargs -0n1 dirname | sed 's|^\./||' | sort -u
)

View File

@@ -0,0 +1,75 @@
Scheduler Performance Test
======
Motivation
------
We already have a performance testing system, Kubemark. However, Kubemark requires setting up and bootstrapping a whole cluster, which takes a lot of time.
We want a standard way to reproduce scheduling-latency metrics and to benchmark the scheduler as simply and quickly as possible. We have the following goals:
- Save time on testing
  - The test and benchmark can be run on a single box.
    We set up only the components necessary for scheduling, without booting up a whole cluster.
- Profile runtime metrics to find bottlenecks
  - Write scheduler integration tests that focus on performance measurement.
    Take advantage of Go's profiling tools to collect fine-grained metrics,
    such as CPU, memory, and block profiles.
- Reproduce test results easily
  - We want a known place for performance-related scheduler tests.
    Developers should be able to run one script to collect all the information they need.
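The profiles come from the standard `go test` flags (`-cpuprofile`, `-memprofile`, `-blockprofile`); the `test-performance.sh` script below passes `-test.cpuprofile` to the compiled test binary.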
Currently the test suite has the following:

- density test (a new Go test)
  - schedules 30k pods on 1000 (fake) nodes and 3k pods on 100 (fake) nodes
  - prints the scheduling rate every second,
    so you can see how the rate changes as the number of scheduled pods grows
- benchmark
  - uses `go test -bench` and reports nanoseconds/op
  - schedules b.N pods when the cluster has N nodes and P scheduled pods; since one round takes a relatively long time to finish, b.N is kept small: 10 - 100
How To Run
------
```
cd kubernetes/test/component/scheduler/perf
./test-performance.sh
```
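To run only the benchmarks, you can reproduce the script's invocation directly, e.g. `go test -bench=. -run=xxxx` in the same directory.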

View File

@@ -0,0 +1,79 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package benchmark
import (
"testing"
"time"
)
// BenchmarkScheduling100Nodes0Pods benchmarks the scheduling rate
// when the cluster has 100 nodes and 0 scheduled pods
func BenchmarkScheduling100Nodes0Pods(b *testing.B) {
benchmarkScheduling(100, 0, b)
}
// BenchmarkScheduling100Nodes1000Pods benchmarks the scheduling rate
// when the cluster has 100 nodes and 1000 scheduled pods
func BenchmarkScheduling100Nodes1000Pods(b *testing.B) {
benchmarkScheduling(100, 1000, b)
}
// BenchmarkScheduling1000Nodes0Pods benchmarks the scheduling rate
// when the cluster has 1000 nodes and 0 scheduled pods
func BenchmarkScheduling1000Nodes0Pods(b *testing.B) {
benchmarkScheduling(1000, 0, b)
}
// BenchmarkScheduling1000Nodes1000Pods benchmarks the scheduling rate
// when the cluster has 1000 nodes and 1000 scheduled pods
func BenchmarkScheduling1000Nodes1000Pods(b *testing.B) {
benchmarkScheduling(1000, 1000, b)
}
// benchmarkScheduling benchmarks the scheduling rate with a specific number of nodes
// and a specific number of pods already scheduled. Since one operation takes a relatively
// long time, b.N should be small: 10 - 100.
func benchmarkScheduling(numNodes, numScheduledPods int, b *testing.B) {
schedulerConfigFactory, finalFunc := mustSetupScheduler()
defer finalFunc()
c := schedulerConfigFactory.Client
makeNodes(c, numNodes)
makePods(c, numScheduledPods)
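// Wait until all the initial pods are scheduled; the timed section below should only
// measure scheduling of the b.N additional pods.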
for {
scheduled := schedulerConfigFactory.ScheduledPodLister.Store.List()
if len(scheduled) >= numScheduledPods {
break
}
time.Sleep(1 * time.Second)
}
// start benchmark
b.ResetTimer()
makePods(c, b.N)
for {
// This can potentially affect the performance of the scheduler, since List() is done under a mutex.
// TODO: Setup watch on apiserver and wait until all pods scheduled.
scheduled := schedulerConfigFactory.ScheduledPodLister.Store.List()
if len(scheduled) >= numScheduledPods+b.N {
break
}
// Note: the sleep might introduce a slight inaccuracy in the benchmark results;
// since the total run time is relatively long, this should not be a concern.
time.Sleep(100 * time.Millisecond)
}
}

View File

@@ -0,0 +1,61 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package benchmark
import (
"fmt"
"testing"
"time"
)
// TestSchedule100Node3KPods schedules 3k pods on 100 nodes.
func TestSchedule100Node3KPods(t *testing.T) {
schedulePods(100, 3000)
}
// TestSchedule1000Node30KPods schedules 30k pods on 1000 nodes.
func TestSchedule1000Node30KPods(t *testing.T) {
schedulePods(1000, 30000)
}
// schedulePods schedules a specific number of pods on a specific number of nodes.
// It is used to learn the scheduling throughput on various cluster sizes, and how the
// throughput changes as more and more pods are scheduled.
// It does not return until all pods are scheduled.
func schedulePods(numNodes, numPods int) {
schedulerConfigFactory, destroyFunc := mustSetupScheduler()
defer destroyFunc()
c := schedulerConfigFactory.Client
makeNodes(c, numNodes)
makePods(c, numPods)
prev := 0
start := time.Now()
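// Poll once a second, printing the per-second scheduling rate and the running total.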
for {
// This can potentially affect the performance of the scheduler, since List() is done under a mutex.
// Listing 10000 pods is an expensive operation, so running it frequently may impact the scheduler.
// TODO: Setup watch on apiserver and wait until all pods scheduled.
scheduled := schedulerConfigFactory.ScheduledPodLister.Store.List()
fmt.Printf("%ds\trate: %d\ttotal: %d\n", time.Since(start)/time.Second, len(scheduled)-prev, len(scheduled))
if len(scheduled) >= numPods {
return
}
prev = len(scheduled)
time.Sleep(1 * time.Second)
}
}

View File

@@ -0,0 +1,46 @@
#!/usr/bin/env bash
# Copyright 2014 The Kubernetes Authors All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -o errexit
set -o nounset
set -o pipefail
pushd "../../../.."
source "./hack/lib/util.sh"
source "./hack/lib/logging.sh"
source "./hack/lib/etcd.sh"
popd
cleanup() {
kube::etcd::cleanup
kube::log::status "performance test cleanup complete"
}
trap cleanup EXIT
kube::etcd::start
kube::log::status "performance test start"
# TODO: set log-dir and prof output dir.
DIR_BASENAME=$(basename "$(pwd)")
go test -c -o "${DIR_BASENAME}.test"
# We use the benchmark suite for profiling, because it only runs a few pods and
# should therefore have less variance.
"./${DIR_BASENAME}.test" -test.bench=. -test.run=xxxx -test.cpuprofile=prof.out -logtostderr=false
kube::log::status "benchmark tests finished"
# Run the density tests. They might take a long time.
"./${DIR_BASENAME}.test" -test.run=. -test.timeout=60m
kube::log::status "density tests finished"

View File

@@ -0,0 +1,158 @@
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package benchmark
import (
"net/http"
"net/http/httptest"
"github.com/golang/glog"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/api/resource"
"k8s.io/kubernetes/pkg/api/testapi"
"k8s.io/kubernetes/pkg/client/record"
client "k8s.io/kubernetes/pkg/client/unversioned"
"k8s.io/kubernetes/pkg/master"
"k8s.io/kubernetes/plugin/pkg/scheduler"
_ "k8s.io/kubernetes/plugin/pkg/scheduler/algorithmprovider"
"k8s.io/kubernetes/plugin/pkg/scheduler/factory"
"k8s.io/kubernetes/test/integration/framework"
)
// mustSetupScheduler starts the following components:
// - k8s api server (a.k.a. master)
// - scheduler
// It returns the scheduler config factory and a destroyFunc which should be used to
// clean up resources when finished.
// Notes on rate limiting:
// - The BindPodsRateLimiter is nil, meaning no rate limiting.
// - The client rate limit (QPS and burst) is set to 5000.
func mustSetupScheduler() (schedulerConfigFactory *factory.ConfigFactory, destroyFunc func()) {
framework.DeleteAllEtcdKeys()
var m *master.Master
masterConfig := framework.NewIntegrationTestMasterConfig()
m = master.New(masterConfig)
s := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) {
m.Handler.ServeHTTP(w, req)
}))
c := client.NewOrDie(&client.Config{
Host: s.URL,
GroupVersion: testapi.Default.GroupVersion(),
QPS: 5000.0,
Burst: 5000,
})
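// Passing nil for the rate limiter means no BindPodsRateLimiter (see the note above).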
schedulerConfigFactory = factory.NewConfigFactory(c, nil)
schedulerConfig, err := schedulerConfigFactory.Create()
if err != nil {
panic("Couldn't create scheduler config: " + err.Error())
}
eventBroadcaster := record.NewBroadcaster()
schedulerConfig.Recorder = eventBroadcaster.NewRecorder(api.EventSource{Component: "scheduler"})
eventBroadcaster.StartRecordingToSink(c.Events(""))
scheduler.New(schedulerConfig).Run()
destroyFunc = func() {
glog.Infof("destroying")
close(schedulerConfig.StopEverything)
s.Close()
glog.Infof("destroyed")
}
return
}
func makeNodes(c client.Interface, nodeCount int) {
glog.Infof("making %d nodes", nodeCount)
baseNode := &api.Node{
ObjectMeta: api.ObjectMeta{
GenerateName: "scheduler-test-node-",
},
Spec: api.NodeSpec{
ExternalID: "foobar",
},
Status: api.NodeStatus{
Capacity: api.ResourceList{
api.ResourcePods: *resource.NewQuantity(32, resource.DecimalSI),
api.ResourceCPU: resource.MustParse("4"),
api.ResourceMemory: resource.MustParse("32Gi"),
},
Phase: api.NodeRunning,
Conditions: []api.NodeCondition{
{Type: api.NodeReady, Status: api.ConditionTrue},
},
},
}
for i := 0; i < nodeCount; i++ {
if _, err := c.Nodes().Create(baseNode); err != nil {
panic("error creating node: " + err.Error())
}
}
}
// makePods sets up the specified number of scheduled pods.
// Currently each pod goes through the full scheduling path, so setting up a large
// number of pods is very slow.
// TODO: Set up pods evenly on all nodes and quickly/non-linearly.
func makePods(c client.Interface, podCount int) {
glog.Infof("making %d pods", podCount)
basePod := &api.Pod{
ObjectMeta: api.ObjectMeta{
GenerateName: "scheduler-test-pod-",
},
Spec: api.PodSpec{
Containers: []api.Container{{
Name: "pause",
Image: "gcr.io/google_containers/pause:1.0",
Resources: api.ResourceRequirements{
Limits: api.ResourceList{
api.ResourceCPU: resource.MustParse("100m"),
api.ResourceMemory: resource.MustParse("500Mi"),
},
Requests: api.ResourceList{
api.ResourceCPU: resource.MustParse("100m"),
api.ResourceMemory: resource.MustParse("500Mi"),
},
},
}},
},
}
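// Fan pod creation out over a fixed pool of worker goroutines. Note that makePods
// returns without waiting for the workers to finish; callers poll the scheduled-pod
// count to detect completion.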
threads := 30
remaining := make(chan int, 1000)
go func() {
for i := 0; i < podCount; i++ {
remaining <- i
}
close(remaining)
}()
for i := 0; i < threads; i++ {
go func() {
for {
_, ok := <-remaining
if !ok {
return
}
for {
_, err := c.Pods("default").Create(basePod)
if err == nil {
break
}
}
}
}()
}
}

View File

@@ -24,7 +24,6 @@ import (
"fmt"
"net/http"
"net/http/httptest"
"sync"
"testing"
"time"
@@ -274,133 +273,3 @@ func DoTestUnschedulableNodes(t *testing.T, restClient *client.Client, nodeStore
}
}
}
func BenchmarkScheduling(b *testing.B) {
framework.DeleteAllEtcdKeys()
var m *master.Master
s := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) {
m.Handler.ServeHTTP(w, req)
}))
defer s.Close()
masterConfig := framework.NewIntegrationTestMasterConfig()
m = master.New(masterConfig)
c := client.NewOrDie(&client.Config{
Host: s.URL,
GroupVersion: testapi.Default.GroupVersion(),
QPS: 5000.0,
Burst: 5000,
})
schedulerConfigFactory := factory.NewConfigFactory(c, nil)
schedulerConfig, err := schedulerConfigFactory.Create()
if err != nil {
b.Fatalf("Couldn't create scheduler config: %v", err)
}
eventBroadcaster := record.NewBroadcaster()
schedulerConfig.Recorder = eventBroadcaster.NewRecorder(api.EventSource{Component: "scheduler"})
eventBroadcaster.StartRecordingToSink(c.Events(""))
scheduler.New(schedulerConfig).Run()
defer close(schedulerConfig.StopEverything)
makeNNodes(c, 1000)
N := b.N
b.ResetTimer()
makeNPods(c, N)
for {
objs := schedulerConfigFactory.ScheduledPodLister.Store.List()
if len(objs) >= N {
fmt.Printf("%v pods scheduled.\n", len(objs))
/* // To prove that this actually works:
for _, o := range objs {
fmt.Printf("%s\n", o.(*api.Pod).Spec.NodeName)
}
*/
break
}
time.Sleep(time.Millisecond)
}
b.StopTimer()
}
func makeNNodes(c client.Interface, N int) {
baseNode := &api.Node{
ObjectMeta: api.ObjectMeta{
GenerateName: "scheduler-test-node-",
},
Spec: api.NodeSpec{
ExternalID: "foobar",
},
Status: api.NodeStatus{
Capacity: api.ResourceList{
api.ResourcePods: *resource.NewQuantity(32, resource.DecimalSI),
api.ResourceCPU: resource.MustParse("4"),
api.ResourceMemory: resource.MustParse("32Gi"),
},
Phase: api.NodeRunning,
Conditions: []api.NodeCondition{
{Type: api.NodeReady, Status: api.ConditionTrue},
},
},
}
for i := 0; i < N; i++ {
if _, err := c.Nodes().Create(baseNode); err != nil {
panic("error creating node: " + err.Error())
}
}
}
func makeNPods(c client.Interface, N int) {
basePod := &api.Pod{
ObjectMeta: api.ObjectMeta{
GenerateName: "scheduler-test-pod-",
},
Spec: api.PodSpec{
Containers: []api.Container{{
Name: "pause",
Image: "gcr.io/google_containers/pause:1.0",
Resources: api.ResourceRequirements{
Limits: api.ResourceList{
api.ResourceCPU: resource.MustParse("100m"),
api.ResourceMemory: resource.MustParse("500Mi"),
},
Requests: api.ResourceList{
api.ResourceCPU: resource.MustParse("100m"),
api.ResourceMemory: resource.MustParse("500Mi"),
},
},
}},
},
}
wg := sync.WaitGroup{}
threads := 30
wg.Add(threads)
remaining := make(chan int, N)
go func() {
for i := 0; i < N; i++ {
remaining <- i
}
close(remaining)
}()
for i := 0; i < threads; i++ {
go func() {
defer wg.Done()
for {
_, ok := <-remaining
if !ok {
return
}
for {
_, err := c.Pods("default").Create(basePod)
if err == nil {
break
}
}
}
}()
}
wg.Wait()
}