/*
Copyright 2015 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package e2e
import (
"fmt"
2015-12-08 16:14:48 +00:00
"strings"
2015-09-22 23:38:36 +00:00
"time"
2016-06-03 18:03:35 +00:00
"k8s.io/kubernetes/pkg/api"
2015-09-22 23:38:36 +00:00
client "k8s.io/kubernetes/pkg/client/unversioned"
2016-04-07 22:20:50 +00:00
"k8s.io/kubernetes/pkg/kubelet/api/v1alpha1/stats"
2015-09-22 23:38:36 +00:00
"k8s.io/kubernetes/pkg/util/sets"
2016-07-26 15:13:18 +00:00
"k8s.io/kubernetes/pkg/util/uuid"
2016-04-07 17:21:31 +00:00
"k8s.io/kubernetes/test/e2e/framework"
2015-09-22 23:38:36 +00:00
. "github.com/onsi/ginkgo"
. "github.com/onsi/gomega"
)
const (
	// containerStatsPollingPeriod is the interval at which /stats/container
	// is polled on each node while monitoring is running.
	containerStatsPollingPeriod = 10 * time.Second
	// monitoringTime is how long resource usage is tracked for one test.
	monitoringTime = 20 * time.Minute
	// reportingPeriod is how often intermediate progress/pod counts are
	// logged during the monitoring loop.
	reportingPeriod = 5 * time.Minute
	// imagePrePullingLongTimeout bounds how long we wait for the image
	// prepulling pods to complete before starting a test.
	imagePrePullingLongTimeout = time.Minute * 8
)
2015-12-08 16:14:48 +00:00
// resourceTest describes one resource-usage tracking scenario: how many
// pause pods to run per node and the per-container CPU/memory bounds the
// measured usage must stay under (nil bounds skip verification).
type resourceTest struct {
	// podsPerNode is the number of pods created per schedulable node.
	podsPerNode int
	// cpuLimits holds expected CPU usage upper bounds per system
	// container, keyed by percentile.
	cpuLimits framework.ContainersCPUSummary
	// memLimits holds expected RSS memory upper bounds (in bytes) per
	// system container.
	memLimits framework.ResourceUsagePerContainer
}
2015-09-22 23:38:36 +00:00
func logPodsOnNodes ( c * client . Client , nodeNames [ ] string ) {
for _ , n := range nodeNames {
2016-04-07 17:21:31 +00:00
podList , err := framework . GetKubeletRunningPods ( c , n )
2015-09-22 23:38:36 +00:00
if err != nil {
2016-04-07 17:21:31 +00:00
framework . Logf ( "Unable to retrieve kubelet pods for node %v" , n )
2015-09-22 23:38:36 +00:00
continue
}
2016-04-07 17:21:31 +00:00
framework . Logf ( "%d pods are running on node %v" , len ( podList . Items ) , n )
2015-09-22 23:38:36 +00:00
}
}
2016-04-07 17:21:31 +00:00
// runResourceTrackingTest creates a replication controller running
// podsPerNode pause pods per node, monitors kubelet and runtime resource
// usage for monitoringTime, and then verifies the observed usage against
// the expected CPU percentile and memory limits. Passing nil for either
// expectation skips that verification. The RC is deleted at the end.
func runResourceTrackingTest(f *framework.Framework, podsPerNode int, nodeNames sets.String, rm *framework.ResourceMonitor,
	expectedCPU map[string]map[float64]float64, expectedMemory framework.ResourceUsagePerContainer) {
	numNodes := nodeNames.Len()
	totalPods := podsPerNode * numNodes
	By(fmt.Sprintf("Creating a RC of %d pods and wait until all pods of this RC are running", totalPods))
	// Unique RC name per run so repeated invocations don't collide.
	rcName := fmt.Sprintf("resource%d-%s", totalPods, string(uuid.NewUUID()))

	// TODO: Use a more realistic workload
	Expect(framework.RunRC(framework.RCConfig{
		Client:    f.Client,
		Name:      rcName,
		Namespace: f.Namespace.Name,
		Image:     framework.GetPauseImageName(f.Client),
		Replicas:  totalPods,
	})).NotTo(HaveOccurred())

	// Log once and flush the stats, so the measurement window starts after
	// pod creation rather than including startup churn.
	rm.LogLatest()
	rm.Reset()

	By("Start monitoring resource usage")
	// Periodically dump the cpu summary until the deadline is met.
	// Note that without calling framework.ResourceMonitor.Reset(), the stats
	// would occupy increasingly more memory. This should be fine
	// for the current test duration, but we should reclaim the
	// entries if we plan to monitor longer (e.g., 8 hours).
	deadline := time.Now().Add(monitoringTime)
	for time.Now().Before(deadline) {
		timeLeft := deadline.Sub(time.Now())
		framework.Logf("Still running...%v left", timeLeft)
		// Sleep only as long as remains, so we never overshoot the deadline.
		if timeLeft < reportingPeriod {
			time.Sleep(timeLeft)
		} else {
			time.Sleep(reportingPeriod)
		}
		logPodsOnNodes(f.Client, nodeNames.List())
	}

	By("Reporting overall resource usage")
	logPodsOnNodes(f.Client, nodeNames.List())
	usageSummary, err := rm.GetLatest()
	Expect(err).NotTo(HaveOccurred())
	// TODO(random-liu): Remove the original log when we migrate to new perfdash
	framework.Logf("%s", rm.FormatResourceUsage(usageSummary))
	// Log perf result
	framework.PrintPerfData(framework.ResourceUsageToPerfData(rm.GetMasterNodeLatest(usageSummary)))
	verifyMemoryLimits(f.Client, expectedMemory, usageSummary)

	cpuSummary := rm.GetCPUSummary()
	framework.Logf("%s", rm.FormatCPUSummary(cpuSummary))
	// Log perf result
	framework.PrintPerfData(framework.CPUUsageToPerfData(rm.GetMasterNodeCPUSummary(cpuSummary)))
	verifyCPULimits(expectedCPU, cpuSummary)

	By("Deleting the RC")
	framework.DeleteRCAndPods(f.Client, f.Namespace.Name, rcName)
}
2016-04-07 17:21:31 +00:00
func verifyMemoryLimits ( c * client . Client , expected framework . ResourceUsagePerContainer , actual framework . ResourceUsagePerNode ) {
2016-02-11 00:35:17 +00:00
if expected == nil {
return
}
var errList [ ] string
for nodeName , nodeSummary := range actual {
var nodeErrs [ ] string
for cName , expectedResult := range expected {
container , ok := nodeSummary [ cName ]
if ! ok {
nodeErrs = append ( nodeErrs , fmt . Sprintf ( "container %q: missing" , cName ) )
continue
}
expectedValue := expectedResult . MemoryRSSInBytes
actualValue := container . MemoryRSSInBytes
if expectedValue != 0 && actualValue > expectedValue {
nodeErrs = append ( nodeErrs , fmt . Sprintf ( "container %q: expected RSS memory (MB) < %d; got %d" ,
cName , expectedValue , actualValue ) )
}
}
if len ( nodeErrs ) > 0 {
errList = append ( errList , fmt . Sprintf ( "node %v:\n %s" , nodeName , strings . Join ( nodeErrs , ", " ) ) )
2016-04-07 17:21:31 +00:00
heapStats , err := framework . GetKubeletHeapStats ( c , nodeName )
2016-03-02 19:33:46 +00:00
if err != nil {
2016-04-07 17:21:31 +00:00
framework . Logf ( "Unable to get heap stats from %q" , nodeName )
2016-03-02 19:33:46 +00:00
} else {
2016-04-07 17:21:31 +00:00
framework . Logf ( "Heap stats on %q\n:%v" , nodeName , heapStats )
2016-03-02 19:33:46 +00:00
}
2016-02-11 00:35:17 +00:00
}
}
if len ( errList ) > 0 {
2016-04-07 17:21:31 +00:00
framework . Failf ( "Memory usage exceeding limits:\n %s" , strings . Join ( errList , "\n" ) )
2016-02-11 00:35:17 +00:00
}
}
2016-04-07 17:21:31 +00:00
func verifyCPULimits ( expected framework . ContainersCPUSummary , actual framework . NodesCPUSummary ) {
2015-12-08 16:14:48 +00:00
if expected == nil {
return
}
var errList [ ] string
for nodeName , perNodeSummary := range actual {
var nodeErrs [ ] string
for cName , expectedResult := range expected {
perContainerSummary , ok := perNodeSummary [ cName ]
if ! ok {
nodeErrs = append ( nodeErrs , fmt . Sprintf ( "container %q: missing" , cName ) )
continue
}
for p , expectedValue := range expectedResult {
actualValue , ok := perContainerSummary [ p ]
if ! ok {
nodeErrs = append ( nodeErrs , fmt . Sprintf ( "container %q: missing percentile %v" , cName , p ) )
continue
}
if actualValue > expectedValue {
nodeErrs = append ( nodeErrs , fmt . Sprintf ( "container %q: expected %.0fth%% usage < %.3f; got %.3f" ,
cName , p * 100 , expectedValue , actualValue ) )
}
}
}
if len ( nodeErrs ) > 0 {
errList = append ( errList , fmt . Sprintf ( "node %v:\n %s" , nodeName , strings . Join ( nodeErrs , ", " ) ) )
}
}
if len ( errList ) > 0 {
2016-04-07 17:21:31 +00:00
framework . Failf ( "CPU usage exceeding limits:\n %s" , strings . Join ( errList , "\n" ) )
2015-12-08 16:14:48 +00:00
}
}
2015-12-20 19:00:00 +00:00
// Slow by design (1 hour)
2016-04-07 17:21:31 +00:00
var _ = framework . KubeDescribe ( "Kubelet [Serial] [Slow]" , func ( ) {
2015-09-22 23:38:36 +00:00
var nodeNames sets . String
2016-04-07 17:21:31 +00:00
f := framework . NewDefaultFramework ( "kubelet-perf" )
2016-05-06 17:25:18 +00:00
var om * framework . RuntimeOperationMonitor
2016-04-07 17:21:31 +00:00
var rm * framework . ResourceMonitor
2015-09-22 23:38:36 +00:00
BeforeEach ( func ( ) {
2016-06-03 18:03:35 +00:00
// Wait until image prepull pod has completed so that they wouldn't
// affect the runtime cpu usage. Fail the test if prepulling cannot
// finish in time.
if err := framework . WaitForPodsSuccess ( f . Client , api . NamespaceSystem , framework . ImagePullerLabels , imagePrePullingLongTimeout ) ; err != nil {
framework . Failf ( "Image puller didn't complete in %v, not running resource usage test since the metrics might be adultrated" , imagePrePullingLongTimeout )
}
2016-05-05 20:56:25 +00:00
nodes := framework . GetReadySchedulableNodesOrDie ( f . Client )
2015-09-22 23:38:36 +00:00
nodeNames = sets . NewString ( )
for _ , node := range nodes . Items {
nodeNames . Insert ( node . Name )
}
2016-05-06 17:25:18 +00:00
om = framework . NewRuntimeOperationMonitor ( f . Client )
2016-04-07 17:21:31 +00:00
rm = framework . NewResourceMonitor ( f . Client , framework . TargetContainers ( ) , containerStatsPollingPeriod )
2015-12-08 16:14:48 +00:00
rm . Start ( )
2015-09-22 23:38:36 +00:00
} )
AfterEach ( func ( ) {
2015-12-08 16:14:48 +00:00
rm . Stop ( )
2016-05-06 17:25:18 +00:00
result := om . GetLatestRuntimeOperationErrorRate ( )
framework . Logf ( "runtime operation error metrics:\n%s" , framework . FormatRuntimeOperationErrorRate ( result ) )
2015-09-22 23:38:36 +00:00
} )
2016-04-07 17:21:31 +00:00
framework . KubeDescribe ( "regular resource usage tracking" , func ( ) {
2016-02-01 18:29:45 +00:00
// We assume that the scheduler will make reasonable scheduling choices
// and assign ~N pods on the node.
// Although we want to track N pods per node, there are N + add-on pods
// in the cluster. The cluster add-on pods can be distributed unevenly
// among the nodes because they are created during the cluster
// initialization. This *noise* is obvious when N is small. We
// deliberately set higher resource usage limits to account for the
// noise.
2015-12-08 16:14:48 +00:00
rTests := [ ] resourceTest {
2016-03-02 23:22:40 +00:00
{
podsPerNode : 0 ,
2016-04-07 17:21:31 +00:00
cpuLimits : framework . ContainersCPUSummary {
2016-04-07 22:20:50 +00:00
stats . SystemContainerKubelet : { 0.50 : 0.06 , 0.95 : 0.08 } ,
stats . SystemContainerRuntime : { 0.50 : 0.05 , 0.95 : 0.06 } ,
2015-12-08 16:14:48 +00:00
} ,
2016-02-23 20:14:09 +00:00
// We set the memory limits generously because the distribution
// of the addon pods affect the memory usage on each node.
2016-04-07 17:21:31 +00:00
memLimits : framework . ResourceUsagePerContainer {
stats . SystemContainerKubelet : & framework . ContainerResourceUsage { MemoryRSSInBytes : 70 * 1024 * 1024 } ,
stats . SystemContainerRuntime : & framework . ContainerResourceUsage { MemoryRSSInBytes : 85 * 1024 * 1024 } ,
2016-02-23 20:14:09 +00:00
} ,
2015-12-08 16:14:48 +00:00
} ,
2016-03-02 23:22:40 +00:00
{
podsPerNode : 35 ,
2016-04-07 17:21:31 +00:00
cpuLimits : framework . ContainersCPUSummary {
2016-04-07 22:20:50 +00:00
stats . SystemContainerKubelet : { 0.50 : 0.12 , 0.95 : 0.14 } ,
2016-05-31 18:00:33 +00:00
stats . SystemContainerRuntime : { 0.50 : 0.05 , 0.95 : 0.07 } ,
2015-12-08 16:14:48 +00:00
} ,
2016-02-23 20:14:09 +00:00
// We set the memory limits generously because the distribution
// of the addon pods affect the memory usage on each node.
2016-04-07 17:21:31 +00:00
memLimits : framework . ResourceUsagePerContainer {
2016-05-31 18:00:33 +00:00
stats . SystemContainerKubelet : & framework . ContainerResourceUsage { MemoryRSSInBytes : 70 * 1024 * 1024 } ,
stats . SystemContainerRuntime : & framework . ContainerResourceUsage { MemoryRSSInBytes : 150 * 1024 * 1024 } ,
2016-02-23 20:14:09 +00:00
} ,
2015-12-08 16:14:48 +00:00
} ,
2016-03-02 23:22:40 +00:00
{
2016-05-31 18:00:33 +00:00
cpuLimits : framework . ContainersCPUSummary {
2016-06-01 18:40:48 +00:00
stats . SystemContainerKubelet : { 0.50 : 0.17 , 0.95 : 0.22 } ,
2016-05-31 18:00:33 +00:00
stats . SystemContainerRuntime : { 0.50 : 0.06 , 0.95 : 0.09 } ,
} ,
2016-03-02 23:22:40 +00:00
podsPerNode : 100 ,
2016-05-31 18:00:33 +00:00
// We set the memory limits generously because the distribution
// of the addon pods affect the memory usage on each node.
memLimits : framework . ResourceUsagePerContainer {
stats . SystemContainerKubelet : & framework . ContainerResourceUsage { MemoryRSSInBytes : 80 * 1024 * 1024 } ,
stats . SystemContainerRuntime : & framework . ContainerResourceUsage { MemoryRSSInBytes : 300 * 1024 * 1024 } ,
} ,
2016-03-02 23:22:40 +00:00
} ,
2015-12-08 16:14:48 +00:00
}
for _ , testArg := range rTests {
itArg := testArg
podsPerNode := itArg . podsPerNode
2015-10-01 17:46:46 +00:00
name := fmt . Sprintf (
2016-03-12 10:57:58 +00:00
"resource tracking for %d pods per node" , podsPerNode )
2015-10-01 17:46:46 +00:00
It ( name , func ( ) {
2016-04-07 17:21:31 +00:00
runResourceTrackingTest ( f , podsPerNode , nodeNames , rm , itArg . cpuLimits , itArg . memLimits )
2015-10-01 17:46:46 +00:00
} )
}
} )
2016-04-07 17:21:31 +00:00
framework . KubeDescribe ( "experimental resource usage tracking [Feature:ExperimentalResourceUsageTracking]" , func ( ) {
2015-12-07 23:26:30 +00:00
density := [ ] int { 100 }
2015-09-26 01:20:56 +00:00
for i := range density {
podsPerNode := density [ i ]
2015-09-22 23:38:36 +00:00
name := fmt . Sprintf (
2016-03-12 10:57:58 +00:00
"resource tracking for %d pods per node" , podsPerNode )
2015-09-22 23:38:36 +00:00
It ( name , func ( ) {
2016-04-07 17:21:31 +00:00
runResourceTrackingTest ( f , podsPerNode , nodeNames , rm , nil , nil )
2015-09-22 23:38:36 +00:00
} )
}
} )
} )