mirror of https://github.com/k3s-io/k3s
Add gpu cluster upgrade test.
parent
5da925ad4f
commit
c9e85ec309
|
@ -54,6 +54,10 @@ var upgradeTests = []upgrades.Test{
|
||||||
&upgrades.AppArmorUpgradeTest{},
|
&upgrades.AppArmorUpgradeTest{},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var gpuUpgradeTests = []upgrades.Test{
|
||||||
|
&upgrades.NvidiaGPUUpgradeTest{},
|
||||||
|
}
|
||||||
|
|
||||||
var statefulsetUpgradeTests = []upgrades.Test{
|
var statefulsetUpgradeTests = []upgrades.Test{
|
||||||
&upgrades.MySqlUpgradeTest{},
|
&upgrades.MySqlUpgradeTest{},
|
||||||
&upgrades.EtcdUpgradeTest{},
|
&upgrades.EtcdUpgradeTest{},
|
||||||
|
@ -256,6 +260,52 @@ var _ = SIGDescribe("ingress Downgrade [Feature:IngressDowngrade]", func() {
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
|
|
||||||
|
var _ = SIGDescribe("gpu Upgrade [Feature:GPUUpgrade]", func() {
|
||||||
|
f := framework.NewDefaultFramework("gpu-upgrade")
|
||||||
|
|
||||||
|
// Create the frameworks here because we can only create them
|
||||||
|
// in a "Describe".
|
||||||
|
testFrameworks := createUpgradeFrameworks(gpuUpgradeTests)
|
||||||
|
Describe("master upgrade", func() {
|
||||||
|
It("should NOT disrupt gpu pod [Feature:GPUMasterUpgrade]", func() {
|
||||||
|
upgCtx, err := getUpgradeContext(f.ClientSet.Discovery(), framework.TestContext.UpgradeTarget)
|
||||||
|
framework.ExpectNoError(err)
|
||||||
|
|
||||||
|
testSuite := &junit.TestSuite{Name: "GPU master upgrade"}
|
||||||
|
gpuUpgradeTest := &junit.TestCase{Name: "[sig-node] gpu-master-upgrade", Classname: "upgrade_tests"}
|
||||||
|
testSuite.TestCases = append(testSuite.TestCases, gpuUpgradeTest)
|
||||||
|
upgradeFunc := func() {
|
||||||
|
start := time.Now()
|
||||||
|
defer finalizeUpgradeTest(start, gpuUpgradeTest)
|
||||||
|
target := upgCtx.Versions[1].Version.String()
|
||||||
|
framework.ExpectNoError(framework.MasterUpgrade(target))
|
||||||
|
framework.ExpectNoError(framework.CheckMasterVersion(f.ClientSet, target))
|
||||||
|
}
|
||||||
|
runUpgradeSuite(f, gpuUpgradeTests, testFrameworks, testSuite, upgCtx, upgrades.MasterUpgrade, upgradeFunc)
|
||||||
|
})
|
||||||
|
})
|
||||||
|
Describe("cluster upgrade", func() {
|
||||||
|
It("should be able to run gpu pod after upgrade [Feature:GPUClusterUpgrade]", func() {
|
||||||
|
upgCtx, err := getUpgradeContext(f.ClientSet.Discovery(), framework.TestContext.UpgradeTarget)
|
||||||
|
framework.ExpectNoError(err)
|
||||||
|
|
||||||
|
testSuite := &junit.TestSuite{Name: "GPU cluster upgrade"}
|
||||||
|
gpuUpgradeTest := &junit.TestCase{Name: "[sig-node] gpu-cluster-upgrade", Classname: "upgrade_tests"}
|
||||||
|
testSuite.TestCases = append(testSuite.TestCases, gpuUpgradeTest)
|
||||||
|
upgradeFunc := func() {
|
||||||
|
start := time.Now()
|
||||||
|
defer finalizeUpgradeTest(start, gpuUpgradeTest)
|
||||||
|
target := upgCtx.Versions[1].Version.String()
|
||||||
|
framework.ExpectNoError(framework.MasterUpgrade(target))
|
||||||
|
framework.ExpectNoError(framework.CheckMasterVersion(f.ClientSet, target))
|
||||||
|
framework.ExpectNoError(framework.NodeUpgrade(f, target, framework.TestContext.UpgradeImage))
|
||||||
|
framework.ExpectNoError(framework.CheckNodesVersions(f.ClientSet, target))
|
||||||
|
}
|
||||||
|
runUpgradeSuite(f, gpuUpgradeTests, testFrameworks, testSuite, upgCtx, upgrades.ClusterUpgrade, upgradeFunc)
|
||||||
|
})
|
||||||
|
})
|
||||||
|
})
|
||||||
|
|
||||||
var _ = Describe("[sig-apps] stateful Upgrade [Feature:StatefulUpgrade]", func() {
|
var _ = Describe("[sig-apps] stateful Upgrade [Feature:StatefulUpgrade]", func() {
|
||||||
f := framework.NewDefaultFramework("stateful-upgrade")
|
f := framework.NewDefaultFramework("stateful-upgrade")
|
||||||
|
|
||||||
|
|
|
@ -16,6 +16,7 @@ go_library(
|
||||||
"ingress.go",
|
"ingress.go",
|
||||||
"kube_proxy_migration.go",
|
"kube_proxy_migration.go",
|
||||||
"mysql.go",
|
"mysql.go",
|
||||||
|
"nvidia-gpu.go",
|
||||||
"secrets.go",
|
"secrets.go",
|
||||||
"services.go",
|
"services.go",
|
||||||
"sysctl.go",
|
"sysctl.go",
|
||||||
|
@ -28,6 +29,7 @@ go_library(
|
||||||
"//pkg/util/version:go_default_library",
|
"//pkg/util/version:go_default_library",
|
||||||
"//test/e2e/common:go_default_library",
|
"//test/e2e/common:go_default_library",
|
||||||
"//test/e2e/framework:go_default_library",
|
"//test/e2e/framework:go_default_library",
|
||||||
|
"//test/e2e/scheduling:go_default_library",
|
||||||
"//test/utils/image:go_default_library",
|
"//test/utils/image:go_default_library",
|
||||||
"//vendor/github.com/davecgh/go-spew/spew:go_default_library",
|
"//vendor/github.com/davecgh/go-spew/spew:go_default_library",
|
||||||
"//vendor/github.com/onsi/ginkgo:go_default_library",
|
"//vendor/github.com/onsi/ginkgo:go_default_library",
|
||||||
|
@ -38,6 +40,7 @@ go_library(
|
||||||
"//vendor/k8s.io/api/core/v1:go_default_library",
|
"//vendor/k8s.io/api/core/v1:go_default_library",
|
||||||
"//vendor/k8s.io/api/extensions/v1beta1:go_default_library",
|
"//vendor/k8s.io/api/extensions/v1beta1:go_default_library",
|
||||||
"//vendor/k8s.io/apimachinery/pkg/api/errors:go_default_library",
|
"//vendor/k8s.io/apimachinery/pkg/api/errors:go_default_library",
|
||||||
|
"//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library",
|
||||||
"//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
|
"//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
|
||||||
"//vendor/k8s.io/apimachinery/pkg/labels:go_default_library",
|
"//vendor/k8s.io/apimachinery/pkg/labels:go_default_library",
|
||||||
"//vendor/k8s.io/apimachinery/pkg/util/uuid:go_default_library",
|
"//vendor/k8s.io/apimachinery/pkg/util/uuid:go_default_library",
|
||||||
|
|
|
@ -0,0 +1,112 @@
|
||||||
|
/*
|
||||||
|
Copyright 2018 The Kubernetes Authors.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package upgrades
|
||||||
|
|
||||||
|
import (
|
||||||
|
"regexp"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"k8s.io/api/core/v1"
|
||||||
|
"k8s.io/apimachinery/pkg/api/resource"
|
||||||
|
"k8s.io/kubernetes/test/e2e/framework"
|
||||||
|
"k8s.io/kubernetes/test/e2e/scheduling"
|
||||||
|
imageutils "k8s.io/kubernetes/test/utils/image"
|
||||||
|
|
||||||
|
. "github.com/onsi/ginkgo"
|
||||||
|
. "github.com/onsi/gomega"
|
||||||
|
)
|
||||||
|
|
||||||
|
// NvidiaGPUUpgradeTest tests that gpu resource is available before and after
|
||||||
|
// a cluster upgrade.
|
||||||
|
type NvidiaGPUUpgradeTest struct {
|
||||||
|
}
|
||||||
|
|
||||||
|
func (NvidiaGPUUpgradeTest) Name() string { return "nvidia-gpu-upgrade [sig-node] [sig-scheduling]" }
|
||||||
|
|
||||||
|
// Setup creates a job requesting gpu.
|
||||||
|
func (t *NvidiaGPUUpgradeTest) Setup(f *framework.Framework) {
|
||||||
|
scheduling.SetupNVIDIAGPUNode(f, false)
|
||||||
|
By("Creating a job requesting gpu")
|
||||||
|
t.startJob(f)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test waits for the upgrade to complete, and then verifies that the
|
||||||
|
// cuda pod started by the gpu job can successfully finish.
|
||||||
|
func (t *NvidiaGPUUpgradeTest) Test(f *framework.Framework, done <-chan struct{}, upgrade UpgradeType) {
|
||||||
|
<-done
|
||||||
|
By("Verifying gpu job success")
|
||||||
|
t.verifyJobPodSuccess(f)
|
||||||
|
if upgrade == MasterUpgrade {
|
||||||
|
// MasterUpgrade should be totally hitless.
|
||||||
|
job, err := framework.GetJob(f.ClientSet, f.Namespace.Name, "cuda-add")
|
||||||
|
Expect(err).NotTo(HaveOccurred())
|
||||||
|
Expect(job.Status.Failed).To(BeZero(), "Job pods failed during master upgrade: %v", job.Status.Failed)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Teardown cleans up any remaining resources.
|
||||||
|
func (t *NvidiaGPUUpgradeTest) Teardown(f *framework.Framework) {
|
||||||
|
// rely on the namespace deletion to clean up everything
|
||||||
|
}
|
||||||
|
|
||||||
|
// startJob creates a job that requests gpu and runs a simple cuda container.
|
||||||
|
func (t *NvidiaGPUUpgradeTest) startJob(f *framework.Framework) {
|
||||||
|
var activeSeconds int64 = 3600
|
||||||
|
// Specifies 100 completions to make sure the job life spans across the upgrade.
|
||||||
|
testJob := framework.NewTestJob("succeed", "cuda-add", v1.RestartPolicyAlways, 1, 100, &activeSeconds, 6)
|
||||||
|
testJob.Spec.Template.Spec = v1.PodSpec{
|
||||||
|
RestartPolicy: v1.RestartPolicyOnFailure,
|
||||||
|
Containers: []v1.Container{
|
||||||
|
{
|
||||||
|
Name: "vector-addition",
|
||||||
|
Image: imageutils.GetE2EImage(imageutils.CudaVectorAdd),
|
||||||
|
Command: []string{"/bin/sh", "-c", "./vectorAdd && sleep 60"},
|
||||||
|
Resources: v1.ResourceRequirements{
|
||||||
|
Limits: v1.ResourceList{
|
||||||
|
framework.NVIDIAGPUResourceName: *resource.NewQuantity(1, resource.DecimalSI),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
ns := f.Namespace.Name
|
||||||
|
_, err := framework.CreateJob(f.ClientSet, ns, testJob)
|
||||||
|
Expect(err).NotTo(HaveOccurred())
|
||||||
|
framework.Logf("Created job %v", testJob)
|
||||||
|
By("Waiting for gpu job pod start")
|
||||||
|
err = framework.WaitForAllJobPodsRunning(f.ClientSet, ns, testJob.Name, 1)
|
||||||
|
Expect(err).NotTo(HaveOccurred())
|
||||||
|
By("Done with gpu job pod start")
|
||||||
|
}
|
||||||
|
|
||||||
|
// verifyJobPodSuccess verifies that the started cuda pod successfully passes.
|
||||||
|
func (t *NvidiaGPUUpgradeTest) verifyJobPodSuccess(f *framework.Framework) {
|
||||||
|
// Wait for client pod to complete.
|
||||||
|
ns := f.Namespace.Name
|
||||||
|
err := framework.WaitForAllJobPodsRunning(f.ClientSet, f.Namespace.Name, "cuda-add", 1)
|
||||||
|
Expect(err).NotTo(HaveOccurred())
|
||||||
|
pods, err := framework.GetJobPods(f.ClientSet, f.Namespace.Name, "cuda-add")
|
||||||
|
Expect(err).NotTo(HaveOccurred())
|
||||||
|
createdPod := pods.Items[0].Name
|
||||||
|
framework.Logf("Created pod %v", createdPod)
|
||||||
|
f.PodClient().WaitForSuccess(createdPod, 5*time.Minute)
|
||||||
|
logs, err := framework.GetPodLogs(f.ClientSet, ns, createdPod, "vector-addition")
|
||||||
|
framework.ExpectNoError(err, "Should be able to get pod logs")
|
||||||
|
framework.Logf("Got pod logs: %v", logs)
|
||||||
|
regex := regexp.MustCompile("PASSED")
|
||||||
|
Expect(regex.MatchString(logs)).To(BeTrue())
|
||||||
|
}
|
Loading…
Reference in New Issue