k3s/test/e2e/ingress.go

/*
Copyright 2015 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package e2e

import (
	"encoding/json"
	"fmt"
	"net/http"
	"os/exec"
	"path/filepath"
	"sort"
	"strings"
	"time"

	compute "google.golang.org/api/compute/v1"
	"k8s.io/kubernetes/pkg/api"
	"k8s.io/kubernetes/pkg/api/unversioned"
	"k8s.io/kubernetes/pkg/apis/extensions"
	client "k8s.io/kubernetes/pkg/client/unversioned"
	"k8s.io/kubernetes/pkg/labels"
	"k8s.io/kubernetes/pkg/util/intstr"
	"k8s.io/kubernetes/pkg/util/wait"

	. "github.com/onsi/ginkgo"
	. "github.com/onsi/gomega"
)

// Before enabling this test you must make sure the associated project has
// enough quota. At the time of this writing GCE projects are allowed 3
// backend services by default. This test requires at least 5.

// This test exercises the GCE L7 loadbalancer controller cluster-addon. It
// will fail if the addon isn't running, or doesn't send traffic to the expected
// backend. Common failure modes include:
// * GCE L7 took too long to spin up
// * GCE L7 took too long to health check a backend
// * Repeated 404:
//   - L7 is sending traffic to the default backend of the addon.
//   - Backend is receiving /foo when it expects /bar.
// * Repeated 5xx:
//   - Out of quota (describe ing should show you if this is the case)
//   - Mismatched service/container port, or endpoints are dead.

var (
	// Scale knobs for the test. Raising numApps creates more RCs/Services
	// and adds them to a single Ingress; raising numIng creates smaller,
	// more fragmented Ingresses. The values 2 and 1 are arbitrary, they
	// only need to exercise more than a single endpoint.
	numApps = 2
	numIng  = 1

	// Upper bound on the length of generated names. GCE only allows names
	// shorter than 64 characters, and the loadbalancer controller inserts a
	// single character of padding.
	nameLenLimit = 62

	// Naming scheme, image and port shared by every test backend.
	appPrefix         = "app"
	pathPrefix        = "foo"
	testImage         = "gcr.io/google_containers/n-way-http:1.0"
	httpContainerPort = 8080

	// Name of the loadbalancer controller within the cluster addon.
	lbContainerName = "l7-lb-controller"

	// Labels applied throughout: to the RC, to the default backend, and as
	// a selector.
	controllerLabels = map[string]string{"name": "glbc"}

	// When true the test performs an HTTP GET against each url endpoint of
	// the Ingress. Only flip this to false to short-circuit test runs while
	// debugging.
	verifyHTTPGET = true

	// Median latencies above these thresholds are reported as too slow.
	expectedLBCreationTime    = 7 * time.Minute
	expectedLBHealthCheckTime = 7 * time.Minute

	// A single backend takes ~6 minutes on average to come online. GCE
	// creates/checks backends in parallel, so this poll is *not* expected
	// to take the full 15 minutes for every Ingress; at the same time GCE
	// startup latency is not what is under test. So allow plenty of time,
	// and complain only if the average is too high.
	lbPollTimeout  = 15 * time.Minute
	lbPollInterval = 30 * time.Second

	// Time the loadbalancer needs to clean up, proportional to numApps/Ing.
	lbCleanupTimeout = 5 * time.Minute
)

// timeSlice adapts a slice of time.Duration to sort.Interface so that the
// durations can be ordered with sort.Sort (shortest first).
type timeSlice []time.Duration

// Len reports how many durations the slice holds.
func (ts timeSlice) Len() int { return len(ts) }

// Less reports whether the duration at index a is strictly shorter than the
// one at index b.
func (ts timeSlice) Less(a, b int) bool { return ts[b] > ts[a] }

// Swap exchanges the durations stored at indices a and b.
func (ts timeSlice) Swap(a, b int) {
	first, second := ts[a], ts[b]
	ts[a] = second
	ts[b] = first
}

// ruleByIndex builds the IngressRule for app number i: requests for host
// foo<i>.bar.com on path /<pathPrefix><i> are routed to port
// httpContainerPort of the Service named <appPrefix><i>.
func ruleByIndex(i int) extensions.IngressRule {
	backend := extensions.IngressBackend{
		ServiceName: fmt.Sprintf("%v%d", appPrefix, i),
		ServicePort: intstr.FromInt(httpContainerPort),
	}
	httpPath := extensions.HTTPIngressPath{
		Path:    fmt.Sprintf("/%v%d", pathPrefix, i),
		Backend: backend,
	}
	httpValue := &extensions.HTTPIngressRuleValue{
		Paths: []extensions.HTTPIngressPath{httpPath},
	}
	return extensions.IngressRule{
		Host:             fmt.Sprintf("foo%d.bar.com", i),
		IngressRuleValue: extensions.IngressRuleValue{HTTP: httpValue},
	}
}

// createIngress creates, in namespace ns, one Ingress named <appPrefix><start>
// carrying num rules for the apps start .. start+num-1. The Service of the
// first app doubles as the Ingress' default backend. For example start = 1,
// num = 2 yields a single Ingress with 2 rules:
// Ingress {
//	 foo1.bar.com: /foo1
//	 foo2.bar.com: /foo2
// }
// The test fails if the API server rejects the object. The returned value is
// the Ingress as it was submitted.
func createIngress(c *client.Client, ns string, start, num int) extensions.Ingress {
	firstApp := fmt.Sprintf("%v%d", appPrefix, start)

	rules := []extensions.IngressRule{}
	for offset := 0; offset < num; offset++ {
		rules = append(rules, ruleByIndex(start+offset))
	}

	ing := extensions.Ingress{
		ObjectMeta: api.ObjectMeta{Name: firstApp, Namespace: ns},
		Spec: extensions.IngressSpec{
			Backend: &extensions.IngressBackend{
				ServiceName: firstApp,
				ServicePort: intstr.FromInt(httpContainerPort),
			},
			Rules: rules,
		},
	}

	Logf("Creating ingress %v", start)
	_, createErr := c.Extensions().Ingress(ns).Create(&ing)
	Expect(createErr).NotTo(HaveOccurred())
	return ing
}

// createApp creates the i-th test backend in namespace ns: a NodePort Service
// and a single-replica RC, both named <appPrefix><i>. The container is told
// to serve exactly one path, /<pathPrefix><i>, on httpContainerPort.
// The Service is presumably matched to the RC's pods through the shared name
// (that wiring lives in svcByName/rcByNamePort, not here). Any API error
// fails the test.
func createApp(c *client.Client, ns string, i int) {
	name := fmt.Sprintf("%v%d", appPrefix, i)

	// Service first, exposed on a node port.
	Logf("Creating svc %v", name)
	svc := svcByName(name, httpContainerPort)
	svc.Spec.Type = api.ServiceTypeNodePort
	_, svcErr := c.Services(ns).Create(svc)
	Expect(svcErr).NotTo(HaveOccurred())

	// Then the RC backing it; no extra labels are supplied.
	Logf("Creating rc %v", name)
	rc := rcByNamePort(name, 1, testImage, httpContainerPort, map[string]string{})
	containerArgs := []string{"--num=1"}
	containerArgs = append(containerArgs,
		fmt.Sprintf("--start=%d", i),
		fmt.Sprintf("--prefix=%v", pathPrefix),
		fmt.Sprintf("--port=%d", httpContainerPort),
	)
	rc.Spec.Template.Spec.Containers[0].Args = containerArgs
	_, rcErr := c.ReplicationControllers(ns).Create(rc)
	Expect(rcErr).NotTo(HaveOccurred())
}

// gcloudStderr is a minimal io.Writer that accumulates what gcloud writes to
// stderr, so it can be logged without being mixed into the JSON on stdout.
type gcloudStderr []byte

// Write appends p to the captured output. It never fails.
func (s *gcloudStderr) Write(p []byte) (int, error) {
	*s = append(*s, p...)
	return len(p), nil
}

// gcloudUnmarshal runs `gcloud compute <resource> list` restricted to names
// matching regex in the given project, and unmarshals the JSON listing into
// out (which must be a pointer, typically to a slice of the matching compute
// API type).
//
// Only stdout is parsed: gcloud may print notices and warnings on stderr, and
// mixing those into the JSON would make the listing unparseable. Errors are
// logged rather than returned; when the command fails or its output cannot be
// parsed, out is left as the caller passed it in.
func gcloudUnmarshal(resource, regex, project string, out interface{}) {
	cmd := exec.Command("gcloud", "compute", resource, "list",
		fmt.Sprintf("--regex=%v", regex),
		fmt.Sprintf("--project=%v", project),
		"-q", "--format=json")
	var stderr gcloudStderr
	cmd.Stderr = &stderr

	stdout, err := cmd.Output()
	if err != nil {
		// The command itself failed, so there is no listing worth parsing.
		Logf("Error running gcloud compute %v list: %v, stdout: %v, stderr: %v",
			resource, err, string(stdout), string(stderr))
		return
	}
	if err := json.Unmarshal(stdout, out); err != nil {
		Logf("Error unmarshalling gcloud output for %v: %v, output: %v", resource, err, string(stdout))
	}
}

// gcloudDelete removes the named compute resource of the given kind from
// project by shelling out to `gcloud compute <resource> delete`. It is best
// effort: a failure is logged together with gcloud's output and not returned.
func gcloudDelete(resource, name, project string) {
	Logf("Deleting %v: %v", resource, name)
	args := []string{
		"compute", resource, "delete",
		name, fmt.Sprintf("--project=%v", project), "-q",
	}
	if output, err := exec.Command("gcloud", args...).CombinedOutput(); err != nil {
		Logf("Error deleting %v, output: %v\nerror: %+v", resource, string(output), err)
	}
}

// kubectlLogLBController logs the last 100 lines written by the
// lbContainerName container of every pod carrying controllerLabels.
//
// Pods are looked up across all namespaces, so each pod's logs are requested
// from the namespace that pod actually lives in; ns is only used as a
// fallback when a listed pod carries no namespace. Failures are logged and
// never fail the test, since this is purely a debugging aid.
func kubectlLogLBController(c *client.Client, ns string) {
	selector := labels.SelectorFromSet(labels.Set(controllerLabels))
	options := unversioned.ListOptions{LabelSelector: unversioned.LabelSelector{selector}}
	podList, err := c.Pods(api.NamespaceAll).List(options)
	if err != nil {
		Logf("Cannot log L7 controller output, error listing pods %v", err)
		return
	}
	if len(podList.Items) == 0 {
		Logf("Loadbalancer controller pod not found")
		return
	}
	for _, p := range podList.Items {
		podNS := p.Namespace
		if podNS == "" {
			podNS = ns
		}
		Logf("\nLast 100 log lines of %v\n", p.Name)
		l, err := runKubectl("logs", p.Name, fmt.Sprintf("--namespace=%v", podNS), "-c", lbContainerName, "--tail=100")
		if err != nil {
			Logf("Error fetching logs of %v/%v: %v", podNS, p.Name, err)
		}
		// The log text is data, not a format string: a literal '%' in the
		// controller output must not be interpreted as a verb.
		Logf("%v", l)
	}
}

// IngressController describes the test's own instance of the L7 loadbalancer
// controller, together with the identifiers needed to locate the GCE
// resources that controller creates.
type IngressController struct {
	// ns is the namespace the controller RC and the default backend Service
	// are created in.
	ns             string
	// rcPath is the path of the manifest the controller RC is loaded from.
	rcPath         string
	// defaultSvcPath is the path of the default backend Service manifest; the
	// Service is only created when this is non-empty.
	defaultSvcPath string
	// UID is passed to the controller as --cluster-uid and is the suffix used
	// to match this test's GCE resource names during Cleanup.
	UID            string
	// Project is the GCE project handed to every gcloud invocation.
	Project        string
	// rc and svc hold the objects submitted by create(); they are unset
	// until create() has run.
	rc             *api.ReplicationController
	svc            *api.Service
	// c is the API client used for all cluster operations.
	c              *client.Client
}

// create starts the test's private loadbalancer controller in cont.ns and
// blocks until its pods are running. In order, it:
//   - refuses to run when the namespace/UID would push GCE names past
//     nameLenLimit,
//   - creates the default backend Service (only if defaultSvcPath is set),
//   - clones the spec of the cluster addon RC from kube-system, relabels it
//     with controllerLabels and appends the test-only command line flags,
//   - creates the RC and waits for its pods.
//
// Any failure fails the test. On success cont.rc (and cont.svc, when a
// default backend was requested) hold the submitted objects.
func (cont *IngressController) create() {

	// TODO: This cop out is because it would be *more* brittle to duplicate all
	// the name construction logic from the controller cross-repo. We will not
	// need to be so paranoid about leaked resources once we figure out a solution
	// for issues like #16337. Currently, all names should fall within 63 chars.
	testName := fmt.Sprintf("k8s-fw-foo-app-X-%v--%v", cont.ns, cont.UID)
	if len(testName) > nameLenLimit {
		Failf("Cannot reliably test the given namespace(%v)/uid(%v), too close to GCE limit of %v",
			cont.ns, cont.UID, nameLenLimit)
	}

	if cont.defaultSvcPath != "" {
		svc := svcFromManifest(cont.defaultSvcPath)
		svc.Namespace = cont.ns
		svc.Labels = controllerLabels
		svc.Spec.Selector = controllerLabels
		cont.svc = svc
		_, err := cont.c.Services(cont.ns).Create(cont.svc)
		Expect(err).NotTo(HaveOccurred())
	}
	rc := rcFromManifest(cont.rcPath)
	existingRc, err := cont.c.ReplicationControllers(api.NamespaceSystem).Get(lbContainerName)
	Expect(err).NotTo(HaveOccurred())
	// Merge the existing spec and new spec. The modifications should not
	// manifest as functional changes to the controller. Most importantly, the
	// podTemplate shouldn't change (but for the additional test cmd line flags)
	// to ensure we test actual cluster functionality across upgrades.
	rc.Spec = existingRc.Spec
	rc.Name = "glbc"
	rc.Namespace = cont.ns
	rc.Labels = controllerLabels
	rc.Spec.Selector = controllerLabels
	rc.Spec.Template.Labels = controllerLabels
	rc.Spec.Replicas = 1

	// These command line params are only recognized by v0.51 and above.
	testArgs := []string{
		// Pass namespace uid so the controller will tag resources with it.
		fmt.Sprintf("--cluster-uid=%v", cont.UID),
		// Tell the controller to delete all resources as it quits.
		"--delete-all-on-quit=true",
	}
	// cont.svc is only populated when a default backend manifest was given;
	// without one there is nothing to point the controller at, so the flag is
	// omitted instead of dereferencing a nil Service.
	if cont.svc != nil {
		// Don't use the default Service from kube-system.
		testArgs = append(testArgs,
			fmt.Sprintf("--default-backend-service=%v/%v", cont.svc.Namespace, cont.svc.Name))
	}
	for i, c := range rc.Spec.Template.Spec.Containers {
		if c.Name == lbContainerName {
			rc.Spec.Template.Spec.Containers[i].Args = append(c.Args, testArgs...)
		}
	}
	cont.rc = rc
	_, err = cont.c.ReplicationControllers(cont.ns).Create(cont.rc)
	Expect(err).NotTo(HaveOccurred())
	Expect(waitForRCPodsRunning(cont.c, cont.ns, cont.rc.Name)).NotTo(HaveOccurred())
}

// Cleanup looks for GCE L7 resources whose names carry cont.UID (forwarding
// rules, target http proxies, url maps, backend services and http health
// checks) in cont.Project.
//
// It returns nil when none are found, otherwise an error listing every
// resource that was found. When del is true each listed resource is also
// deleted through gcloud; deletion is best effort (failures are only logged)
// and the returned error still names the resources that were found.
func (cont *IngressController) Cleanup(del bool) error {
	errMsg := ""

	// report appends the found names to errMsg under header and, when del is
	// set, hands every name to remove.
	report := func(header string, names []string, remove func(name string)) {
		if len(names) == 0 {
			return
		}
		if del {
			for _, name := range names {
				remove(name)
			}
		}
		errMsg += fmt.Sprintf("%v:\n%v\n", header, strings.Join(names, "\n"))
	}
	// deleteVia returns a remove callback that deletes one resource of the
	// given kind with gcloudDelete.
	deleteVia := func(resource string) func(string) {
		return func(name string) { gcloudDelete(resource, name, cont.Project) }
	}

	// Ordering is important here because we cannot delete resources that other
	// resources hold references to.
	fwList := []compute.ForwardingRule{}
	gcloudUnmarshal("forwarding-rules", fmt.Sprintf("k8s-fw-.*--%v", cont.UID), cont.Project, &fwList)
	fwNames := make([]string, 0, len(fwList))
	for _, f := range fwList {
		fwNames = append(fwNames, f.Name)
	}
	report("\nFound forwarding rules", fwNames, func(name string) {
		// Deleted inline rather than via gcloudDelete because the command
		// needs the extra --global flag.
		Logf("Deleting forwarding-rule: %v", name)
		output, err := exec.Command("gcloud", "compute", "forwarding-rules", "delete",
			name, fmt.Sprintf("--project=%v", cont.Project), "-q", "--global").CombinedOutput()
		if err != nil {
			Logf("Error deleting forwarding rules, output: %v\nerror:%v", string(output), err)
		}
	})

	tpList := []compute.TargetHttpProxy{}
	gcloudUnmarshal("target-http-proxies", fmt.Sprintf("k8s-tp-.*--%v", cont.UID), cont.Project, &tpList)
	tpNames := make([]string, 0, len(tpList))
	for _, t := range tpList {
		tpNames = append(tpNames, t.Name)
	}
	report("Found target proxies", tpNames, deleteVia("target-http-proxies"))

	umList := []compute.UrlMap{}
	gcloudUnmarshal("url-maps", fmt.Sprintf("k8s-um-.*--%v", cont.UID), cont.Project, &umList)
	umNames := make([]string, 0, len(umList))
	for _, u := range umList {
		umNames = append(umNames, u.Name)
	}
	report("Found url maps", umNames, deleteVia("url-maps"))

	beList := []compute.BackendService{}
	gcloudUnmarshal("backend-services", fmt.Sprintf("k8s-be-[0-9]+--%v", cont.UID), cont.Project, &beList)
	beNames := make([]string, 0, len(beList))
	for _, b := range beList {
		beNames = append(beNames, b.Name)
	}
	report("Found backend services", beNames, deleteVia("backend-services"))

	hcList := []compute.HttpHealthCheck{}
	gcloudUnmarshal("http-health-checks", fmt.Sprintf("k8s-be-[0-9]+--%v", cont.UID), cont.Project, &hcList)
	hcNames := make([]string, 0, len(hcList))
	for _, h := range hcList {
		hcNames = append(hcNames, h.Name)
	}
	report("Found health check", hcNames, deleteVia("http-health-checks"))

	// TODO: Verify instance-groups, issue #16636. Gcloud mysteriously barfs when told
	// to unmarshal instance groups into the current vendored gce-client's understanding
	// of the struct.
	if errMsg == "" {
		return nil
	}
	// errMsg is data, not a format string.
	return fmt.Errorf("%v", errMsg)
}

// End-to-end test of the GCE L7 loadbalancer controller. Each spec runs
// against a private copy of the controller (the cluster addon is scaled to
// zero for the duration), creates numApps backends behind numIng Ingresses,
// waits for every Ingress to get an address and, unless verifyHTTPGET is
// off, for every rule to answer a GET. Afterwards the Ingresses are deleted
// and the test waits for the controller to release its GCE resources.
var _ = Describe("GCE L7 LoadBalancer Controller", func() {
	// These variables are initialized after framework's beforeEach.
	var ns string
	var addonDir string
	var client *client.Client
	var responseTimes, creationTimes []time.Duration
	var ingController *IngressController

	framework := Framework{BaseName: "glbc"}

	BeforeEach(func() {
		// This test requires a GCE/GKE only cluster-addon
		SkipUnlessProviderIs("gce", "gke")
		framework.beforeEach()
		client = framework.Client
		ns = framework.Namespace.Name
		// Scaled down the existing Ingress controller so it doesn't interfere with the test.
		Expect(scaleRCByName(client, api.NamespaceSystem, lbContainerName, 0)).NotTo(HaveOccurred())
		addonDir = filepath.Join(
			testContext.RepoRoot, "cluster", "addons", "cluster-loadbalancing", "glbc")

		nsParts := strings.Split(ns, "-")
		ingController = &IngressController{
			ns: ns,
			// The UID in the namespace was generated by the master, so it's
			// global to the cluster.
			UID:            nsParts[len(nsParts)-1],
			Project:        testContext.CloudConfig.ProjectID,
			rcPath:         filepath.Join(addonDir, "glbc-controller.yaml"),
			defaultSvcPath: filepath.Join(addonDir, "default-svc.yaml"),
			c:              client,
		}
		ingController.create()
		// If we somehow get the same namespace uid as someone else in this
		// gce project, just back off.
		Expect(ingController.Cleanup(false)).NotTo(HaveOccurred())
		responseTimes = []time.Duration{}
		creationTimes = []time.Duration{}
	})

	AfterEach(func() {
		Logf("Average creation time %+v, health check time %+v", creationTimes, responseTimes)
		if CurrentGinkgoTestDescription().Failed {
			kubectlLogLBController(client, ns)
			Logf("\nOutput of kubectl describe ing:\n")
			desc, _ := runKubectl("describe", "ing", fmt.Sprintf("--namespace=%v", ns))
			// kubectl output is data, not a format string.
			Logf("%v", desc)
		}
		// Delete all Ingress, then wait for the controller to cleanup.
		ings, err := client.Extensions().Ingress(ns).List(unversioned.ListOptions{})
		if err != nil {
			Logf("WARNING: Failed to list ingress: %+v", err)
		} else {
			for _, ing := range ings.Items {
				Logf("Deleting ingress %v/%v", ing.Namespace, ing.Name)
				if err := client.Extensions().Ingress(ns).Delete(ing.Name, nil); err != nil {
					Logf("WARNING: Failed to delete ingress %v: %v", ing.Name, err)
				}
			}
		}
		// Cleanup(false) only reports; it returns nil once no GCE resources
		// tagged with this test's UID remain.
		pollErr := wait.Poll(5*time.Second, lbCleanupTimeout, func() (bool, error) {
			if err := ingController.Cleanup(false); err != nil {
				Logf("Still waiting for glbc to cleanup: %v", err)
				return false, nil
			}
			return true, nil
		})
		// TODO: Remove this once issue #17802 is fixed
		// NOTE(review): ingController.rc is only assigned near the end of
		// create(); if BeforeEach failed before that point this dereference
		// would panic here - confirm whether that path needs a guard.
		Expect(scaleRCByName(client, ingController.rc.Namespace, ingController.rc.Name, 0)).NotTo(HaveOccurred())

		// If the controller failed to cleanup the test will fail, but we want to cleanup
		// resources before that.
		if pollErr != nil {
			if cleanupErr := ingController.Cleanup(true); cleanupErr != nil {
				Logf("WARNING: Failed to cleanup resources %v", cleanupErr)
			}
			Failf("Failed to cleanup GCE L7 resources.")
		}
		// Restore the cluster Addon.
		Expect(scaleRCByName(client, api.NamespaceSystem, lbContainerName, 1)).NotTo(HaveOccurred())
		framework.afterEach()
		Logf("Successfully verified GCE L7 loadbalancer via Ingress.")
	})

	It("should create GCE L7 loadbalancers and verify Ingress", func() {
		// Create numApps apps, exposed via numIng Ingresses, each Ingress
		// carrying numApps/numIng rules.
		// Eg with numApps=10, numIng=5:
		// apps: app0 .. app9
		// ingresses: app0, app2, app4, app6, app8
		// ingress app0:
		//	  default backend: service app0
		//	  foo0.bar.com: /foo0
		//	  foo1.bar.com: /foo1
		if numIng < 1 {
			// Guards the division below.
			Failf("Need at least one Ingress, numIng is %d", numIng)
		}
		if numApps < numIng {
			Failf("Need more apps than Ingress")
		}
		appsPerIngress := numApps / numIng
		By(fmt.Sprintf("Creating %d rcs + svc, and %d apps per Ingress", numApps, appsPerIngress))
		for appID := 0; appID < numApps; appID = appID + appsPerIngress {
			// Creates appsPerIngress apps, then creates one Ingress with paths to all the apps.
			for j := appID; j < appID+appsPerIngress; j++ {
				createApp(client, ns, j)
			}
			createIngress(client, ns, appID, appsPerIngress)
		}

		ings, err := client.Extensions().Ingress(ns).List(unversioned.ListOptions{})
		Expect(err).NotTo(HaveOccurred())

		for _, ing := range ings.Items {
			// Wait for the loadbalancer IP.
			start := time.Now()
			address, err := waitForIngressAddress(client, ing.Namespace, ing.Name, lbPollTimeout)
			if err != nil {
				Failf("Ingress failed to acquire an IP address within %v: %v", lbPollTimeout, err)
			}
			By(fmt.Sprintf("Found address %v for ingress %v, took %v to come online",
				address, ing.Name, time.Since(start)))
			creationTimes = append(creationTimes, time.Since(start))

			if !verifyHTTPGET {
				continue
			}

			// Check that all rules respond to a simple GET.
			for _, rules := range ing.Spec.Rules {
				// As of Kubernetes 1.1 we only support HTTP Ingress.
				if rules.IngressRuleValue.HTTP == nil {
					continue
				}
				for _, p := range rules.IngressRuleValue.HTTP.Paths {
					route := fmt.Sprintf("http://%v%v", address, p.Path)
					Logf("Testing route %v host %v with simple GET", route, rules.Host)

					// Make sure the service node port is reachable
					Expect(curlServiceNodePort(client, ns, p.Backend.ServiceName, int(p.Backend.ServicePort.IntVal))).NotTo(HaveOccurred())

					GETStart := time.Now()
					var lastBody string
					// Errors are swallowed inside the poll so that it keeps
					// retrying until the route answers or the timeout hits.
					pollErr := wait.Poll(lbPollInterval, lbPollTimeout, func() (bool, error) {
						var err error
						lastBody, err = simpleGET(http.DefaultClient, route, rules.Host)
						if err != nil {
							Logf("host %v path %v: %v", rules.Host, route, err)
							return false, nil
						}
						return true, nil
					})
					if pollErr != nil {
						msg := fmt.Sprintf("Failed to execute a successful GET within %v, Last response body for %v, host %v:\n%v\n\n%v\n",
							lbPollTimeout, route, rules.Host, lastBody, pollErr)

						// Make sure the service node port is still reachable
						if err := curlServiceNodePort(client, ns, p.Backend.ServiceName, int(p.Backend.ServicePort.IntVal)); err != nil {
							msg += fmt.Sprintf("Also unable to curl service node port: %v", err)
						}
						// msg embeds a response body, so it must not be used
						// as the format string itself.
						Failf("%v", msg)
					}
					rt := time.Since(GETStart)
					By(fmt.Sprintf("Route %v host %v took %v to respond", route, rules.Host, rt))
					responseTimes = append(responseTimes, rt)
				}
			}
		}
		// In most cases slow loadbalancer creation/startup translates directly to
		// GCE api sluggishness. However this might be because of something the
		// controller is doing, eg: maxing out QPS by repeated polling.
		// The median is only looked up when something was measured; indexing
		// an empty slice would panic.
		sort.Sort(timeSlice(creationTimes))
		if len(creationTimes) > 0 {
			if perc50 := creationTimes[len(creationTimes)/2]; perc50 > expectedLBCreationTime {
				Logf("WARNING: Average creation time is too high: %+v", creationTimes)
			}
		}
		if !verifyHTTPGET {
			return
		}
		sort.Sort(timeSlice(responseTimes))
		if len(responseTimes) > 0 {
			if perc50 := responseTimes[len(responseTimes)/2]; perc50 > expectedLBHealthCheckTime {
				Logf("WARNING: Average startup time is too high: %+v", responseTimes)
			}
		}
	})
})

// curlServiceNodePort issues one plain HTTP GET (no Host override) against
// the URL that getNodePortURL yields for the given service and port. It
// returns nil and logs the response body on success; otherwise it returns
// the lookup error as-is, or an error wrapping the GET failure and whatever
// body was received.
func curlServiceNodePort(client *client.Client, ns, name string, port int) error {
	// TODO: Curl all nodes?
	nodePortURL, lookupErr := getNodePortURL(client, ns, name, port)
	if lookupErr != nil {
		return lookupErr
	}
	body, getErr := simpleGET(http.DefaultClient, nodePortURL, "")
	if getErr == nil {
		Logf("Successfully curled service node port, body: %v", body)
		return nil
	}
	return fmt.Errorf("Failed to curl service node port, body: %v\nerror %v", body, getErr)
}