mirror of https://github.com/k3s-io/k3s
galal-hussein
5 years ago
18 changed files with 395 additions and 413 deletions
## K3S Performance Tests

---

These scripts use Terraform to automate building and testing k3s clusters on AWS. They support both normal and HA clusters with N server nodes, N agent nodes, and multiple storage backends, including:

- MySQL RDS
- Postgres RDS
- Etcd
- SQLite

The scripts are divided into three sections:

- server
- agents
- tests

### Server

The server section deploys the storage backend and then N server (master) nodes. The scripts can be customized to use HA mode or a single-node cluster with the SQLite backend, and they also support a single server node with an external DB. The instance type and k3s version can be customized as well; all available options are described in the variables section below.

The server section also creates one or more agent nodes dedicated to the Prometheus deployment; clusterloader2 will deploy Prometheus and Grafana on them.

### Agents

The agents section deploys the k3s agents. It can be customized with options that control the agent node count and the instance types.

### Tests

The tests section uses a fork of the [clusterloader2](https://github.com/kubernetes/perf-tests/tree/master/clusterloader2) tool; the fork only modifies the logging and removes the etcd metrics probes.

This section uses a dockerized version of the tool, which runs the tests and saves the report in `tests/<test_name>-<random-number>`.
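As a concrete illustration of the report path, a directory name of that shape can be produced like this (the real scripts generate their own random suffix; `$RANDOM` here is just a stand-in):

```shell
# sketch of the tests/<test_name>-<random-number> naming scheme
test_name="load"
suffix=$RANDOM                            # bash pseudo-random stand-in for the real suffix
report_dir="tests/${test_name}-${suffix}"
echo "$report_dir"                        # e.g. tests/load-21937
```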

The currently available tests are:

- load test
- density test

## Variables

The scripts can be customized via the variables in `scripts/config`. The variables include:
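Since `scripts/config` is a plain file of shell `KEY="value"` assignments (its contents appear later on this page), a wrapper script can consume it simply by sourcing it. A minimal sketch of that pattern, using an illustrative temp file rather than the real config:

```shell
# write a tiny config in the same KEY="value" format as scripts/config
# (path and values here are illustrative only)
cat > /tmp/demo-config <<'EOF'
CLUSTER_NAME="loadtest-k3s"
SERVER_COUNT=3
EOF

# source it the way a wrapper script would, then use the variables
. /tmp/demo-config
echo "cluster=$CLUSTER_NAME servers=$SERVER_COUNT"
# prints: cluster=loadtest-k3s servers=3
```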

**Main Vars**

| Name | Description |
|:----------------:|:------------------------------------------------------------------------------:|
| CLUSTER_NAME | The cluster name on AWS; this will prefix each component in the cluster |
| DOMAIN_NAME | DNS name of the load balancer for the k3s server(s) |
| ZONE_ID | AWS Route 53 zone ID used to modify the DNS name |
| K3S_VERSION | k3s version that will be used with the cluster |
| EXTRA_SSH_KEYS | Public SSH keys that will be added to the servers |
| PRIVATE_KEY_PATH | Private SSH key that clusterloader2 will use to SSH in and collect metrics |
| DEBUG | Debug mode for the k3s servers |

**Database Variables**

| Name | Description |
|:----------------:|:---------------------------------------------------------------------------------------------------:|
| DB_ENGINE | The database type; one of "mysql", "postgres", or "etcd" |
| DB_INSTANCE_TYPE | The RDS instance type for mysql and postgres; etcd uses the db.* class as well, as it is parsed internally |
| DB_NAME | Database name; created only for postgres and mysql |
| DB_USERNAME | Database username; created only for postgres and mysql |
| DB_PASSWORD | Database password for the created user; postgres and mysql only |
| DB_VERSION | Database version |
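For context on how the mysql/postgres variables are consumed: k3s accepts an external storage backend as a single connection URI via its `--datastore-endpoint` flag, so these variables ultimately combine into a string of roughly this shape (the host below is hypothetical; the real one comes from the provisioned RDS instance):

```shell
# sketch: combining the DB_* variables into a k3s datastore endpoint URI
DB_ENGINE="postgres"
DB_USERNAME="k3suser"
DB_PASSWORD="example-password"            # placeholder, not a real credential
DB_HOST="mydb.example.rds.amazonaws.com"  # hypothetical RDS hostname
DB_NAME="k3s"

echo "${DB_ENGINE}://${DB_USERNAME}:${DB_PASSWORD}@${DB_HOST}:5432/${DB_NAME}"
# prints: postgres://k3suser:example-password@mydb.example.rds.amazonaws.com:5432/k3s
```

Note this is only the postgres shape on its default port 5432; k3s's mysql DSN format differs (a Go-style `tcp(host:port)` address), so treat this strictly as a sketch.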

**K3S Server Variables**

| Name | Description |
|:--------------------:|:---------------------------------------------------------------------------------:|
| SERVER_HA | Whether or not to use HA mode; if not, SQLite will be used as the storage backend |
| SERVER_COUNT | k3s server (master) node count |
| SERVER_INSTANCE_TYPE | EC2 instance type for the k3s server(s) |

**K3S Agent Variables**

| Name | Description |
|:-------------------:|:-----------------------------------------:|
| AGENT_NODE_COUNT | Number of k3s agents that will be created |
| AGENT_INSTANCE_TYPE | EC2 instance type for the k3s agents |

**Prometheus Server Variables**

| Name | Description |
|:-------------------------:|:-------------------------------------------------------------------:|
| PROM_WORKER_NODE_COUNT | Number of k3s agents created for the Prometheus deployment |
| PROM_WORKER_INSTANCE_TYPE | EC2 instance type for the Prometheus agent nodes |

## Usage

### build

The repo includes a Makefile that runs the different sections. To build the server and agent layers, adjust the config file in `tests/perf/scripts/config` and then run:

```
cd tests/perf
make apply
```

This builds the db, server, and agent layers, and also writes a kubeconfig file to `tests/kubeconfig.yaml`.
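Once `make apply` has finished, standard tooling can be pointed at that kubeconfig; for example (assuming you are still in `tests/perf` and have `kubectl` installed):

```shell
# point kubectl (or other tools) at the kubeconfig written by make apply
export KUBECONFIG="$PWD/tests/kubeconfig.yaml"
echo "$KUBECONFIG"
# kubectl get nodes   # would list the k3s servers and agents once the cluster is up
```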

### test

To start the clusterloader2 load test, modify `tests/perf/tests/load/config.yaml` as needed and then run:

```
cd tests/perf
make test
```

### destroy

To destroy the cluster, run:

```
make destroy
make clean
```

## MAIN VARIABLES ##
####################
CLUSTER_NAME="loadtest-k3s"
DOMAIN_NAME=""
ZONE_ID=""
K3S_VERSION="v0.11.0-alpha2"
EXTRA_SSH_KEYS="" # comma separated public keys
PRIVATE_KEY_PATH="~/.ssh/id_rsa"
DEBUG=1

## K3S DB VARIABLES ##
##########################
DB_ENGINE="postgres"
DB_INSTANCE_TYPE="db.m4.4xlarge"
DB_NAME="k3s"
DB_USERNAME="k3suser"
DB_PASSWORD="024d9442b3add64b7ef90655bc302cd8"
DB_VERSION=11.5

## K3S SERVER VARIABLES ##
##########################
SERVER_HA=1
SERVER_COUNT=3
SERVER_INSTANCE_TYPE="m5.2xlarge"

## PROMETHEUS SERVER VARIABLES ##
#################################
PROM_WORKER_NODE_COUNT=1
PROM_WORKER_INSTANCE_TYPE="m5.large"
PROM_HOST="prometheus-load.eng.rancher.space"
GRAF_HOST="prometheus-load.eng.rancher.space"

## K3S AGENTS VARIABLES ##
##########################
AGENT_NODE_COUNT=100
AGENT_INSTANCE_TYPE="m5.large"

#cloud-config
%{ if length(extra_ssh_keys) > 0 }
ssh_authorized_keys:
%{ for ssh_key in extra_ssh_keys }
- ${ssh_key}
%{ endfor }
%{ endif }
runcmd:
- echo "net.ipv4.neigh.default.gc_interval = 3600" >> /etc/sysctl.conf
- echo "net.ipv4.neigh.default.gc_stale_time = 3600" >> /etc/sysctl.conf
- echo "net.ipv4.neigh.default.gc_thresh3 = 16384" >> /etc/sysctl.conf
- echo "net.ipv4.neigh.default.gc_thresh2 = 8192" >> /etc/sysctl.conf
- echo "net.ipv4.neigh.default.gc_thresh1 = 4096" >> /etc/sysctl.conf
- echo "fs.file-max = 12000500" >> /etc/sysctl.conf
- echo "fs.nr_open = 20000500" >> /etc/sysctl.conf
- echo "net.ipv4.tcp_mem = '10000000 10000000 10000000'" >> /etc/sysctl.conf
- echo "net.ipv4.tcp_rmem = '1024 4096 16384'" >> /etc/sysctl.conf
- echo "net.ipv4.tcp_wmem = '1024 4096 16384'" >> /etc/sysctl.conf
- echo "net.core.rmem_max = 16384" >> /etc/sysctl.conf
- echo "net.core.wmem_max = 16384" >> /etc/sysctl.conf
- ulimit -n 20000000
- echo "# <domain> <type> <item> <value>" >> /etc/security/limits.d/limits.conf
- echo " * soft nofile 20000" >> /etc/security/limits.d/limits.conf
- echo " * hard nofile 20000" >> /etc/security/limits.d/limits.conf
- sysctl -p
- apt-get update
- apt-get install -y git vim software-properties-common resolvconf linux-headers-$(uname -r)
- echo "nameserver 1.1.1.1" > /etc/resolvconf/resolv.conf.d/tail
- echo "RateLimitIntervalSec=0" >> /etc/systemd/journald.conf
- echo "RateLimitBurst=0" >> /etc/systemd/journald.conf
- curl -sSL https://releases.rancher.com/install-docker/19.03.sh | sh

#!/bin/bash
set -x

IFS=',' read -r -a public_ips <<< "$PUBLIC_IPS"
IFS=',' read -r -a private_ips <<< "$PRIVATE_IPS"

conn_string=""
for i in "${!private_ips[@]}"; do
  conn_string=$conn_string"etcd-$i=http://${private_ips[i]}:2380,"
done
conn_string=${conn_string%?}
for i in "${!public_ips[@]}"; do
  while true; do
    ssh -i $SSH_KEY_PATH -l ubuntu ${public_ips[i]} "sudo docker run -v /etcd-data:/etcd-data -d -p ${private_ips[i]}:2379:2379 -p ${private_ips[i]}:2380:2380 quay.io/coreos/etcd:$DB_VERSION etcd --initial-advertise-peer-urls http://${private_ips[i]}:2380 --name=etcd-$i --data-dir=/etcd-data --advertise-client-urls=http://0.0.0.0:2379 --listen-peer-urls=http://0.0.0.0:2380 --listen-client-urls=http://0.0.0.0:2379 --initial-cluster-token=etcd-cluster-1 --initial-cluster-state new --initial-cluster $conn_string"
    if [ $? == 0 ]; then
      break
    fi
    sleep 10
  done
done
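The loop above assembles etcd's `--initial-cluster` value by concatenating `name=peer-url` pairs and then trimming the trailing comma with `${conn_string%?}`. Isolated with made-up IPs, the construction behaves like this:

```shell
# same conn_string construction as the script, with example private IPs
PRIVATE_IPS="10.0.1.10,10.0.1.11,10.0.1.12"
IFS=',' read -r -a private_ips <<< "$PRIVATE_IPS"

conn_string=""
for i in "${!private_ips[@]}"; do
  conn_string=$conn_string"etcd-$i=http://${private_ips[i]}:2380,"
done
conn_string=${conn_string%?}   # strip the trailing comma

echo "$conn_string"
# prints: etcd-0=http://10.0.1.10:2380,etcd-1=http://10.0.1.11:2380,etcd-2=http://10.0.1.12:2380
```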

%{ if prom_worker_node_count != 0 }
---
apiVersion: rbac.authorization.k8s.io/v1
# kubernetes versions before 1.8.0 should use rbac.authorization.k8s.io/v1beta1
kind: ClusterRoleBinding
metadata:
  name: kube-state-metrics
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: kube-state-metrics
subjects:
- kind: ServiceAccount
  name: kube-state-metrics
  namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
# kubernetes versions before 1.8.0 should use rbac.authorization.k8s.io/v1beta1
kind: ClusterRole
metadata:
  name: kube-state-metrics
rules:
- apiGroups: [""]
  resources:
  - configmaps
  - secrets
  - nodes
  - pods
  - services
  - resourcequotas
  - replicationcontrollers
  - limitranges
  - persistentvolumeclaims
  - persistentvolumes
  - namespaces
  - endpoints
  verbs: ["list", "watch"]
- apiGroups: ["extensions"]
  resources:
  - daemonsets
  - deployments
  - replicasets
  - ingresses
  verbs: ["list", "watch"]
- apiGroups: ["apps"]
  resources:
  - daemonsets
  - deployments
  - replicasets
  - statefulsets
  verbs: ["list", "watch"]
- apiGroups: ["batch"]
  resources:
  - cronjobs
  - jobs
  verbs: ["list", "watch"]
- apiGroups: ["autoscaling"]
  resources:
  - horizontalpodautoscalers
  verbs: ["list", "watch"]
- apiGroups: ["policy"]
  resources:
  - poddisruptionbudgets
  verbs: ["list", "watch"]
- apiGroups: ["certificates.k8s.io"]
  resources:
  - certificatesigningrequests
  verbs: ["list", "watch"]
- apiGroups: ["storage.k8s.io"]
  resources:
  - storageclasses
  verbs: ["list", "watch"]
- apiGroups: ["autoscaling.k8s.io"]
  resources:
  - verticalpodautoscalers
  verbs: ["list", "watch"]
---
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    k8s-app: kube-state-metrics
  name: kube-state-metrics
  namespace: kube-system
spec:
  selector:
    matchLabels:
      k8s-app: kube-state-metrics
  replicas: 1
  template:
    metadata:
      labels:
        k8s-app: kube-state-metrics
    spec:
      serviceAccountName: kube-state-metrics
      containers:
      - name: kube-state-metrics
        image: quay.io/coreos/kube-state-metrics:v1.7.2
        ports:
        - name: http-metrics
          containerPort: 8080
        - name: telemetry
          containerPort: 8081
        livenessProbe:
          httpGet:
            path: /healthz
            port: 8080
          initialDelaySeconds: 5
          timeoutSeconds: 5
        readinessProbe:
          httpGet:
            path: /
            port: 8080
          initialDelaySeconds: 5
          timeoutSeconds: 5
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: kube-state-metrics
  namespace: kube-system
---
apiVersion: v1
kind: Service
metadata:
  name: kube-state-metrics
  namespace: kube-system
  labels:
    k8s-app: kube-state-metrics
  annotations:
    prometheus.io/scrape: 'true'
spec:
  ports:
  - name: http-metrics
    port: 8080
    targetPort: http-metrics
    protocol: TCP
  - name: telemetry
    port: 8081
    targetPort: telemetry
    protocol: TCP
  selector:
    k8s-app: kube-state-metrics
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: slo-monitor
subjects:
- kind: ServiceAccount
  name: slo-monitor
  namespace: kube-system
roleRef:
  kind: ClusterRole
  name: slo-monitor
  apiGroup: rbac.authorization.k8s.io
---
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: slo-monitor
  namespace: kube-system
rules:
- apiGroups: [""]
  resources: ["pods", "events"]
  verbs: ["get", "watch", "list"]
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: slo-monitor
  namespace: kube-system
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: slo-monitor
  namespace: kube-system
  labels:
    app: slo-monitor
spec:
  selector:
    matchLabels:
      app: slo-monitor
  template:
    metadata:
      labels:
        app: slo-monitor
      annotations:
        prometheus.io/scrape: "true"
    spec:
      containers:
      - name: slo-monitor
        image: gcr.io/google-containers/slo-monitor:0.12.0
        command:
        - /slo-monitor
        - --alsologtostderr=true
        imagePullPolicy: Always
        ports:
        - name: metrics
          containerPort: 8080
        resources:
          requests:
            cpu: 300m
            memory: 100Mi
          limits:
            cpu: 300m
            memory: 100Mi
      restartPolicy: Always
      serviceAccountName: slo-monitor
---
apiVersion: v1
kind: Service
metadata:
  name: slo-monitor
  namespace: kube-system
  labels:
    app: slo-monitor
spec:
  selector:
    app: slo-monitor
  ports:
  - name: metrics
    port: 80
    targetPort: metrics
  type: ClusterIP
%{ endif }

%{ if prom_worker_node_count != 0 }
---
apiVersion: v1
kind: Namespace
metadata:
  name: monitoring

---
apiVersion: helm.cattle.io/v1
kind: HelmChart
metadata:
  name: prometheus
  namespace: kube-system
spec:
  chart: https://raw.githubusercontent.com/galal-hussein/charts/master/prometheus-9.2.0.tgz
  targetNamespace: monitoring
  valuesContent: |-
    alertmanager:
      nodeSelector:
        prom: "true"
      persistentVolume:
        enabled: false
    kubeStateMetrics:
      nodeSelector:
        prom: "true"
    nodeExporter:
      nodeSelector:
        prom: "true"
    server:
      nodeSelector:
        prom: "true"
      ingress:
        enabled: true
        hosts:
        - ${prom_host}
      persistentVolume:
        enabled: false
    pushgateway:
      nodeSelector:
        prom: "true"
      persistentVolume:
        enabled: false
    serverFiles:
      prometheus.yml:
        scrape_configs:
        - job_name: prometheus
          static_configs:
          - targets:
            - localhost:9090
        - job_name: kubernetes-apiservers
          scrape_interval: 10s
          scrape_timeout: 10s
          metrics_path: /metrics
          scheme: https
          kubernetes_sd_configs:
          - api_server: null
            role: endpoints
            namespaces:
              names: []
          bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
          tls_config:
            ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
            insecure_skip_verify: true
          relabel_configs:
          - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
            separator: ;
            regex: default;kubernetes;https
            replacement: $1
            action: keep
---
apiVersion: helm.cattle.io/v1
kind: HelmChart
metadata:
  name: grafana
  namespace: kube-system
spec:
  chart: stable/grafana
  targetNamespace: monitoring
  valuesContent: |-
    ingress:
      enabled: true
      hosts:
      - ${graf_host}
    nodeSelector:
      prom: "true"
%{ endif }