Add standalone npd on GCI.

pull/6/head
Random-Liu 2017-01-19 01:20:43 -08:00
parent 56afb95641
commit d40c0a7099
16 changed files with 223 additions and 50 deletions

View File

@ -1,6 +1,6 @@
# Maintainers
Lantao Liu <lantaol@google.com>
Random-Liu <lantaol@google.com>
[![Analytics](https://kubernetes-site.appspot.com/UA-36037335-10/GitHub/cluster/addons/node-problem-detector/MAINTAINERS.md?pixel)]()

View File

@ -1,38 +0,0 @@
apiVersion: extensions/v1beta1
kind: DaemonSet
metadata:
name: node-problem-detector-v0.1
namespace: kube-system
labels:
k8s-app: node-problem-detector
version: v0.1
kubernetes.io/cluster-service: "true"
spec:
template:
metadata:
labels:
k8s-app: node-problem-detector
version: v0.1
kubernetes.io/cluster-service: "true"
spec:
hostNetwork: true
containers:
- name: node-problem-detector
image: gcr.io/google_containers/node-problem-detector:v0.1
securityContext:
privileged: true
resources:
limits:
cpu: "200m"
memory: "100Mi"
requests:
cpu: "20m"
memory: "20Mi"
volumeMounts:
- name: log
mountPath: /log
readOnly: true
volumes:
- name: log
hostPath:
path: /var/log/

View File

@ -0,0 +1,77 @@
apiVersion: v1
kind: ServiceAccount
metadata:
name: node-problem-detector
namespace: kube-system
labels:
kubernetes.io/cluster-service: "true"
---
apiVersion: rbac.authorization.k8s.io/v1alpha1
kind: ClusterRoleBinding
metadata:
name: npd-binding
labels:
kubernetes.io/cluster-service: "true"
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: system:node-problem-detector
subjects:
- kind: ServiceAccount
name: node-problem-detector
namespace: kube-system
---
apiVersion: extensions/v1beta1
kind: DaemonSet
metadata:
name: npd-v0.3.0-alpha.1
namespace: kube-system
labels:
k8s-app: node-problem-detector
version: v0.3.0-alpha.1
kubernetes.io/cluster-service: "true"
spec:
template:
metadata:
labels:
k8s-app: node-problem-detector
version: v0.3.0-alpha.1
kubernetes.io/cluster-service: "true"
spec:
containers:
- name: node-problem-detector
image: gcr.io/google_containers/node-problem-detector:v0.3.0-alpha.1
command:
- /node-problem-detector
- --logtostderr
# Pass both config to support both journald and syslog.
- --system-log-monitors=/config/kernel-monitor.json,/config/kernel-monitor-filelog.json
securityContext:
privileged: true
resources:
limits:
cpu: "200m"
memory: "100Mi"
requests:
cpu: "20m"
memory: "20Mi"
env:
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
volumeMounts:
- name: log
mountPath: /var/log
readOnly: true
- name: localtime
mountPath: /etc/localtime
readOnly: true
volumes:
- name: log
hostPath:
path: /var/log/
- name: localtime
hostPath:
path: /etc/localtime
serviceAccountName: node-problem-detector

View File

@ -0,0 +1,14 @@
apiVersion: rbac.authorization.k8s.io/v1alpha1
kind: ClusterRoleBinding
metadata:
name: npd-binding
labels:
kubernetes.io/cluster-service: "true"
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: system:node-problem-detector
subjects:
- apiVersion: rbac/v1alpha1
kind: User
name: system:node-problem-detector

View File

@ -627,7 +627,7 @@ DOCKER_REGISTRY_MIRROR_URL: $(yaml-quote ${DOCKER_REGISTRY_MIRROR_URL:-})
ENABLE_L7_LOADBALANCING: $(yaml-quote ${ENABLE_L7_LOADBALANCING:-none})
ENABLE_CLUSTER_LOGGING: $(yaml-quote ${ENABLE_CLUSTER_LOGGING:-false})
ENABLE_CLUSTER_UI: $(yaml-quote ${ENABLE_CLUSTER_UI:-false})
ENABLE_NODE_PROBLEM_DETECTOR: $(yaml-quote ${ENABLE_NODE_PROBLEM_DETECTOR:-false})
ENABLE_NODE_PROBLEM_DETECTOR: $(yaml-quote ${ENABLE_NODE_PROBLEM_DETECTOR:-none})
ENABLE_NODE_LOGGING: $(yaml-quote ${ENABLE_NODE_LOGGING:-false})
ENABLE_RESCHEDULER: $(yaml-quote ${ENABLE_RESCHEDULER:-false})
LOGGING_DESTINATION: $(yaml-quote ${LOGGING_DESTINATION:-})
@ -641,6 +641,7 @@ DNS_DOMAIN: $(yaml-quote ${DNS_DOMAIN:-})
ENABLE_DNS_HORIZONTAL_AUTOSCALER: $(yaml-quote ${ENABLE_DNS_HORIZONTAL_AUTOSCALER:-false})
KUBELET_TOKEN: $(yaml-quote ${KUBELET_TOKEN:-})
KUBE_PROXY_TOKEN: $(yaml-quote ${KUBE_PROXY_TOKEN:-})
NODE_PROBLEM_DETECTOR_TOKEN: $(yaml-quote ${NODE_PROBLEM_DETECTOR_TOKEN:-})
ADMISSION_CONTROL: $(yaml-quote ${ADMISSION_CONTROL:-})
MASTER_IP_RANGE: $(yaml-quote ${MASTER_IP_RANGE})
RUNTIME_CONFIG: $(yaml-quote ${RUNTIME_CONFIG})
@ -1048,6 +1049,7 @@ function parse-master-env() {
local master_env=$(get-master-env)
KUBELET_TOKEN=$(get-env-val "${master_env}" "KUBELET_TOKEN")
KUBE_PROXY_TOKEN=$(get-env-val "${master_env}" "KUBE_PROXY_TOKEN")
NODE_PROBLEM_DETECTOR_TOKEN=$(get-env-val "${master_env}" "NODE_PROBLEM_DETECTOR_TOKEN")
CA_CERT_BASE64=$(get-env-val "${master_env}" "CA_CERT")
CA_KEY_BASE64=$(get-env-val "${master_env}" "CA_KEY")
KUBEAPISERVER_CERT_BASE64=$(get-env-val "${master_env}" "KUBEAPISERVER_CERT")

View File

@ -143,7 +143,16 @@ CLUSTER_REGISTRY_DISK_TYPE_GCE="${CLUSTER_REGISTRY_DISK_TYPE_GCE:-pd-standard}"
ENABLE_CLUSTER_UI="${KUBE_ENABLE_CLUSTER_UI:-true}"
# Optional: Install node problem detector.
ENABLE_NODE_PROBLEM_DETECTOR="${KUBE_ENABLE_NODE_PROBLEM_DETECTOR:-true}"
# none - Not run node problem detector.
# daemonset - Run node problem detector as daemonset.
# standalone - Run node problem detector as standalone system daemon.
if [[ "${NODE_OS_DISTRIBUTION}" == "gci" ]]; then
# Enable standalone mode by default for gci.
# TODO: Consider upgrade test.
ENABLE_NODE_PROBLEM_DETECTOR="${KUBE_ENABLE_NODE_PROBLEM_DETECTOR:-standalone}"
else
ENABLE_NODE_PROBLEM_DETECTOR="${KUBE_ENABLE_NODE_PROBLEM_DETECTOR:-daemonset}"
fi
# Optional: Create autoscaler for cluster's nodes.
ENABLE_CLUSTER_AUTOSCALER="${KUBE_ENABLE_CLUSTER_AUTOSCALER:-false}"

View File

@ -168,7 +168,16 @@ CLUSTER_REGISTRY_DISK_TYPE_GCE="${CLUSTER_REGISTRY_DISK_TYPE_GCE:-pd-standard}"
ENABLE_CLUSTER_UI="${KUBE_ENABLE_CLUSTER_UI:-true}"
# Optional: Install node problem detector.
ENABLE_NODE_PROBLEM_DETECTOR="${KUBE_ENABLE_NODE_PROBLEM_DETECTOR:-true}"
# none - Not run node problem detector.
# daemonset - Run node problem detector as daemonset.
# standalone - Run node problem detector as standalone system daemon.
if [[ "${NODE_OS_DISTRIBUTION}" == "gci" ]]; then
# Enable standalone mode by default for gci.
# TODO: Consider upgrade test.
ENABLE_NODE_PROBLEM_DETECTOR="${KUBE_ENABLE_NODE_PROBLEM_DETECTOR:-standalone}"
else
ENABLE_NODE_PROBLEM_DETECTOR="${KUBE_ENABLE_NODE_PROBLEM_DETECTOR:-daemonset}"
fi
# Optional: Create autoscaler for cluster's nodes.
ENABLE_CLUSTER_AUTOSCALER="${KUBE_ENABLE_CLUSTER_AUTOSCALER:-false}"

View File

@ -1174,7 +1174,7 @@ function start-kube-addons {
if [[ "${ENABLE_CLUSTER_UI:-}" == "true" ]]; then
setup-addon-manifests "addons" "dashboard"
fi
if [[ "${ENABLE_NODE_PROBLEM_DETECTOR:-}" == "true" ]]; then
if [[ "${ENABLE_NODE_PROBLEM_DETECTOR:-}" == "daemonset" ]]; then
setup-addon-manifests "addons" "node-problem-detector"
fi
if echo "${ADMISSION_CONTROL:-}" | grep -q "LimitRanger"; then

View File

@ -242,6 +242,9 @@ function create-master-auth {
if [[ -n "${KUBE_PROXY_TOKEN:-}" ]]; then
replace_prefixed_line "${known_tokens_csv}" "${KUBE_PROXY_TOKEN}," "system:kube-proxy,uid:kube_proxy"
fi
if [[ -n "${NODE_PROBLEM_DETECTOR_TOKEN:-}" ]]; then
replace_prefixed_line "${known_tokens_csv}" "${NODE_PROBLEM_DETECTOR_TOKEN}," "system:node-problem-detector,uid:node-problem-detector"
fi
local use_cloud_config="false"
cat <<EOF >/etc/gce.conf
[global]
@ -458,6 +461,29 @@ current-context: kube-scheduler
EOF
}
function create-node-problem-detector-kubeconfig {
echo "Creating node-problem-detector kubeconfig file"
mkdir -p /var/lib/node-problem-detector
cat <<EOF >/var/lib/node-problem-detector/kubeconfig
apiVersion: v1
kind: Config
users:
- name: node-problem-detector
user:
token: ${NODE_PROBLEM_DETECTOR_TOKEN}
clusters:
- name: local
cluster:
certificate-authority-data: ${CA_CERT}
contexts:
- context:
cluster: local
user: node-problem-detector
name: service-account-context
current-context: service-account-context
EOF
}
function create-master-etcd-auth {
if [[ -n "${ETCD_CA_CERT:-}" && -n "${ETCD_PEER_KEY:-}" && -n "${ETCD_PEER_CERT:-}" ]]; then
local -r auth_dir="/etc/srv/kubernetes"
@ -660,6 +686,37 @@ EOF
systemctl start kubelet.service
}
# This function assembles the node problem detector systemd service file and
# starts it using systemctl.
function start-node-problem-detector {
echo "Start node problem detector"
local -r npd_bin="${KUBE_HOME}/bin/node-problem-detector"
local -r km_config="${KUBE_HOME}/node-problem-detector/config/kernel-monitor.json"
echo "Using node problem detector binary at ${npd_bin}"
local flags="${NPD_TEST_LOG_LEVEL:-"--v=2"} ${NPD_TEST_ARGS:-}"
flags+=" --logtostderr"
flags+=" --system-log-monitors=${km_config}"
flags+=" --apiserver-override=https://${KUBERNETES_MASTER_NAME}?inClusterConfig=false&auth=/var/lib/node-problem-detector/kubeconfig"
# Write the systemd service file for node problem detector.
cat <<EOF >/etc/systemd/system/node-problem-detector.service
[Unit]
Description=Kubernetes node problem detector
Requires=network-online.target
After=network-online.target
[Service]
Restart=always
RestartSec=10
ExecStart=${npd_bin} ${flags}
[Install]
WantedBy=multi-user.target
EOF
systemctl start node-problem-detector.service
}
# Create the log file and set its properties.
#
# $1 is the file to create.
@ -1249,9 +1306,13 @@ function start-kube-addons {
if [[ "${ENABLE_CLUSTER_UI:-}" == "true" ]]; then
setup-addon-manifests "addons" "dashboard"
fi
if [[ "${ENABLE_NODE_PROBLEM_DETECTOR:-}" == "true" ]]; then
if [[ "${ENABLE_NODE_PROBLEM_DETECTOR:-}" == "daemonset" ]]; then
setup-addon-manifests "addons" "node-problem-detector"
fi
if [[ "${ENABLE_NODE_PROBLEM_DETECTOR:-}" == "standalone" ]]; then
# Setup role binding for standalone node problem detector.
setup-addon-manifests "addons" "node-problem-detector/standalone"
fi
if echo "${ADMISSION_CONTROL:-}" | grep -q "LimitRanger"; then
setup-addon-manifests "admission-controls" "limit-range"
fi
@ -1404,6 +1465,9 @@ if [[ "${KUBERNETES_MASTER:-}" == "true" ]]; then
else
create-kubelet-kubeconfig
create-kubeproxy-kubeconfig
if [[ "${ENABLE_NODE_PROBLEM_DETECTOR:-}" == "standalone" ]]; then
create-node-problem-detector-kubeconfig
fi
fi
override-kubectl
@ -1434,6 +1498,9 @@ else
if [[ "${PREPULL_E2E_IMAGES:-}" == "true" ]]; then
start-image-puller
fi
if [[ "${ENABLE_NODE_PROBLEM_DETECTOR:-}" == "standalone" ]]; then
start-node-problem-detector
fi
fi
reset-motd
echo "Done for the configuration for kubernetes"

View File

@ -130,6 +130,22 @@ function install-gci-mounter-tools {
chmod a+x "${rkt_dst}/rkt"
}
# Install node problem detector binary.
function install-node-problem-detector {
local -r npd_version="v0.3.0-alpha.1"
local -r npd_sha1="46f963fac14d92021c8b2a648a6cb0337c1bc833"
local -r npd_release_path="https://storage.googleapis.com/kubernetes-release"
local -r npd_tar="node-problem-detector-${npd_version}.tar.gz"
download-or-bust "${npd_sha1}" "${npd_release_path}/node-problem-detector/${npd_tar}"
local -r npd_dir="${KUBE_HOME}/node-problem-detector"
mkdir -p "${npd_dir}"
tar xzf "${KUBE_HOME}/${npd_tar}" -C "${npd_dir}" --overwrite
mv "${npd_dir}/bin"/* "${KUBE_HOME}/bin"
chmod a+x "${KUBE_HOME}/bin/node-problem-detector"
rmdir "${npd_dir}/bin"
rm -f "${KUBE_HOME}/${npd_tar}"
}
# Downloads kubernetes binaries and kube-system manifest tarball, unpacks them,
# and places them into suitable directories. Files are placed in /home/kubernetes.
function install-kube-binary-config {
@ -153,6 +169,9 @@ function install-kube-binary-config {
cp "${src_dir}/"*.docker_tag "${dst_dir}"
if [[ "${KUBERNETES_MASTER:-}" == "false" ]]; then
cp "${src_dir}/kube-proxy.tar" "${dst_dir}"
if [[ "${ENABLE_NODE_PROBLEM_DETECTOR:-}" == "standalone" ]]; then
install-node-problem-detector
fi
else
cp "${src_dir}/kube-apiserver.tar" "${dst_dir}"
cp "${src_dir}/kube-controller-manager.tar" "${dst_dir}"

View File

@ -972,6 +972,9 @@ start_kube_addons() {
if [ "${ENABLE_CLUSTER_UI:-}" = "true" ]; then
setup_addon_manifests "addons" "dashboard"
fi
if [[ "${ENABLE_NODE_PROBLEM_DETECTOR:-}" == "daemonset" ]]; then
setup-addon-manifests "addons" "node-problem-detector"
fi
if echo "${ADMISSION_CONTROL:-}" | grep -q "LimitRanger"; then
setup_addon_manifests "admission-controls" "limit-range"
fi

View File

@ -172,6 +172,7 @@ function get-node-os() {
# Vars set:
# KUBELET_TOKEN
# KUBE_PROXY_TOKEN
# NODE_PROBLEM_DETECTOR_TOKEN
# CA_CERT_BASE64
# EXTRA_DOCKER_OPTS
# KUBELET_CERT_BASE64
@ -206,6 +207,7 @@ fi
# INSTANCE_GROUPS
# KUBELET_TOKEN
# KUBE_PROXY_TOKEN
# NODE_PROBLEM_DETECTOR_TOKEN
# CA_CERT_BASE64
# EXTRA_DOCKER_OPTS
# KUBELET_CERT_BASE64
@ -228,6 +230,7 @@ function prepare-node-upgrade() {
local node_env=$(get-node-env)
KUBELET_TOKEN=$(get-env-val "${node_env}" "KUBELET_TOKEN")
KUBE_PROXY_TOKEN=$(get-env-val "${node_env}" "KUBE_PROXY_TOKEN")
NODE_PROBLEM_DETECTOR_TOKEN=$(get-env-val "${node_env}" "NODE_PROBLEM_DETECTOR_TOKEN")
CA_CERT_BASE64=$(get-env-val "${node_env}" "CA_CERT")
EXTRA_DOCKER_OPTS=$(get-env-val "${node_env}" "EXTRA_DOCKER_OPTS")
KUBELET_CERT_BASE64=$(get-env-val "${node_env}" "KUBELET_CERT")

View File

@ -825,6 +825,9 @@ function create-master() {
# http://issue.k8s.io/3168
KUBELET_TOKEN=$(dd if=/dev/urandom bs=128 count=1 2>/dev/null | base64 | tr -d "=+/" | dd bs=32 count=1 2>/dev/null)
KUBE_PROXY_TOKEN=$(dd if=/dev/urandom bs=128 count=1 2>/dev/null | base64 | tr -d "=+/" | dd bs=32 count=1 2>/dev/null)
if [[ "${ENABLE_NODE_PROBLEM_DETECTOR:-}" == "standalone" ]]; then
NODE_PROBLEM_DETECTOR_TOKEN=$(dd if=/dev/urandom bs=128 count=1 2>/dev/null | base64 | tr -d "=+/" | dd bs=32 count=1 2>/dev/null)
fi
# Reserve the master's IP so that it can later be transferred to another VM
# without disrupting the kubelets.

View File

@ -161,10 +161,10 @@ addon-dir-create:
- file_mode: 644
{% endif %}
{% if pillar.get('enable_node_problem_detector', '').lower() == 'true' %}
/etc/kubernetes/addons/node-problem-detector/node-problem-detector.yaml:
{% if pillar.get('enable_node_problem_detector', '').lower() == 'daemonset' %}
/etc/kubernetes/addons/node-problem-detector/npd.yaml:
file.managed:
- source: salt://kube-addons/node-problem-detector/node-problem-detector.yaml
- source: salt://kube-addons/node-problem-detector/npd.yaml
- user: root
- group: root
- file_mode: 644

View File

@ -151,7 +151,7 @@
},
{
"name": "hollow-node-problem-detector",
"image": "gcr.io/google_containers/node-problem-detector:v0.3.0-alpha.0",
"image": "gcr.io/google_containers/node-problem-detector:v0.3.0-alpha.1",
"env": [
{
"name": "NODE_NAME",
@ -164,7 +164,7 @@
],
"command": [
"/node-problem-detector",
"--kernel-monitor=/config/kernel.monitor",
"--system-log-monitors=/config/kernel.monitor",
"--apiserver-override=https://{{master_ip}}:443?inClusterConfig=false&auth=/kubeconfig/npd.kubeconfig",
"--alsologtostderr",
"1>>/var/logs/npd_$(NODE_NAME).log 2>&1"

View File

@ -1,7 +1,12 @@
{
"plugin": "filelog",
"pluginConfig": {
"timestamp": "dummy",
"message": "dummy",
"timestampFormat": "dummy"
},
"logPath": "/log/faillog",
"lookback": "10m",
"startPattern": "Initializing cgroup subsys cpuset",
"bufferSize": 10,
"source": "kernel-monitor",
"conditions": [],