Merge pull request #59103 from Random-Liu/upload-container-runtime-log

Automatic merge from submit-queue. If you want to cherry-pick this change to another branch, please follow the instructions <a href="https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md">here</a>. Upload container runtime log to sd/es. I've verified this in my environment. My stackdriver has an extra `container-runtime` entry for node log, and it collects container runtime daemon log correctly. @yujuhong @feiskyer @crassirostris @piosz @kubernetes/sig-node-pr-reviews @kubernetes/sig-instrumentation-pr-reviews Signed-off-by: Lantao Liu <lantaol@google.com> **Release note**: ```release-note Container runtime daemon (e.g. dockerd) logs in GCE cluster will be uploaded to stackdriver and elasticsearch with tag `container-runtime` ```
2018-02-14 03:33:21 -08:00 · 2018-02-14 03:33:21 -08:00 · bc9c6df31d
parent f114f0e45c 8d920d095c
commit bc9c6df31d
9 changed files with 46 additions and 8 deletions
--- a/cluster/addons/fluentd-elasticsearch/fluentd-es-configmap.yaml
+++ b/cluster/addons/fluentd-elasticsearch/fluentd-es-configmap.yaml
@ -1,7 +1,7 @@
 kind: ConfigMap
 apiVersion: v1
 metadata:
-  name: fluentd-es-config-v0.1.3
+  name: fluentd-es-config-v0.1.4
  namespace: kube-system
  labels:
    addonmanager.kubernetes.io/mode: Reconcile
@ -160,6 +160,7 @@ data:
    # Examples:
    # time="2016-02-04T06:51:03.053580605Z" level=info msg="GET /containers/json"
    # time="2016-02-04T07:53:57.505612354Z" level=error msg="HTTP Error" err="No such image: -f" statusCode=404
+    # TODO(random-liu): Remove this after cri container runtime rolls out.
    <source>
      @id docker.log
      @type tail
@ -307,6 +308,7 @@ data:
    </source>

    # Logs from systemd-journal for interesting services.
+    # TODO(random-liu): Remove this after cri container runtime rolls out.
    <source>
      @id journald-docker
      @type systemd
@ -319,6 +321,18 @@ data:
      tag docker
    </source>

+    <source>
+      @id journald-container-runtime
+      @type systemd
+      filters [{ "_SYSTEMD_UNIT": "{{ container_runtime }}.service" }]
+      <storage>
+        @type local
+        persistent true
+      </storage>
+      read_from_head true
+      tag container-runtime
+    </source>
+
    <source>
      @id journald-kubelet
      @type systemd
--- a/cluster/addons/fluentd-elasticsearch/fluentd-es-ds.yaml
+++ b/cluster/addons/fluentd-elasticsearch/fluentd-es-ds.yaml
@ -113,4 +113,4 @@ spec:
          path: /usr/lib64
      - name: config-volume
        configMap:
-          name: fluentd-es-config-v0.1.3
+          name: fluentd-es-config-v0.1.4
--- a/cluster/addons/fluentd-gcp/fluentd-gcp-configmap.yaml
+++ b/cluster/addons/fluentd-gcp/fluentd-gcp-configmap.yaml
@ -102,6 +102,7 @@ data:
    # Examples:
    # time="2016-02-04T06:51:03.053580605Z" level=info msg="GET /containers/json"
    # time="2016-02-04T07:53:57.505612354Z" level=error msg="HTTP Error" err="No such image: -f" statusCode=404
+    # TODO(random-liu): Remove this after cri container runtime rolls out.
    <source>
      type tail
      format /^time="(?<time>[^)]*)" level=(?<severity>[^ ]*) msg="(?<message>[^"]*)"( err="(?<error>[^"]*)")?( statusCode=($<status_code>\d+))?/
@ -239,6 +240,8 @@ data:
    </source>

    # Logs from systemd-journal for interesting services.
+    # TODO(random-liu): Keep this for compatibility, remove this after
+    # cri container runtime rolls out.
    <source>
      type systemd
      filters [{ "_SYSTEMD_UNIT": "docker.service" }]
@ -247,6 +250,14 @@ data:
      tag docker
    </source>

+    <source>
+      type systemd
+      filters [{ "_SYSTEMD_UNIT": "{{ container_runtime }}.service" }]
+      pos_file /var/log/gcp-journald-container-runtime.pos
+      read_from_head true
+      tag container-runtime
+    </source>
+
    <source>
      type systemd
      filters [{ "_SYSTEMD_UNIT": "kubelet.service" }]
@ -387,7 +398,7 @@ data:
      num_threads 2
    </match>
 metadata:
-  name: fluentd-gcp-config-v1.2.3
+  name: fluentd-gcp-config-v1.2.4
  namespace: kube-system
  labels:
    addonmanager.kubernetes.io/mode: Reconcile
--- a/cluster/addons/fluentd-gcp/fluentd-gcp-ds.yaml
+++ b/cluster/addons/fluentd-gcp/fluentd-gcp-ds.yaml
@ -123,4 +123,4 @@ spec:
          path: /usr/lib64
      - name: config-volume
        configMap:
-          name: fluentd-gcp-config-v1.2.3
+          name: fluentd-gcp-config-v1.2.4
--- a/cluster/common.sh
+++ b/cluster/common.sh
@ -562,6 +562,7 @@ ENABLE_PROMETHEUS_TO_SD: $(yaml-quote ${ENABLE_PROMETHEUS_TO_SD:-false})
 ENABLE_POD_PRIORITY: $(yaml-quote ${ENABLE_POD_PRIORITY:-})
 CONTAINER_RUNTIME: $(yaml-quote ${CONTAINER_RUNTIME:-})
 CONTAINER_RUNTIME_ENDPOINT: $(yaml-quote ${CONTAINER_RUNTIME_ENDPOINT:-})
+CONTAINER_RUNTIME_NAME: $(yaml-quote ${CONTAINER_RUNTIME_NAME:-})
 NODE_LOCAL_SSDS_EXT: $(yaml-quote ${NODE_LOCAL_SSDS_EXT:-})
 LOAD_IMAGE_COMMAND: $(yaml-quote ${LOAD_IMAGE_COMMAND:-})
 EOF
--- a/cluster/gce/config-default.sh
+++ b/cluster/gce/config-default.sh
@ -82,7 +82,8 @@ NODE_IMAGE_PROJECT=${KUBE_GCE_NODE_PROJECT:-cos-cloud}
 NODE_SERVICE_ACCOUNT=${KUBE_GCE_NODE_SERVICE_ACCOUNT:-default}
 CONTAINER_RUNTIME=${KUBE_CONTAINER_RUNTIME:-docker}
 CONTAINER_RUNTIME_ENDPOINT=${KUBE_CONTAINER_RUNTIME_ENDPOINT:-}
-LOAD_IMAGE_COMMAND=${KUBE_LOAD_IMAGE_COMMAND:-docker load -i}
+CONTAINER_RUNTIME_NAME=${KUBE_CONTAINER_RUNTIME_NAME:-}
+LOAD_IMAGE_COMMAND=${KUBE_LOAD_IMAGE_COMMAND:-}
 RKT_VERSION=${KUBE_RKT_VERSION:-1.23.0}
 RKT_STAGE1_IMAGE=${KUBE_RKT_STAGE1_IMAGE:-coreos.com/rkt/stage1-coreos}
 # MASTER_EXTRA_METADATA is the extra instance metadata on master instance separated by commas.
--- a/cluster/gce/config-test.sh
+++ b/cluster/gce/config-test.sh
@ -80,7 +80,8 @@ NODE_IMAGE_PROJECT=${KUBE_GCE_NODE_PROJECT:-cos-cloud}
 NODE_SERVICE_ACCOUNT=${KUBE_GCE_NODE_SERVICE_ACCOUNT:-default}
 CONTAINER_RUNTIME=${KUBE_CONTAINER_RUNTIME:-docker}
 CONTAINER_RUNTIME_ENDPOINT=${KUBE_CONTAINER_RUNTIME_ENDPOINT:-}
-LOAD_IMAGE_COMMAND=${KUBE_LOAD_IMAGE_COMMAND:-docker load -i}
+CONTAINER_RUNTIME_NAME=${KUBE_CONTAINER_RUNTIME_NAME:-}
+LOAD_IMAGE_COMMAND=${KUBE_LOAD_IMAGE_COMMAND:-}
 GCI_DOCKER_VERSION=${KUBE_GCI_DOCKER_VERSION:-}
 RKT_VERSION=${KUBE_RKT_VERSION:-1.23.0}
 RKT_STAGE1_IMAGE=${KUBE_RKT_STAGE1_IMAGE:-coreos.com/rkt/stage1-coreos}
--- a/cluster/gce/gci/configure-helper.sh
+++ b/cluster/gce/gci/configure-helper.sh
@ -2028,6 +2028,12 @@ function start-fluentd-resource-update {
  wait-for-apiserver-and-update-fluentd ${fluentd_gcp_yaml} &
 }

+# Replace every {{ container_runtime }} placeholder in the yaml file given as $1 with ${CONTAINER_RUNTIME_NAME}, defaulting to docker.
+function update-container-runtime {
+  local -r configmap_yaml="$1"
+  sed -i -e "s@{{ *container_runtime *}}@${CONTAINER_RUNTIME_NAME:-docker}@g" "${configmap_yaml}"
+}
+
 # Updates parameters in yaml file for prometheus-to-sd configuration, or
 # removes component if it is disabled.
 function update-prometheus-to-sd-parameters {
@ -2180,15 +2186,19 @@ EOF
     [[ "${LOGGING_DESTINATION:-}" == "elasticsearch" ]] && \
     [[ "${ENABLE_CLUSTER_LOGGING:-}" == "true" ]]; then
    setup-addon-manifests "addons" "fluentd-elasticsearch"
+    local -r fluentd_es_configmap_yaml="${dst_dir}/fluentd-elasticsearch/fluentd-es-configmap.yaml"
+    update-container-runtime ${fluentd_es_configmap_yaml}
  fi
  if [[ "${ENABLE_NODE_LOGGING:-}" == "true" ]] && \
     [[ "${LOGGING_DESTINATION:-}" == "gcp" ]]; then
    setup-addon-manifests "addons" "fluentd-gcp"
    local -r event_exporter_yaml="${dst_dir}/fluentd-gcp/event-exporter.yaml"
    local -r fluentd_gcp_yaml="${dst_dir}/fluentd-gcp/fluentd-gcp-ds.yaml"
+    local -r fluentd_gcp_configmap_yaml="${dst_dir}/fluentd-gcp/fluentd-gcp-configmap.yaml"
    update-prometheus-to-sd-parameters ${event_exporter_yaml}
    update-prometheus-to-sd-parameters ${fluentd_gcp_yaml}
    start-fluentd-resource-update ${fluentd_gcp_yaml}
+    update-container-runtime ${fluentd_gcp_configmap_yaml}
  fi
  if [[ "${ENABLE_CLUSTER_UI:-}" == "true" ]]; then
    setup-addon-manifests "addons" "dashboard"
--- a/test/e2e/instrumentation/logging/stackdrvier/basic.go
+++ b/test/e2e/instrumentation/logging/stackdrvier/basic.go
@ -178,9 +178,9 @@ var _ = instrumentation.SIGDescribe("Cluster level logging implemented by Stackd
 				framework.ExpectNoError(err)
 			})

-			ginkgo.By("Waiting for some docker logs to be ingested from each node", func() {
+			ginkgo.By("Waiting for some container runtime logs to be ingested from each node", func() {
 				nodeIds := utils.GetNodeIds(f.ClientSet)
-				log := fmt.Sprintf("projects/%s/logs/docker", framework.TestContext.CloudConfig.ProjectID)
+				log := fmt.Sprintf("projects/%s/logs/container-runtime", framework.TestContext.CloudConfig.ProjectID)
 				c := utils.NewLogChecker(p, utils.UntilFirstEntryFromLog(log), utils.JustTimeout, nodeIds...)
 				err := utils.WaitForLogs(c, ingestionInterval, ingestionTimeout)
 				framework.ExpectNoError(err)