Merge pull request #59128 from bmoyles0117/master

Automatic merge from submit-queue (batch tested with PRs 60433, 59982, 59128, 60243, 60440). If you want to cherry-pick this change to another branch, please follow the instructions <a href="https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md">here</a>. [fluentd-gcp addon] Update to use Stackdriver Agent image. Update the fluentd DaemonSet to use the Stackdriver Logging Agent container image. The Stackdriver Logging Agent container image uses fluentd v0.14.25. We add a special label to each log record as a signal to logging backends to handle both new and legacy resource types. **Release note:** ```release-note [fluentd-gcp addon] Switch to the image, provided by Stackdriver. ```
2018-02-27 08:25:43 -08:00 · 2018-02-27 08:25:43 -08:00 · 7480face90
parent 40889e7429 84a86cffce
commit 7480face90
6 changed files with 64 additions and 50 deletions
--- a/cluster/addons/fluentd-gcp/fluentd-gcp-configmap.yaml
+++ b/cluster/addons/fluentd-gcp/fluentd-gcp-configmap.yaml
@ -46,33 +46,42 @@ data:
    # CRI Log Example:
    # 2016-02-17T00:04:05.931087621Z stdout F [info:2016-02-16T16:04:05.930-08:00] Some log text here
    <source>
-      type tail
+      @type tail
      path /var/log/containers/*.log
      pos_file /var/log/gcp-containers.log.pos
      tag reform.*
      read_from_head true
-      format multi_format
-      <pattern>
-        format json
-        time_key time
-        time_format %Y-%m-%dT%H:%M:%S.%NZ
-      </pattern>
-      <pattern>
-        format /^(?<time>.+) (?<stream>stdout|stderr) [^ ]* (?<log>.*)$/
-        time_format %Y-%m-%dT%H:%M:%S.%N%:z
-      </pattern>
+      format none
    </source>

    <filter reform.**>
-      type parser
+      @type parser
+      key_name message
+      <parse>
+        @type multi_format
+        <pattern>
+          format json
+          time_key time
+          time_format %Y-%m-%dT%H:%M:%S.%NZ
+        </pattern>
+        <pattern>
+          format /^(?<time>.+) (?<stream>stdout|stderr) [^ ]* (?<log>.*)$/
+          time_format %Y-%m-%dT%H:%M:%S.%N%:z
+        </pattern>
+      </parse>
+    </filter>
+
+    <filter reform.**>
+      @type parser
      format /^(?<severity>\w)(?<time>\d{4} [^\s]*)\s+(?<pid>\d+)\s+(?<source>[^ \]]+)\] (?<log>.*)/
      reserve_data true
      suppress_parse_error_log true
+      emit_invalid_record_to_error false
      key_name log
    </filter>

    <match reform.**>
-      type record_reformer
+      @type record_reformer
      enable_ruby true
      tag raw.kubernetes.${tag_suffix[4].split('-')[0..-2].join('-')}
    </match>
@ -92,7 +101,7 @@ data:
    # Example:
    # Dec 21 23:17:22 gke-foo-1-1-4b5cbd14-node-4eoj startupscript: Finished running startup script /var/run/google.startup.script
    <source>
-      type tail
+      @type tail
      format syslog
      path /var/log/startupscript.log
      pos_file /var/log/gcp-startupscript.log.pos
@ -104,7 +113,7 @@ data:
    # time="2016-02-04T07:53:57.505612354Z" level=error msg="HTTP Error" err="No such image: -f" statusCode=404
    # TODO(random-liu): Remove this after cri container runtime rolls out.
    <source>
-      type tail
+      @type tail
      format /^time="(?<time>[^)]*)" level=(?<severity>[^ ]*) msg="(?<message>[^"]*)"( err="(?<error>[^"]*)")?( statusCode=($<status_code>\d+))?/
      path /var/log/docker.log
      pos_file /var/log/gcp-docker.log.pos
@ -114,7 +123,7 @@ data:
    # Example:
    # 2016/02/04 06:52:38 filePurge: successfully removed file /var/etcd/data/member/wal/00000000000006d0-00000000010a23d1.wal
    <source>
-      type tail
+      @type tail
      # Not parsing this, because it doesn't have anything particularly useful to
      # parse out of it (like severities).
      format none
@ -130,7 +139,7 @@ data:
    # Example:
    # I0204 07:32:30.020537    3368 server.go:1048] POST /stats/container/: (13.972191ms) 200 [[Go-http-client/1.1] 10.244.1.3:40537]
    <source>
-      type tail
+      @type tail
      format multiline
      multiline_flush_interval 5s
      format_firstline /^\w\d{4}/
@ -144,7 +153,7 @@ data:
    # Example:
    # I1118 21:26:53.975789       6 proxier.go:1096] Port "nodePort for kube-system/default-http-backend:http" (:31429/tcp) was open before and is still needed
    <source>
-      type tail
+      @type tail
      format multiline
      multiline_flush_interval 5s
      format_firstline /^\w\d{4}/
@ -158,7 +167,7 @@ data:
    # Example:
    # I0204 07:00:19.604280       5 handlers.go:131] GET /api/v1/nodes: (1.624207ms) 200 [[kube-controller-manager/v1.1.3 (linux/amd64) kubernetes/6a81b50] 127.0.0.1:38266]
    <source>
-      type tail
+      @type tail
      format multiline
      multiline_flush_interval 5s
      format_firstline /^\w\d{4}/
@ -172,7 +181,7 @@ data:
    # Example:
    # I0204 06:55:31.872680       5 servicecontroller.go:277] LB already exists and doesn't need update for service kube-system/kube-ui
    <source>
-      type tail
+      @type tail
      format multiline
      multiline_flush_interval 5s
      format_firstline /^\w\d{4}/
@ -186,7 +195,7 @@ data:
    # Example:
    # W0204 06:49:18.239674       7 reflector.go:245] pkg/scheduler/factory/factory.go:193: watch of *api.Service ended with: 401: The event in requested index is outdated and cleared (the requested history has been cleared [2578313/2577886]) [2579312]
    <source>
-      type tail
+      @type tail
      format multiline
      multiline_flush_interval 5s
      format_firstline /^\w\d{4}/
@ -200,7 +209,7 @@ data:
    # Example:
    # I1104 10:36:20.242766       5 rescheduler.go:73] Running Rescheduler
    <source>
-      type tail
+      @type tail
      format multiline
      multiline_flush_interval 5s
      format_firstline /^\w\d{4}/
@ -214,7 +223,7 @@ data:
    # Example:
    # I0603 15:31:05.793605       6 cluster_manager.go:230] Reading config from path /etc/gce.conf
    <source>
-      type tail
+      @type tail
      format multiline
      multiline_flush_interval 5s
      format_firstline /^\w\d{4}/
@ -228,7 +237,7 @@ data:
    # Example:
    # I0603 15:31:05.793605       6 cluster_manager.go:230] Reading config from path /etc/gce.conf
    <source>
-      type tail
+      @type tail
      format multiline
      multiline_flush_interval 5s
      format_firstline /^\w\d{4}/
@ -243,7 +252,7 @@ data:
    # TODO(random-liu): Keep this for compatibility, remove this after
    # cri container runtime rolls out.
    <source>
-      type systemd
+      @type systemd
      filters [{ "_SYSTEMD_UNIT": "docker.service" }]
      pos_file /var/log/gcp-journald-docker.pos
      read_from_head true
@ -251,7 +260,7 @@ data:
    </source>

    <source>
-      type systemd
+      @type systemd
      filters [{ "_SYSTEMD_UNIT": "{{ container_runtime }}.service" }]
      pos_file /var/log/gcp-journald-container-runtime.pos
      read_from_head true
@ -259,7 +268,7 @@ data:
    </source>

    <source>
-      type systemd
+      @type systemd
      filters [{ "_SYSTEMD_UNIT": "kubelet.service" }]
      pos_file /var/log/gcp-journald-kubelet.pos
      read_from_head true
@ -267,23 +276,13 @@ data:
    </source>

    <source>
-      type systemd
+      @type systemd
      filters [{ "_SYSTEMD_UNIT": "node-problem-detector.service" }]
      pos_file /var/log/gcp-journald-node-problem-detector.pos
      read_from_head true
      tag node-problem-detector
    </source>
  monitoring.conf: |-
-    # Prometheus monitoring
-    <source>
-      @type prometheus
-      port 31337
-    </source>
-
-    <source>
-      @type prometheus_monitor
-    </source>
-
    # This source is used to acquire approximate process start timestamp,
    # whose purpose is explained before the corresponding output plugin.
    <source>
@ -356,6 +355,8 @@ data:
      # Collect metrics in Prometheus registry about plugin activity.
      enable_monitoring true
      monitoring_type prometheus
+      # Allow log entries from multiple containers to be sent in the same request.
+      split_logs_by_tag false
      # Set the buffer type to file to improve the reliability and reduce the memory consumption
      buffer_type file
      buffer_path /var/log/fluentd-buffers/kubernetes.containers.buffer
@ -376,6 +377,11 @@ data:
      disable_retry_limit
      # Use multiple threads for processing.
      num_threads 2
+      labels {
+        # The logging backend will take responsibility for double writing to
+        # the necessary resource types when this label is set.
+        "logging.googleapis.com/k8s_compatibility": "true"
+      }
    </match>

    # Keep a smaller buffer here since these logs are less important than the user's
@ -386,6 +392,8 @@ data:
      detect_json true
      enable_monitoring true
      monitoring_type prometheus
+      # Allow entries from multiple system logs to be sent in the same request.
+      split_logs_by_tag false
      detect_subservice false
      buffer_type file
      buffer_path /var/log/fluentd-buffers/kubernetes.system.buffer
@ -396,6 +404,11 @@ data:
      max_retry_wait 30
      disable_retry_limit
      num_threads 2
+      labels {
+        # The logging backend will take responsibility for double writing to
+        # the necessary resource types when this label is set.
+        "logging.googleapis.com/k8s_compatibility": "true"
+      }
    </match>
 metadata:
  name: fluentd-gcp-config-v1.2.4
--- a/cluster/addons/fluentd-gcp/fluentd-gcp-ds.yaml
+++ b/cluster/addons/fluentd-gcp/fluentd-gcp-ds.yaml
@ -1,13 +1,13 @@
 apiVersion: extensions/v1beta1
 kind: DaemonSet
 metadata:
-  name: fluentd-gcp-v2.0.16
+  name: fluentd-gcp-v3.0.0
  namespace: kube-system
  labels:
    k8s-app: fluentd-gcp
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: Reconcile
-    version: v2.0.16
+    version: v3.0.0
 spec:
  updateStrategy:
    type: RollingUpdate
@ -16,7 +16,7 @@ spec:
      labels:
        k8s-app: fluentd-gcp
        kubernetes.io/cluster-service: "true"
-        version: v2.0.16
+        version: v3.0.0
      # This annotation ensures that fluentd does not get evicted if the node
      # supports critical pod annotation based priority scheme.
      # Note that this does not guarantee admission on the nodes (#40573).
@ -28,10 +28,7 @@ spec:
      dnsPolicy: Default
      containers:
      - name: fluentd-gcp
-        image: k8s.gcr.io/fluentd-gcp:2.0.16
-        env:
-        - name: FLUENTD_ARGS
-          value: --no-supervisor -q
+        image: gcr.io/stackdriver-agents/stackdriver-logging-agent:{{ fluentd_gcp_version }}
        volumeMounts:
        - name: varlog
          mountPath: /var/log
@ -42,7 +39,7 @@ spec:
          mountPath: /host/lib
          readOnly: true
        - name: config-volume
-          mountPath: /etc/fluent/config.d
+          mountPath: /etc/google-fluentd/config.d
        # Liveness probe is aimed to help in situations where fluentd
        # silently hangs for no apparent reasons until manual restart.
        # The idea of this probe is that if fluentd is not queueing or
@ -82,7 +79,7 @@ spec:
          - /monitor
          - --stackdriver-prefix={{ prometheus_to_sd_prefix }}/addons
          - --api-override={{ prometheus_to_sd_endpoint }}
-          - --source=fluentd:http://localhost:31337?whitelisted=stackdriver_successful_requests_count,stackdriver_failed_requests_count,stackdriver_ingested_entries_count,stackdriver_dropped_entries_count
+          - --source=fluentd:http://localhost:24231?whitelisted=stackdriver_successful_requests_count,stackdriver_failed_requests_count,stackdriver_ingested_entries_count,stackdriver_dropped_entries_count
          - --pod-id=$(POD_NAME)
          - --namespace-id=$(POD_NAMESPACE)
        env:
--- a/cluster/addons/fluentd-gcp/scaler-deployment.yaml
+++ b/cluster/addons/fluentd-gcp/scaler-deployment.yaml
@ -22,7 +22,7 @@ spec:
        image: gcr.io/google-containers/fluentd-gcp-scaler:0.1
        command:
          - /scaler.sh
-          - --ds-name=fluentd-gcp-v2.0.16
+          - --ds-name=fluentd-gcp-v3.0.0
          - --scaling-policy=fluentd-gcp-scaling-policy
        env:
        # Defaults, used if no overrides are found in fluentd-gcp-scaling-policy
--- a/cluster/gce/config-default.sh
+++ b/cluster/gce/config-default.sh
@ -361,6 +361,7 @@ if [[ -n "${LOGROTATE_MAX_SIZE:-}" ]]; then
 fi

 # Fluentd requirements
+FLUENTD_GCP_VERSION="${FLUENTD_GCP_VERSION:-0.2-1.5.28-1}"
 FLUENTD_GCP_MEMORY_LIMIT="${FLUENTD_GCP_MEMORY_LIMIT:-}"
 FLUENTD_GCP_CPU_REQUEST="${FLUENTD_GCP_CPU_REQUEST:-}"
 FLUENTD_GCP_MEMORY_REQUEST="${FLUENTD_GCP_MEMORY_REQUEST:-}"
@ -372,7 +373,7 @@ HEAPSTER_GCP_BASE_CPU="${HEAPSTER_GCP_BASE_CPU:-80m}"
 HEAPSTER_GCP_CPU_PER_NODE="${HEAPSTER_GCP_CPU_PER_NODE:-0.5}"

 # Adding to PROVIDER_VARS, since this is GCP-specific.
-PROVIDER_VARS="${PROVIDER_VARS:-} FLUENTD_GCP_MEMORY_LIMIT FLUENTD_GCP_CPU_REQUEST FLUENTD_GCP_MEMORY_REQUEST HEAPSTER_GCP_BASE_MEMORY HEAPSTER_GCP_MEMORY_PER_NODE HEAPSTER_GCP_BASE_CPU HEAPSTER_GCP_CPU_PER_NODE"
+PROVIDER_VARS="${PROVIDER_VARS:-} FLUENTD_GCP_VERSION FLUENTD_GCP_MEMORY_LIMIT FLUENTD_GCP_CPU_REQUEST FLUENTD_GCP_MEMORY_REQUEST HEAPSTER_GCP_BASE_MEMORY HEAPSTER_GCP_MEMORY_PER_NODE HEAPSTER_GCP_BASE_CPU HEAPSTER_GCP_CPU_PER_NODE"

 # prometheus-to-sd configuration
 PROMETHEUS_TO_SD_ENDPOINT="${PROMETHEUS_TO_SD_ENDPOINT:-https://monitoring.googleapis.com/}"
--- a/cluster/gce/config-test.sh
+++ b/cluster/gce/config-test.sh
@ -401,6 +401,7 @@ if [[ -n "${LOGROTATE_MAX_SIZE:-}" ]]; then
 fi

 # Fluentd requirements
+FLUENTD_GCP_VERSION="${FLUENTD_GCP_VERSION:-0.2-1.5.28-1}"
 FLUENTD_GCP_MEMORY_LIMIT="${FLUENTD_GCP_MEMORY_LIMIT:-}"
 FLUENTD_GCP_CPU_REQUEST="${FLUENTD_GCP_CPU_REQUEST:-}"
 FLUENTD_GCP_MEMORY_REQUEST="${FLUENTD_GCP_MEMORY_REQUEST:-}"
@ -412,7 +413,7 @@ HEAPSTER_GCP_BASE_CPU="${HEAPSTER_GCP_BASE_CPU:-80m}"
 HEAPSTER_GCP_CPU_PER_NODE="${HEAPSTER_GCP_CPU_PER_NODE:-0.5}"

 # Adding to PROVIDER_VARS, since this is GCP-specific.
-PROVIDER_VARS="${PROVIDER_VARS:-} FLUENTD_GCP_MEMORY_LIMIT FLUENTD_GCP_CPU_REQUEST FLUENTD_GCP_MEMORY_REQUEST HEAPSTER_GCP_BASE_MEMORY HEAPSTER_GCP_MEMORY_PER_NODE HEAPSTER_GCP_BASE_CPU HEAPSTER_GCP_CPU_PER_NODE"
+PROVIDER_VARS="${PROVIDER_VARS:-} FLUENTD_GCP_VERSION FLUENTD_GCP_MEMORY_LIMIT FLUENTD_GCP_CPU_REQUEST FLUENTD_GCP_MEMORY_REQUEST HEAPSTER_GCP_BASE_MEMORY HEAPSTER_GCP_MEMORY_PER_NODE HEAPSTER_GCP_BASE_CPU HEAPSTER_GCP_CPU_PER_NODE"

 # prometheus-to-sd configuration
 PROMETHEUS_TO_SD_ENDPOINT="${PROMETHEUS_TO_SD_ENDPOINT:-https://monitoring.googleapis.com/}"
--- a/cluster/gce/gci/configure-helper.sh
+++ b/cluster/gce/gci/configure-helper.sh
@ -2141,6 +2141,8 @@ EOF
    local -r fluentd_gcp_yaml="${dst_dir}/fluentd-gcp/fluentd-gcp-ds.yaml"
    local -r fluentd_gcp_configmap_yaml="${dst_dir}/fluentd-gcp/fluentd-gcp-configmap.yaml"
    update-event-exporter ${event_exporter_yaml}
+    fluentd_gcp_version="${FLUENTD_GCP_VERSION:-0.2-1.5.28-1}"
+    sed -i -e "s@{{ fluentd_gcp_version }}@${fluentd_gcp_version}@g" "${fluentd_gcp_yaml}"
    update-prometheus-to-sd-parameters ${event_exporter_yaml}
    update-prometheus-to-sd-parameters ${fluentd_gcp_yaml}
    start-fluentd-resource-update ${fluentd_gcp_yaml}