Merge pull request #66485 from bmoyles0117/apply-latest-stackdriver-fixes

Automatic merge from submit-queue (batch tested with PRs 59030, 64666, 66251, 66485, 66813). If you want to cherry-pick this change to another branch, please follow the instructions <a href="https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md">here</a>.

A large set of improvements to the Stackdriver components.

**What this PR does / why we need it**:
This PR delivers a large set of improvements for both the Stackdriver Logging agent and the Stackdriver Metadata agent.

**Release note**:
```release-note
Metadata Agent Improvements
Bump metadata agent version to 0.2-0.0.21-1.
Expand the metadata agent's access to all API groups.
Remove metadata agent config maps in favor of command line flags.
Update the metadata agent's liveness probe to a new /healthz handler.
Logging Agent Improvements
Bump logging agent version to 0.2-1.5.33-1-k8s-1.
Appropriately set log severity for k8s_container.
Fix detect exceptions plugin to analyze message field instead of log field.
Fix detect exceptions plugin to analyze streams based on local resource id.
Disable the metadata agent for monitored resource construction in logging.
Disable timestamp adjustment in logs to optimize performance.
Reduce logging agent buffer chunk limit to 512k to optimize performance.
```
2018-08-06 12:21:08 -07:00 · 2018-08-06 12:21:08 -07:00 · 51faf6ebdb
parent c06e76ce7f 32c2bfadfd
commit 51faf6ebdb
7 changed files with 47 additions and 73 deletions
--- a/cluster/addons/fluentd-gcp/fluentd-gcp-configmap.yaml
+++ b/cluster/addons/fluentd-gcp/fluentd-gcp-configmap.yaml
@ -98,6 +98,8 @@ data:
        # instead of jsonPayload after extracting 'time', 'severity' and
        # 'stream' from the record.
        message ${record['log']}
+        # If 'severity' is not set, assume stderr is ERROR and stdout is INFO.
+        severity ${record['severity'] || if record['stream'] == 'stderr' then 'ERROR' else 'INFO' end}
      </record>
      tag ${if record['stream'] == 'stderr' then 'raw.stderr' else 'raw.stdout' end}
      remove_keys stream,log
@ -109,7 +111,7 @@ data:

      remove_tag_prefix raw
      message message
-      stream stream
+      stream "logging.googleapis.com/local_resource_id"
      multiline_flush_interval 5
      max_bytes 500000
      max_lines 1000
@ -408,9 +410,9 @@ data:
      buffer_queue_full_action block
      # Set the chunk limit conservatively to avoid exceeding the recommended
      # chunk size of 5MB per write request.
-      buffer_chunk_limit 1M
+      buffer_chunk_limit 512k
      # Cap the combined memory usage of this buffer and the one below to
-      # 1MiB/chunk * (6 + 2) chunks = 8 MiB
+      # 512KiB/chunk * (6 + 2) chunks = 4 MiB
      buffer_queue_limit 6
      # Never wait more than 5 seconds before flushing logs in the non-error case.
      flush_interval 5s
@ -421,8 +423,9 @@ data:
      # Use multiple threads for processing.
      num_threads 2
      use_grpc true
-      # Use Metadata Agent to get monitored resource.
-      enable_metadata_agent true
+      # Skip timestamp adjustment as this is in a controlled environment with
+      # known timestamp format. This helps with CPU usage.
+      adjust_invalid_timestamps false
    </match>

    # Attach local_resource_id for 'k8s_node' monitored resource.
@ -450,15 +453,16 @@ data:
      buffer_type file
      buffer_path /var/log/fluentd-buffers/kubernetes.system.buffer
      buffer_queue_full_action block
-      buffer_chunk_limit 1M
+      buffer_chunk_limit 512k
      buffer_queue_limit 2
      flush_interval 5s
      max_retry_wait 30
      disable_retry_limit
      num_threads 2
      use_grpc true
-      # Use Metadata Agent to get monitored resource.
-      enable_metadata_agent true
+      # Skip timestamp adjustment as this is in a controlled environment with
+      # known timestamp format. This helps with CPU usage.
+      adjust_invalid_timestamps false
    </match>
 metadata:
  name: fluentd-gcp-config-v1.2.5
--- a/cluster/addons/fluentd-gcp/fluentd-gcp-ds.yaml
+++ b/cluster/addons/fluentd-gcp/fluentd-gcp-ds.yaml
@ -1,13 +1,13 @@
 apiVersion: extensions/v1beta1
 kind: DaemonSet
 metadata:
-  name: fluentd-gcp-v3.0.0
+  name: fluentd-gcp-{{ fluentd_gcp_yaml_version }}
  namespace: kube-system
  labels:
    k8s-app: fluentd-gcp
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: Reconcile
-    version: v3.0.0
+    version: {{ fluentd_gcp_yaml_version }}
 spec:
  updateStrategy:
    type: RollingUpdate
@ -16,7 +16,7 @@ spec:
      labels:
        k8s-app: fluentd-gcp
        kubernetes.io/cluster-service: "true"
-        version: v3.0.0
+        version: {{ fluentd_gcp_yaml_version }}
      # This annotation ensures that fluentd does not get evicted if the node
      # supports critical pod annotation based priority scheme.
      # Note that this does not guarantee admission on the nodes (#40573).
--- a/cluster/addons/metadata-agent/stackdriver/metadata-agent-rbac.yaml
+++ b/cluster/addons/metadata-agent/stackdriver/metadata-agent-rbac.yaml
@ -7,9 +7,7 @@ metadata:
    addonmanager.kubernetes.io/mode: Reconcile
 rules:
 - apiGroups:
-  - ""
-  - "apps"
-  - "extensions"
+  - "*"
  resources:
  - "*"
  verbs:
--- a/cluster/addons/metadata-agent/stackdriver/metadata-agent.yaml
+++ b/cluster/addons/metadata-agent/stackdriver/metadata-agent.yaml
@ -7,22 +7,6 @@ metadata:
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: Reconcile
 ---
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: metadata-agent-config
-  namespace: kube-system
-  labels:
-    kubernetes.io/cluster-service: "true"
-    addonmanager.kubernetes.io/mode: Reconcile
-data:
-  node_level.conf: |-
-    KubernetesUseWatch: true
-    KubernetesClusterLevelMetadata: false
-  cluster_level.conf: |-
-    KubernetesUseWatch: true
-    KubernetesClusterLevelMetadata: true
----
 kind: DaemonSet
 apiVersion: extensions/v1beta1
 metadata:
@ -45,27 +29,22 @@ spec:
    spec:
      serviceAccountName: metadata-agent
      containers:
-      - image: gcr.io/stackdriver-agents/stackdriver-metadata-agent:0.2-0.0.19-1
+      - image: gcr.io/stackdriver-agents/stackdriver-metadata-agent:0.2-0.0.21-1
        imagePullPolicy: IfNotPresent
        name: metadata-agent
        livenessProbe:
-          exec:
-            command:
-              - /bin/bash
-              - -c
-              - |
-                if [[ -f /var/run/metadata-agent/health/unhealthy ]]; then
-                  exit 1;
-                fi
-          periodSeconds: 10
+          httpGet:
+            path: /healthz
+            port: 8000
+          initialDelaySeconds: 30
+          periodSeconds: 60
+          timeoutSeconds: 5
          failureThreshold: 1
          successThreshold: 1
-        volumeMounts:
-          - name: metadata-agent-config-volume
-            mountPath: /etc/config
-        command:
-          - /opt/stackdriver/metadata/sbin/metadatad
-          - --config-file=/etc/config/node_level.conf
+        args:
+        - -o KubernetesUseWatch=true 
+        - -o KubernetesClusterLevelMetadata=false 
+        - -o MetadataReporterPurgeDeleted=true
        ports:
        - containerPort: 8000
          hostPort: 8799
@ -78,10 +57,6 @@ spec:
      restartPolicy: Always
      schedulerName: default-scheduler
      terminationGracePeriodSeconds: 30
-      volumes:
-        - name: metadata-agent-config-volume
-          configMap:
-            name: metadata-agent-config
  updateStrategy:
    rollingUpdate:
      maxUnavailable: 1
@ -110,27 +85,22 @@ spec:
    spec:
      serviceAccountName: metadata-agent
      containers:
-      - image: gcr.io/stackdriver-agents/stackdriver-metadata-agent:0.2-0.0.19-1
+      - image: gcr.io/stackdriver-agents/stackdriver-metadata-agent:0.2-0.0.21-1
        imagePullPolicy: IfNotPresent
        name: metadata-agent
        livenessProbe:
-          exec:
-            command:
-              - /bin/bash
-              - -c
-              - |
-                if [[ -f /var/run/metadata-agent/health/unhealthy ]]; then
-                  exit 1;
-                fi
-          periodSeconds: 10
+          httpGet:
+            path: /healthz
+            port: 8000
+          initialDelaySeconds: 30
+          periodSeconds: 60
+          timeoutSeconds: 5
          failureThreshold: 1
          successThreshold: 1
-        volumeMounts:
-          - name: metadata-agent-config-volume
-            mountPath: /etc/config
-        command:
-          - /opt/stackdriver/metadata/sbin/metadatad
-          - --config-file=/etc/config/cluster_level.conf
+        args:
+        - -o KubernetesUseWatch=true 
+        - -o KubernetesClusterLevelMetadata=true 
+        - -o MetadataReporterPurgeDeleted=true
        ports:
        - containerPort: 8000
          protocol: TCP
@ -142,10 +112,6 @@ spec:
      restartPolicy: Always
      schedulerName: default-scheduler
      terminationGracePeriodSeconds: 30
-      volumes:
-        - name: metadata-agent-config-volume
-          configMap:
-            name: metadata-agent-config
  strategy:
    rollingUpdate:
      maxUnavailable: 1
--- a/cluster/gce/config-default.sh
+++ b/cluster/gce/config-default.sh
@ -404,6 +404,8 @@ if [[ -n "${LOGROTATE_MAX_SIZE:-}" ]]; then
 fi

 # Fluentd requirements
+# Version of the fluentd-gcp DaemonSet YAML; it exists to trigger a
+# configuration refresh when changes are made to that YAML.
+FLUENTD_GCP_YAML_VERSION="v3.1.0"
 FLUENTD_GCP_VERSION="${FLUENTD_GCP_VERSION:-0.2-1.5.30-1-k8s}"
 FLUENTD_GCP_MEMORY_LIMIT="${FLUENTD_GCP_MEMORY_LIMIT:-}"
 FLUENTD_GCP_CPU_REQUEST="${FLUENTD_GCP_CPU_REQUEST:-}"
@ -422,7 +424,7 @@ CUSTOM_KUBE_DASHBOARD_BANNER="${CUSTOM_KUBE_DASHBOARD_BANNER:-}"
 LOGGING_STACKDRIVER_RESOURCE_TYPES="${LOGGING_STACKDRIVER_RESOURCE_TYPES:-old}"

 # Adding to PROVIDER_VARS, since this is GCP-specific.
-PROVIDER_VARS="${PROVIDER_VARS:-} FLUENTD_GCP_VERSION FLUENTD_GCP_MEMORY_LIMIT FLUENTD_GCP_CPU_REQUEST FLUENTD_GCP_MEMORY_REQUEST HEAPSTER_GCP_BASE_MEMORY HEAPSTER_GCP_MEMORY_PER_NODE HEAPSTER_GCP_BASE_CPU HEAPSTER_GCP_CPU_PER_NODE CUSTOM_KUBE_DASHBOARD_BANNER LOGGING_STACKDRIVER_RESOURCE_TYPES"
+PROVIDER_VARS="${PROVIDER_VARS:-} FLUENTD_GCP_YAML_VERSION FLUENTD_GCP_VERSION FLUENTD_GCP_MEMORY_LIMIT FLUENTD_GCP_CPU_REQUEST FLUENTD_GCP_MEMORY_REQUEST HEAPSTER_GCP_BASE_MEMORY HEAPSTER_GCP_MEMORY_PER_NODE HEAPSTER_GCP_BASE_CPU HEAPSTER_GCP_CPU_PER_NODE CUSTOM_KUBE_DASHBOARD_BANNER LOGGING_STACKDRIVER_RESOURCE_TYPES"

 # Fluentd configuration for node-journal
 ENABLE_NODE_JOURNAL="${ENABLE_NODE_JOURNAL:-false}"
--- a/cluster/gce/config-test.sh
+++ b/cluster/gce/config-test.sh
@ -420,6 +420,8 @@ if [[ -n "${LOGROTATE_MAX_SIZE:-}" ]]; then
 fi

 # Fluentd requirements
+# Version of the fluentd-gcp DaemonSet YAML; it exists to trigger a
+# configuration refresh when changes are made to that YAML.
+FLUENTD_GCP_YAML_VERSION="v3.1.0"
 FLUENTD_GCP_VERSION="${FLUENTD_GCP_VERSION:-0.2-1.5.30-1-k8s}"
 FLUENTD_GCP_MEMORY_LIMIT="${FLUENTD_GCP_MEMORY_LIMIT:-}"
 FLUENTD_GCP_CPU_REQUEST="${FLUENTD_GCP_CPU_REQUEST:-}"
@ -438,7 +440,7 @@ CUSTOM_KUBE_DASHBOARD_BANNER="${CUSTOM_KUBE_DASHBOARD_BANNER:-}"
 LOGGING_STACKDRIVER_RESOURCE_TYPES="${LOGGING_STACKDRIVER_RESOURCE_TYPES:-old}"

 # Adding to PROVIDER_VARS, since this is GCP-specific.
-PROVIDER_VARS="${PROVIDER_VARS:-} FLUENTD_GCP_VERSION FLUENTD_GCP_MEMORY_LIMIT FLUENTD_GCP_CPU_REQUEST FLUENTD_GCP_MEMORY_REQUEST HEAPSTER_GCP_BASE_MEMORY HEAPSTER_GCP_MEMORY_PER_NODE HEAPSTER_GCP_BASE_CPU HEAPSTER_GCP_CPU_PER_NODE CUSTOM_KUBE_DASHBOARD_BANNER LOGGING_STACKDRIVER_RESOURCE_TYPES"
+PROVIDER_VARS="${PROVIDER_VARS:-} FLUENTD_GCP_YAML_VERSION FLUENTD_GCP_VERSION FLUENTD_GCP_MEMORY_LIMIT FLUENTD_GCP_CPU_REQUEST FLUENTD_GCP_MEMORY_REQUEST HEAPSTER_GCP_BASE_MEMORY HEAPSTER_GCP_MEMORY_PER_NODE HEAPSTER_GCP_BASE_CPU HEAPSTER_GCP_CPU_PER_NODE CUSTOM_KUBE_DASHBOARD_BANNER LOGGING_STACKDRIVER_RESOURCE_TYPES"

 # Fluentd configuration for node-journal
 ENABLE_NODE_JOURNAL="${ENABLE_NODE_JOURNAL:-false}"
--- a/cluster/gce/gci/configure-helper.sh
+++ b/cluster/gce/gci/configure-helper.sh
@ -2237,7 +2237,9 @@ function setup-fluentd {
    fluentd_gcp_configmap_name="fluentd-gcp-config-old"
  fi
  sed -i -e "s@{{ fluentd_gcp_configmap_name }}@${fluentd_gcp_configmap_name}@g" "${fluentd_gcp_yaml}"
-  fluentd_gcp_version="${FLUENTD_GCP_VERSION:-0.2-1.5.30-1-k8s}"
+  fluentd_gcp_yaml_version="${FLUENTD_GCP_YAML_VERSION:-v3.1.0}"
+  sed -i -e "s@{{ fluentd_gcp_yaml_version }}@${fluentd_gcp_yaml_version}@g" "${fluentd_gcp_yaml}"
+  fluentd_gcp_version="${FLUENTD_GCP_VERSION:-0.3-1.5.34-1-k8s-1}"
  sed -i -e "s@{{ fluentd_gcp_version }}@${fluentd_gcp_version}@g" "${fluentd_gcp_yaml}"
  update-prometheus-to-sd-parameters ${fluentd_gcp_yaml}
  start-fluentd-resource-update ${fluentd_gcp_yaml}