Merge pull request #48812 from crassirostris/change-fluentd-monitoring

Automatic merge from submit-queue (batch tested with PRs 48812, 48276)

Change fluentd-gcp monitoring to use metrics exposed by SD plugin

Following https://github.com/GoogleCloudPlatform/fluent-plugin-google-cloud/pull/135, make fluentd-gcp expose metrics in the Prometheus registry and use them instead of counting records in the pipeline.

/cc @piosz @igorpeshansky

```release-note
Fluentd-gcp DaemonSet exposes a different set of metrics.
```
2017-07-14 04:43:42 -07:00 · 2017-07-14 04:43:42 -07:00 · 455e44b616
parent 9099e8e95f 125f759907
commit 455e44b616
2 changed files with 47 additions and 87 deletions
--- a/cluster/addons/fluentd-gcp/fluentd-gcp-configmap.yaml
+++ b/cluster/addons/fluentd-gcp/fluentd-gcp-configmap.yaml
@@ -70,27 +70,14 @@ data:

    # Detect exceptions in the log output and forward them as one log entry.
    <match raw.kubernetes.**>
-      @type copy
+      @type detect_exceptions

-      <store>
-        @type prometheus
-
-        <metric>
-          type counter
-          name logging_line_count
-          desc Total number of lines generated by application containers
-        </metric>
-      </store>
-      <store>
-        @type detect_exceptions
-
-        remove_tag_prefix raw
-        message log
-        stream stream
-        multiline_flush_interval 5
-        max_bytes 500000
-        max_lines 1000
-      </store>
+      remove_tag_prefix raw
+      message log
+      stream stream
+      multiline_flush_interval 5
+      max_bytes 500000
+      max_lines 1000
    </match>
  system.input.conf: |-
    # Example:
@@ -342,77 +329,50 @@ data:
    # compute.googleapis.com service rather than container.googleapis.com to keep
    # them separate since most users don't care about the node logs.
    <match kubernetes.**>
-      @type copy
+      @type google_cloud

-      <store>
-        @type google_cloud
-
-        # Set the buffer type to file to improve the reliability and reduce the memory consumption
-        buffer_type file
-        buffer_path /var/log/fluentd-buffers/kubernetes.containers.buffer
-        # Set queue_full action to block because we want to pause gracefully
-        # in case of the off-the-limits load instead of throwing an exception
-        buffer_queue_full_action block
-        # Set the chunk limit conservatively to avoid exceeding the GCL limit
-        # of 10MiB per write request.
-        buffer_chunk_limit 2M
-        # Cap the combined memory usage of this buffer and the one below to
-        # 2MiB/chunk * (6 + 2) chunks = 16 MiB
-        buffer_queue_limit 6
-        # Never wait more than 5 seconds before flushing logs in the non-error case.
-        flush_interval 5s
-        # Never wait longer than 30 seconds between retries.
-        max_retry_wait 30
-        # Disable the limit on the number of retries (retry forever).
-        disable_retry_limit
-        # Use multiple threads for processing.
-        num_threads 2
-      </store>
-      <store>
-        @type prometheus
-
-        <metric>
-          type counter
-          name logging_entry_count
-          desc Total number of log entries generated by either application containers or system components
-          <labels>
-            component container
-          </labels>
-        </metric>
-      </store>
+      # Collect metrics in Prometheus registry about plugin activity.
+      enable_monitoring true
+      monitoring_type prometheus
+      # Set the buffer type to file to improve the reliability and reduce the memory consumption
+      buffer_type file
+      buffer_path /var/log/fluentd-buffers/kubernetes.containers.buffer
+      # Set queue_full action to block because we want to pause gracefully
+      # in case of the off-the-limits load instead of throwing an exception
+      buffer_queue_full_action block
+      # Set the chunk limit conservatively to avoid exceeding the GCL limit
+      # of 10MiB per write request.
+      buffer_chunk_limit 2M
+      # Cap the combined memory usage of this buffer and the one below to
+      # 2MiB/chunk * (6 + 2) chunks = 16 MiB
+      buffer_queue_limit 6
+      # Never wait more than 5 seconds before flushing logs in the non-error case.
+      flush_interval 5s
+      # Never wait longer than 30 seconds between retries.
+      max_retry_wait 30
+      # Disable the limit on the number of retries (retry forever).
+      disable_retry_limit
+      # Use multiple threads for processing.
+      num_threads 2
    </match>

    # Keep a smaller buffer here since these logs are less important than the user's
    # container logs.
    <match **>
-      @type copy
+      @type google_cloud

-      <store>
-        @type google_cloud
-
-        detect_subservice false
-        buffer_type file
-        buffer_path /var/log/fluentd-buffers/kubernetes.system.buffer
-        buffer_queue_full_action block
-        buffer_chunk_limit 2M
-        buffer_queue_limit 2
-        flush_interval 5s
-        max_retry_wait 30
-        disable_retry_limit
-        num_threads 2
-      </store>
-      <store>
-        @type prometheus
-
-        <metric>
-          type counter
-          name logging_entry_count
-          desc Total number of log entries generated by either application containers or system components
-          <labels>
-            component system
-          </labels>
-        </metric>
-      </store>
+      enable_monitoring true
+      monitoring_type prometheus
+      detect_subservice false
+      buffer_type file
+      buffer_path /var/log/fluentd-buffers/kubernetes.system.buffer
+      buffer_queue_full_action block
+      buffer_chunk_limit 2M
+      buffer_queue_limit 2
+      flush_interval 5s
+      max_retry_wait 30
+      disable_retry_limit
+      num_threads 2
    </match>
 metadata:
  name: fluentd-gcp-config-v1.1
--- a/cluster/addons/fluentd-gcp/fluentd-gcp-ds.yaml
+++ b/cluster/addons/fluentd-gcp/fluentd-gcp-ds.yaml
@@ -27,7 +27,7 @@ spec:
      hostNetwork: true
      containers:
      - name: fluentd-gcp
-        image: gcr.io/google-containers/fluentd-gcp:2.0.7
+        image: gcr.io/google-containers/fluentd-gcp:2.0.8
        # If fluentd consumes its own logs, the following situation may happen:
        # fluentd fails to send a chunk to the server => writes it to the log =>
        # tries to send this message to the server => fails to send a chunk and so on.
@@ -90,13 +90,13 @@ spec:
                exit 1;
              fi;
      - name: prometheus-to-sd-exporter
-        image: gcr.io/google-containers/prometheus-to-sd:v0.1.0
+        image: gcr.io/google-containers/prometheus-to-sd:v0.1.3
        command:
          - /monitor
          - --component=fluentd
          - --target-port=31337
          - --stackdriver-prefix=container.googleapis.com/internal/addons
-          - --whitelisted-metrics=logging_line_count,logging_entry_count
+          - --whitelisted-metrics=stackdriver_successful_requests_count,stackdriver_failed_requests_count,stackdriver_ingested_entries_count,stackdriver_dropped_entries_count
        volumeMounts:
        - name: ssl-certs
          mountPath: /etc/ssl/certs