Merge pull request #20185 from a-robinson/fluent

Fluentd improvements to lessen likelihood of buffers filling up and hanging
pull/6/head
Alex Robinson 2016-01-26 17:02:08 -08:00
commit f99cc645bb
7 changed files with 28 additions and 18 deletions

View File

@@ -1,7 +1,7 @@
.PHONY: build push
IMAGE = fluentd-elasticsearch
TAG = 1.12
TAG = 1.13
build:
docker build -t gcr.io/google_containers/$(IMAGE):$(TAG) .

View File

@@ -100,6 +100,11 @@
# problem yet to be solved as secrets are not usable in static pods which the fluentd
# pod must be until a per-node controller is available in Kubernetes.
# Do not directly collect fluentd's own logs to avoid infinite loops.
<match fluent.**>
type null
</match>
<source>
type tail
path /var/log/containers/*.log
@@ -186,12 +191,12 @@
port 9200
logstash_format true
# Set the chunk limit the same as for fluentd-gcp.
buffer_chunk_limit 512K
# Cap buffer memory usage to 512KB/chunk * 128 chunks = 65 MB
buffer_queue_limit 128
buffer_chunk_limit 2M
# Cap buffer memory usage to 2MiB/chunk * 32 chunks = 64 MiB
buffer_queue_limit 32
flush_interval 5s
# Never wait longer than 30 seconds between retries.
max_retry_wait 300
max_retry_wait 30
# Disable the limit on the number of retries (retry forever).
disable_retry_limit
</match>

View File

@@ -14,7 +14,7 @@
.PHONY: kbuild kpush
TAG = 1.14
TAG = 1.15
# Rules for building the test image for deployment to Dockerhub with user kubernetes.

View File

@@ -42,6 +42,11 @@
# the name of the Kubernetes container regardless of how many times the
# Kubernetes pod has been restarted (resulting in several Docker container IDs).
# Do not directly collect fluentd's own logs to avoid infinite loops.
<match fluent.**>
type null
</match>
<source>
type tail
format json
@@ -130,15 +135,15 @@
<match kubernetes.**>
type google_cloud
# Set the chunk limit conservatively to avoid exceeding the GCL limit
# of 2MB per write request.
buffer_chunk_limit 512K
# of 10MiB per write request.
buffer_chunk_limit 2M
# Cap the combined memory usage of this buffer and the one below to
# 512KB/chunk * (96 + 32) chunks = 65 MB
buffer_queue_limit 96
# 2MiB/chunk * (24 + 8) chunks = 64 MiB
buffer_queue_limit 24
# Never wait more than 5 seconds before flushing logs in the non-error case.
flush_interval 5s
# Never wait longer than 5 minutes between retries.
max_retry_wait 300
# Never wait longer than 30 seconds between retries.
max_retry_wait 30
# Disable the limit on the number of retries (retry forever).
disable_retry_limit
</match>
@@ -148,9 +153,9 @@
<match **>
type google_cloud
detect_subservice false
buffer_chunk_limit 512K
buffer_queue_limit 32
buffer_chunk_limit 2M
buffer_queue_limit 8
flush_interval 5s
max_retry_wait 300
max_retry_wait 30
disable_retry_limit
</match>

View File

@@ -8,7 +8,7 @@ metadata:
spec:
containers:
- name: fluentd-elasticsearch
image: gcr.io/google_containers/fluentd-elasticsearch:1.12
image: gcr.io/google_containers/fluentd-elasticsearch:1.13
resources:
limits:
cpu: 100m

View File

@@ -8,7 +8,7 @@ metadata:
spec:
containers:
- name: fluentd-cloud-logging
image: gcr.io/google_containers/fluentd-gcp:1.14
image: gcr.io/google_containers/fluentd-gcp:1.15
resources:
limits:
cpu: 100m

View File

@@ -172,7 +172,7 @@ metadata:
spec:
containers:
- name: fluentd-cloud-logging
image: gcr.io/google_containers/fluentd-gcp:1.14
image: gcr.io/google_containers/fluentd-gcp:1.15
resources:
limits:
cpu: 100m