Merge pull request #48812 from crassirostris/change-fluentd-monitoring

Automatic merge from submit-queue (batch tested with PRs 48812, 48276)

Change fluentd-gcp monitoring to use metrics exposed by SD plugin

Following https://github.com/GoogleCloudPlatform/fluent-plugin-google-cloud/pull/135, make fluentd-gcp expose metrics in the Prometheus registry and use them instead of counting records in the pipeline.

/cc @piosz @igorpeshansky

```release-note
Fluentd-gcp DaemonSet exposes a different set of metrics.
```
pull/6/head
Kubernetes Submit Queue 2017-07-14 04:43:42 -07:00 committed by GitHub
commit 455e44b616
2 changed files with 47 additions and 87 deletions

View File

@ -70,27 +70,14 @@ data:
# Detect exceptions in the log output and forward them as one log entry.
<match raw.kubernetes.**>
@type copy
@type detect_exceptions
<store>
@type prometheus
<metric>
type counter
name logging_line_count
desc Total number of lines generated by application containers
</metric>
</store>
<store>
@type detect_exceptions
remove_tag_prefix raw
message log
stream stream
multiline_flush_interval 5
max_bytes 500000
max_lines 1000
</store>
remove_tag_prefix raw
message log
stream stream
multiline_flush_interval 5
max_bytes 500000
max_lines 1000
</match>
system.input.conf: |-
# Example:
@ -342,77 +329,50 @@ data:
# compute.googleapis.com service rather than container.googleapis.com to keep
# them separate since most users don't care about the node logs.
<match kubernetes.**>
@type copy
@type google_cloud
<store>
@type google_cloud
# Set the buffer type to file to improve the reliability and reduce the memory consumption
buffer_type file
buffer_path /var/log/fluentd-buffers/kubernetes.containers.buffer
# Set queue_full action to block because we want to pause gracefully
# in case of the off-the-limits load instead of throwing an exception
buffer_queue_full_action block
# Set the chunk limit conservatively to avoid exceeding the GCL limit
# of 10MiB per write request.
buffer_chunk_limit 2M
# Cap the combined memory usage of this buffer and the one below to
# 2MiB/chunk * (6 + 2) chunks = 16 MiB
buffer_queue_limit 6
# Never wait more than 5 seconds before flushing logs in the non-error case.
flush_interval 5s
# Never wait longer than 30 seconds between retries.
max_retry_wait 30
# Disable the limit on the number of retries (retry forever).
disable_retry_limit
# Use multiple threads for processing.
num_threads 2
</store>
<store>
@type prometheus
<metric>
type counter
name logging_entry_count
desc Total number of log entries generated by either application containers or system components
<labels>
component container
</labels>
</metric>
</store>
# Collect metrics in Prometheus registry about plugin activity.
enable_monitoring true
monitoring_type prometheus
# Set the buffer type to file to improve the reliability and reduce the memory consumption
buffer_type file
buffer_path /var/log/fluentd-buffers/kubernetes.containers.buffer
# Set queue_full action to block because we want to pause gracefully
# in case of the off-the-limits load instead of throwing an exception
buffer_queue_full_action block
# Set the chunk limit conservatively to avoid exceeding the GCL limit
# of 10MiB per write request.
buffer_chunk_limit 2M
# Cap the combined memory usage of this buffer and the one below to
# 2MiB/chunk * (6 + 2) chunks = 16 MiB
buffer_queue_limit 6
# Never wait more than 5 seconds before flushing logs in the non-error case.
flush_interval 5s
# Never wait longer than 30 seconds between retries.
max_retry_wait 30
# Disable the limit on the number of retries (retry forever).
disable_retry_limit
# Use multiple threads for processing.
num_threads 2
</match>
# Keep a smaller buffer here since these logs are less important than the user's
# container logs.
<match **>
@type copy
@type google_cloud
<store>
@type google_cloud
detect_subservice false
buffer_type file
buffer_path /var/log/fluentd-buffers/kubernetes.system.buffer
buffer_queue_full_action block
buffer_chunk_limit 2M
buffer_queue_limit 2
flush_interval 5s
max_retry_wait 30
disable_retry_limit
num_threads 2
</store>
<store>
@type prometheus
<metric>
type counter
name logging_entry_count
desc Total number of log entries generated by either application containers or system components
<labels>
component system
</labels>
</metric>
</store>
enable_monitoring true
monitoring_type prometheus
detect_subservice false
buffer_type file
buffer_path /var/log/fluentd-buffers/kubernetes.system.buffer
buffer_queue_full_action block
buffer_chunk_limit 2M
buffer_queue_limit 2
flush_interval 5s
max_retry_wait 30
disable_retry_limit
num_threads 2
</match>
metadata:
name: fluentd-gcp-config-v1.1

View File

@ -27,7 +27,7 @@ spec:
hostNetwork: true
containers:
- name: fluentd-gcp
image: gcr.io/google-containers/fluentd-gcp:2.0.7
image: gcr.io/google-containers/fluentd-gcp:2.0.8
# If fluentd consumes its own logs, the following situation may happen:
# fluentd fails to send a chunk to the server => writes it to the log =>
# tries to send this message to the server => fails to send a chunk and so on.
@ -90,13 +90,13 @@ spec:
exit 1;
fi;
- name: prometheus-to-sd-exporter
image: gcr.io/google-containers/prometheus-to-sd:v0.1.0
image: gcr.io/google-containers/prometheus-to-sd:v0.1.3
command:
- /monitor
- --component=fluentd
- --target-port=31337
- --stackdriver-prefix=container.googleapis.com/internal/addons
- --whitelisted-metrics=logging_line_count,logging_entry_count
- --whitelisted-metrics=stackdriver_successful_requests_count,stackdriver_failed_requests_count,stackdriver_ingested_entries_count,stackdriver_dropped_entries_count
volumeMounts:
- name: ssl-certs
mountPath: /etc/ssl/certs