feat(helm): Allow configurable severity and thresholds for alerts #13730

Status: Open · wants to merge 1 commit into base: main
6 changes: 5 additions & 1 deletion production/helm/loki/CHANGELOG.md
@@ -13,6 +13,10 @@ Entries should include a reference to the pull request that introduced the change

[//]: # (<AUTOMATED_UPDATES_LOCATOR> : do not remove this line. This locator is used by the CI pipeline to automatically create a changelog entry for each new Loki release. Add other chart versions and respective changelog entries bellow this line.)

## 6.12.1

- [ENHANCEMENT] Allow configuration of alert rule severities and thresholds

## 6.12.0

- [ENHANCEMENT] Replace Bloom Compactor component with Bloom Planner and Bloom Builder. These are the new components to build bloom blocks.
@@ -50,7 +54,7 @@ Entries should include a reference to the pull request that introduced the change
## 6.7.3

- [BUGFIX] Removed Helm test binary

## 6.7.2

- [BUGFIX] Fix imagePullSecrets for statefulset-results-cache
2 changes: 1 addition & 1 deletion production/helm/loki/Chart.yaml
@@ -3,7 +3,7 @@ name: loki
description: Helm chart for Grafana Loki and Grafana Enterprise Logs supporting both simple, scalable and distributed modes.
type: application
appVersion: 3.1.1
version: 6.12.0
version: 6.12.1
home: https://grafana.github.io/helm-charts
sources:
- https://github.com/grafana/loki
72 changes: 42 additions & 30 deletions production/helm/loki/src/alerts.yaml.tpl
@@ -2,77 +2,89 @@
groups:
- name: "loki_alerts"
rules:
{{- if not (.Values.monitoring.rules.disabled.LokiRequestErrors | default false) }}
{{- with .Values.monitoring.rules }}
{{- if not (.disabled.LokiRequestErrors | default (not .configs.LokiRequestErrors.enabled)) }}
{{- with .configs.LokiRequestErrors }}
- alert: "LokiRequestErrors"
annotations:
message: |
{{`{{`}} $labels.job {{`}}`}} {{`{{`}} $labels.route {{`}}`}} is experiencing {{`{{`}} printf "%.2f" $value {{`}}`}}% errors.
expr: |
100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route)
100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[{{ .lookbackPeriod }}])) by (namespace, job, route)
/
sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route)
> 10
for: "15m"
sum(rate(loki_request_duration_seconds_count[{{ .lookbackPeriod }}])) by (namespace, job, route)
> {{ .threshold }}
for: {{ .for }}
labels:
severity: "critical"
{{- if .Values.monitoring.rules.additionalRuleLabels }}
{{ toYaml .Values.monitoring.rules.additionalRuleLabels | indent 10 }}
severity: {{ .severity }}
{{- end }}
{{- if .additionalRuleLabels }}
{{ toYaml .additionalRuleLabels | indent 10 }}
{{- end }}
{{- end }}
{{- if not (.Values.monitoring.rules.disabled.LokiRequestPanics | default false) }}
{{- if not (.disabled.LokiRequestPanics | default (not .configs.LokiRequestPanics.enabled)) }}
{{- with .configs.LokiRequestPanics }}
- alert: "LokiRequestPanics"
annotations:
message: |
{{`{{`}} $labels.job {{`}}`}} is experiencing {{`{{`}} printf "%.2f" $value {{`}}`}}% increase of panics.
expr: |
sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
sum(increase(loki_panic_total[{{ .lookbackPeriod }}])) by (namespace, job) > {{ .threshold }}
labels:
severity: "critical"
{{- if .Values.monitoring.rules.additionalRuleLabels }}
{{ toYaml .Values.monitoring.rules.additionalRuleLabels | indent 10 }}
severity: {{ .severity }}
{{- end }}
{{- if .additionalRuleLabels }}
{{ toYaml .additionalRuleLabels | indent 10 }}
{{- end }}
{{- end }}
{{- if not (.Values.monitoring.rules.disabled.LokiRequestLatency | default false) }}
{{- if not (.disabled.LokiRequestLatency | default (not .configs.LokiRequestLatency.enabled)) }}
{{- with .configs.LokiRequestLatency }}
- alert: "LokiRequestLatency"
annotations:
message: |
{{`{{`}} $labels.job {{`}}`}} {{`{{`}} $labels.route {{`}}`}} is experiencing {{`{{`}} printf "%.2f" $value {{`}}`}}s 99th percentile latency.
expr: |
namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*"} > 1
for: "15m"
namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*"} > {{ .threshold }}
for: {{ .for }}
labels:
severity: "critical"
{{- if .Values.monitoring.rules.additionalRuleLabels }}
{{ toYaml .Values.monitoring.rules.additionalRuleLabels | indent 10 }}
severity: {{ .severity }}
{{- end }}
{{- if .additionalRuleLabels }}
{{ toYaml .additionalRuleLabels | indent 10 }}
{{- end }}
{{- end }}
{{- if not (.Values.monitoring.rules.disabled.LokiTooManyCompactorsRunning | default false) }}
{{- if not (.disabled.LokiTooManyCompactorsRunning | default (not .configs.LokiTooManyCompactorsRunning.enabled)) }}
{{- with .configs.LokiTooManyCompactorsRunning }}
- alert: "LokiTooManyCompactorsRunning"
annotations:
message: |
{{`{{`}} $labels.cluster {{`}}`}} {{`{{`}} $labels.namespace {{`}}`}} has had {{`{{`}} printf "%.0f" $value {{`}}`}} compactors running for more than 5m. Only one compactor should run at a time.
expr: |
sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1
for: "5m"
for: {{ .for }}
labels:
severity: "warning"
{{- if .Values.monitoring.rules.additionalRuleLabels }}
{{ toYaml .Values.monitoring.rules.additionalRuleLabels | indent 10 }}
severity: {{ .severity }}
{{- end }}
{{- if .additionalRuleLabels }}
{{ toYaml .additionalRuleLabels | indent 10 }}
{{- end }}
{{- end }}
{{- if not (.Values.monitoring.rules.disabled.LokiCanaryLatency | default false) }}
{{- if not (.disabled.LokiCanaryLatency | default (not .configs.LokiCanaryLatency.enabled)) }}
{{- with .configs.LokiCanaryLatency }}
- name: "loki_canaries_alerts"
rules:
- alert: "LokiCanaryLatency"
annotations:
message: |
{{`{{`}} $labels.job {{`}}`}} is experiencing {{`{{`}} printf "%.2f" $value {{`}}`}}s 99th percentile latency.
expr: |
histogram_quantile(0.99, sum(rate(loki_canary_response_latency_seconds_bucket[5m])) by (le, namespace, job)) > 5
for: "15m"
histogram_quantile(0.99, sum(rate(loki_canary_response_latency_seconds_bucket[5m])) by (le, namespace, job)) > {{ .threshold }}
for: {{ .for }}
labels:
severity: "warning"
{{- if .Values.monitoring.rules.additionalRuleLabels }}
{{ toYaml .Values.monitoring.rules.additionalRuleLabels | indent 10 }}
severity: {{ .severity }}
{{- end }}
{{- if .additionalRuleLabels }}
{{ toYaml .additionalRuleLabels | indent 10 }}
{{- end }}
{{- end }}
{{- end }}
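
For orientation: the new guard keeps the deprecated `disabled` map working. An explicit `disabled.LokiRequestErrors: true` still suppresses the rule; when it is unset, the rule is emitted only while `configs.LokiRequestErrors.enabled` is true. With the chart defaults added to `values.yaml` below (threshold 10, lookbackPeriod 2m, for 15m, severity critical), the LokiRequestErrors rule should render roughly as follows — a sketch of the expected output, not an excerpt from the chart:

# Sketch: LokiRequestErrors as rendered with the default configs values
- alert: "LokiRequestErrors"
  annotations:
    message: |
      {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
  expr: |
    100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route)
      /
    sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route)
      > 10
  for: 15m
  labels:
    severity: critical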
31 changes: 31 additions & 0 deletions production/helm/loki/values.yaml
@@ -3344,9 +3344,40 @@ monitoring:
# -- Specify which individual alerts should be disabled
# -- Instead of turning off each alert one by one, set the .monitoring.rules.alerting value to false instead.
# -- If you disable all the alerts and keep .monitoring.rules.alerting set to true, the chart will fail to render.
#
# -- DEPRECATED: use monitoring.rules.configs.*.enabled instead
disabled: {}
# LokiRequestErrors: true
# LokiRequestPanics: true

configs:
LokiRequestErrors:
enabled: true
for: 15m
lookbackPeriod: 2m
severity: critical
threshold: 10
LokiRequestPanics:
enabled: true
lookbackPeriod: 10m
severity: critical
threshold: 0
LokiRequestLatency:
enabled: true
for: 15m
severity: critical
threshold: 1
LokiTooManyCompactorsRunning:
enabled: true
for: 5m
severity: warning
LokiCanaryLatency:
enabled: true
for: 15m
lookbackPeriod: 5m
severity: warning
threshold: 5

# -- Alternative namespace to create PrometheusRule resources in
namespace: null
# -- Additional annotations for the rules PrometheusRule resource
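
As a usage illustration (the file name and command are hypothetical; the keys come from the new configs block above), an override file could tune or disable individual alerts:

# my-values.yaml -- hypothetical override, applied with: helm upgrade loki grafana/loki -f my-values.yaml
monitoring:
  rules:
    configs:
      LokiRequestErrors:
        severity: warning    # downgrade from the default critical
        threshold: 25        # alert only above a 25% 5xx error rate
        lookbackPeriod: 5m
        for: 30m
      LokiCanaryLatency:
        enabled: false       # replaces the deprecated disabled.LokiCanaryLatency: true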