Skip to content

Commit d35288c

Browse files
committed
feat(helm): Allow configurable severity and thresholds for alerts
1 parent 08e61ca commit d35288c

File tree

4 files changed

+79
-32
lines changed

4 files changed

+79
-32
lines changed

production/helm/loki/CHANGELOG.md

+5-1
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,10 @@ Entries should include a reference to the pull request that introduced the chang
1313

1414
[//]: # (<AUTOMATED_UPDATES_LOCATOR> : do not remove this line. This locator is used by the CI pipeline to automatically create a changelog entry for each new Loki release. Add other chart versions and respective changelog entries bellow this line.)
1515

16+
## 6.12.1
17+
18+
- [ENHANCEMENT] Allow configuration of alert rule severities and thresholds
19+
1620
## 6.12.0
1721

1822
- [ENHANCEMENT] Replace Bloom Compactor component with Bloom Planner and Bloom Builder. These are the new components to build bloom blocks.
@@ -50,7 +54,7 @@ Entries should include a reference to the pull request that introduced the chang
5054
## 6.7.3
5155

5256
- [BUGFIX] Removed Helm test binary
53-
57+
5458
## 6.7.2
5559

5660
- [BUGFIX] Fix imagePullSecrets for statefulset-results-cache

production/helm/loki/Chart.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ name: loki
33
description: Helm chart for Grafana Loki and Grafana Enterprise Logs supporting both simple, scalable and distributed modes.
44
type: application
55
appVersion: 3.1.1
6-
version: 6.12.0
6+
version: 6.12.1
77
home: https://grafana.github.io/helm-charts
88
sources:
99
- https://github.com/grafana/loki

production/helm/loki/src/alerts.yaml.tpl

+42-30
Original file line numberDiff line numberDiff line change
@@ -2,77 +2,89 @@
22
groups:
33
- name: "loki_alerts"
44
rules:
5-
{{- if not (.Values.monitoring.rules.disabled.LokiRequestErrors | default false) }}
5+
{{- with .Values.monitoring.rules }}
6+
{{- if not (.disabled.LokiRequestErrors | default (not .configs.LokiRequestErrors.enabled)) }}
7+
{{- with .configs.LokiRequestErrors }}
68
- alert: "LokiRequestErrors"
79
annotations:
810
message: |
911
{{`{{`}} $labels.job {{`}}`}} {{`{{`}} $labels.route {{`}}`}} is experiencing {{`{{`}} printf "%.2f" $value {{`}}`}}% errors.
1012
expr: |
11-
100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route)
13+
100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[{{ .lookbackPeriod }}])) by (namespace, job, route)
1214
/
13-
sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route)
14-
> 10
15-
for: "15m"
15+
sum(rate(loki_request_duration_seconds_count[{{ .lookbackPeriod }}])) by (namespace, job, route)
16+
> {{ .threshold }}
17+
for: {{ .for | quote }}
1618
labels:
17-
severity: "critical"
18-
{{- if .Values.monitoring.rules.additionalRuleLabels }}
19-
{{ toYaml .Values.monitoring.rules.additionalRuleLabels | indent 10 }}
19+
severity: {{ .severity | quote }}
20+
{{- end }}
21+
{{- if .additionalRuleLabels }}
22+
{{ toYaml .additionalRuleLabels | indent 10 }}
2023
{{- end }}
2124
{{- end }}
22-
{{- if not (.Values.monitoring.rules.disabled.LokiRequestPanics | default false) }}
25+
{{- if not (.disabled.LokiRequestPanics | default (not .configs.LokiRequestPanics.enabled)) }}
26+
{{- with .configs.LokiRequestPanics }}
2327
- alert: "LokiRequestPanics"
2428
annotations:
2529
message: |
2630
{{`{{`}} $labels.job {{`}}`}} is experiencing {{`{{`}} printf "%.2f" $value {{`}}`}}% increase of panics.
2731
expr: |
28-
sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
32+
sum(increase(loki_panic_total[{{ .lookbackPeriod }}])) by (namespace, job) > {{ .threshold }}
2933
labels:
30-
severity: "critical"
31-
{{- if .Values.monitoring.rules.additionalRuleLabels }}
32-
{{ toYaml .Values.monitoring.rules.additionalRuleLabels | indent 10 }}
34+
severity: {{ .severity | quote }}
35+
{{- end }}
36+
{{- if .additionalRuleLabels }}
37+
{{ toYaml .additionalRuleLabels | indent 10 }}
3338
{{- end }}
3439
{{- end }}
35-
{{- if not (.Values.monitoring.rules.disabled.LokiRequestLatency | default false) }}
40+
{{- if not (.disabled.LokiRequestLatency | default (not .configs.LokiRequestLatency.enabled)) }}
41+
{{- with .configs.LokiRequestLatency }}
3642
- alert: "LokiRequestLatency"
3743
annotations:
3844
message: |
3945
{{`{{`}} $labels.job {{`}}`}} {{`{{`}} $labels.route {{`}}`}} is experiencing {{`{{`}} printf "%.2f" $value {{`}}`}}s 99th percentile latency.
4046
expr: |
41-
namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*"} > 1
42-
for: "15m"
47+
namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*"} > {{ .threshold }}
48+
for: {{ .for | quote }}
4349
labels:
44-
severity: "critical"
45-
{{- if .Values.monitoring.rules.additionalRuleLabels }}
46-
{{ toYaml .Values.monitoring.rules.additionalRuleLabels | indent 10 }}
50+
severity: {{ .severity | quote }}
51+
{{- end }}
52+
{{- if .additionalRuleLabels }}
53+
{{ toYaml .additionalRuleLabels | indent 10 }}
4754
{{- end }}
4855
{{- end }}
49-
{{- if not (.Values.monitoring.rules.disabled.LokiTooManyCompactorsRunning | default false) }}
56+
{{- if not (.disabled.LokiTooManyCompactorsRunning | default (not .configs.LokiTooManyCompactorsRunning.enabled)) }}
57+
{{- with .configs.LokiTooManyCompactorsRunning }}
5058
- alert: "LokiTooManyCompactorsRunning"
5159
annotations:
5260
message: |
5361
{{`{{`}} $labels.cluster {{`}}`}} {{`{{`}} $labels.namespace {{`}}`}} has had {{`{{`}} printf "%.0f" $value {{`}}`}} compactors running for more than {{ .for }}. Only one compactor should run at a time.
5462
expr: |
5563
sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1
56-
for: "5m"
64+
for: {{ .for | quote }}
5765
labels:
58-
severity: "warning"
59-
{{- if .Values.monitoring.rules.additionalRuleLabels }}
60-
{{ toYaml .Values.monitoring.rules.additionalRuleLabels | indent 10 }}
66+
severity: {{ .severity | quote }}
67+
{{- end }}
68+
{{- if .additionalRuleLabels }}
69+
{{ toYaml .additionalRuleLabels | indent 10 }}
6170
{{- end }}
6271
{{- end }}
63-
{{- if not (.Values.monitoring.rules.disabled.LokiCanaryLatency | default false) }}
72+
{{- if not (.disabled.LokiCanaryLatency | default (not .configs.LokiCanaryLatency.enabled)) }}
73+
{{- with .configs.LokiCanaryLatency }}
6474
- name: "loki_canaries_alerts"
6575
rules:
6676
- alert: "LokiCanaryLatency"
6777
annotations:
6878
message: |
6979
{{`{{`}} $labels.job {{`}}`}} is experiencing {{`{{`}} printf "%.2f" $value {{`}}`}}s 99th percentile latency.
7080
expr: |
71-
histogram_quantile(0.99, sum(rate(loki_canary_response_latency_seconds_bucket[5m])) by (le, namespace, job)) > 5
72-
for: "15m"
81+
histogram_quantile(0.99, sum(rate(loki_canary_response_latency_seconds_bucket[{{ .lookbackPeriod }}])) by (le, namespace, job)) > {{ .threshold }}
82+
for: {{ .for | quote }}
7383
labels:
74-
severity: "warning"
75-
{{- if .Values.monitoring.rules.additionalRuleLabels }}
76-
{{ toYaml .Values.monitoring.rules.additionalRuleLabels | indent 10 }}
84+
severity: {{ .severity | quote }}
85+
{{- end }}
86+
{{- if .additionalRuleLabels }}
87+
{{ toYaml .additionalRuleLabels | indent 10 }}
88+
{{- end }}
7789
{{- end }}
7890
{{- end }}

production/helm/loki/values.yaml

+31
Original file line numberDiff line numberDiff line change
@@ -3344,9 +3344,40 @@ monitoring:
33443344
# -- Specify which individual alerts should be disabled
33453345
# -- Instead of turning off each alert one by one, set the .monitoring.rules.alerting value to false instead.
33463346
# -- If you disable all the alerts and keep .monitoring.rules.alerting set to true, the chart will fail to render.
3347+
#
3348+
# -- DEPRECATED: use monitoring.rules.configs.*.enabled instead
33473349
disabled: {}
33483350
# LokiRequestErrors: true
33493351
# LokiRequestPanics: true
3352+
3353+
# -- Per-alert settings, keyed by alert name. `enabled` turns the alert on or off
# (a `true` entry for the same alert under `disabled` still suppresses it),
# `for` sets the rule's `for` duration, `severity` sets the `severity` label,
# `threshold` is the value the expression is compared against, and
# `lookbackPeriod` is the range used in the expression where the rule defines one.
configs:
3354+
LokiRequestErrors:
3355+
enabled: true
3356+
for: 15m
3357+
lookbackPeriod: 2m
3358+
severity: critical
3359+
threshold: 10
3360+
LokiRequestPanics:
3361+
enabled: true
3362+
lookbackPeriod: 10m
3363+
severity: critical
3364+
threshold: 0
3365+
LokiRequestLatency:
3366+
enabled: true
3367+
for: 15m
3368+
severity: critical
3369+
threshold: 1
3370+
LokiTooManyCompactorsRunning:
3371+
enabled: true
3372+
for: 5m
3373+
severity: warning
3374+
LokiCanaryLatency:
3375+
enabled: true
3376+
for: 15m
3377+
lookbackPeriod: 5m
3378+
severity: warning
3379+
threshold: 5
3380+
33503381
# -- Alternative namespace to create PrometheusRule resources in
33513382
namespace: null
33523383
# -- Additional annotations for the rules PrometheusRule resource

0 commit comments

Comments
 (0)