@@ -2,77 +2,89 @@
 groups:
   - name: "loki_alerts"
     rules:
-{{- if not (.Values.monitoring.rules.disabled.LokiRequestErrors | default false) }}
+{{- with .Values.monitoring.rules }}
+{{- if not (.disabled.LokiRequestErrors | default (not .configs.LokiRequestErrors.enabled)) }}
+  {{- with .configs.LokiRequestErrors }}
       - alert: "LokiRequestErrors"
         annotations:
           message: |
             {{`{{`}} $labels.job {{`}}`}} {{`{{`}} $labels.route {{`}}`}} is experiencing {{`{{`}} printf "%.2f" $value {{`}}`}}% errors.
         expr: |
-          100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route)
+          100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[{{ .lookbackPeriod }}])) by (namespace, job, route)
             /
-          sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route)
-            > 10
-        for: "15m"
+          sum(rate(loki_request_duration_seconds_count[{{ .lookbackPeriod }}])) by (namespace, job, route)
+            > {{ .threshold }}
+        for: {{ .for }}
         labels:
-          severity: "critical"
-{{- if .Values.monitoring.rules.additionalRuleLabels }}
-{{ toYaml .Values.monitoring.rules.additionalRuleLabels | indent 10 }}
+          severity: {{ .severity }}
+  {{- end }}
+{{- if .additionalRuleLabels }}
+{{ toYaml .additionalRuleLabels | indent 10 }}
 {{- end }}
 {{- end }}
-{{- if not (.Values.monitoring.rules.disabled.LokiRequestPanics | default false) }}
+{{- if not (.disabled.LokiRequestPanics | default (not .configs.LokiRequestPanics.enabled)) }}
+  {{- with .configs.LokiRequestPanics }}
       - alert: "LokiRequestPanics"
         annotations:
           message: |
             {{`{{`}} $labels.job {{`}}`}} is experiencing {{`{{`}} printf "%.2f" $value {{`}}`}}% increase of panics.
         expr: |
-          sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
+          sum(increase(loki_panic_total[{{ .lookbackPeriod }}])) by (namespace, job) > {{ .threshold }}
         labels:
-          severity: "critical"
-{{- if .Values.monitoring.rules.additionalRuleLabels }}
-{{ toYaml .Values.monitoring.rules.additionalRuleLabels | indent 10 }}
+          severity: {{ .severity }}
+  {{- end }}
+{{- if .additionalRuleLabels }}
+{{ toYaml .additionalRuleLabels | indent 10 }}
 {{- end }}
 {{- end }}
-{{- if not (.Values.monitoring.rules.disabled.LokiRequestLatency | default false) }}
+{{- if not (.disabled.LokiRequestLatency | default (not .configs.LokiRequestLatency.enabled)) }}
+  {{- with .configs.LokiRequestLatency }}
      - alert: "LokiRequestLatency"
        annotations:
          message: |
            {{`{{`}} $labels.job {{`}}`}} {{`{{`}} $labels.route {{`}}`}} is experiencing {{`{{`}} printf "%.2f" $value {{`}}`}}s 99th percentile latency.
        expr: |
-          namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*"} > 1
-        for: "15m"
+          namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*"} > {{ .threshold }}
+        for: {{ .for }}
        labels:
-          severity: "critical"
-{{- if .Values.monitoring.rules.additionalRuleLabels }}
-{{ toYaml .Values.monitoring.rules.additionalRuleLabels | indent 10 }}
+          severity: {{ .severity }}
+  {{- end }}
+{{- if .additionalRuleLabels }}
+{{ toYaml .additionalRuleLabels | indent 10 }}
 {{- end }}
 {{- end }}
-{{- if not (.Values.monitoring.rules.disabled.LokiTooManyCompactorsRunning | default false) }}
+{{- if not (.disabled.LokiTooManyCompactorsRunning | default (not .configs.LokiTooManyCompactorsRunning.enabled)) }}
+  {{- with .configs.LokiTooManyCompactorsRunning }}
       - alert: "LokiTooManyCompactorsRunning"
         annotations:
           message: |
            {{`{{`}} $labels.cluster {{`}}`}} {{`{{`}} $labels.namespace {{`}}`}} has had {{`{{`}} printf "%.0f" $value {{`}}`}} compactors running for more than 5m. Only one compactor should run at a time.
         expr: |
           sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1
-        for: "5m"
+        for: {{ .for }}
         labels:
-          severity: "warning"
-{{- if .Values.monitoring.rules.additionalRuleLabels }}
-{{ toYaml .Values.monitoring.rules.additionalRuleLabels | indent 10 }}
+          severity: {{ .severity }}
+  {{- end }}
+{{- if .additionalRuleLabels }}
+{{ toYaml .additionalRuleLabels | indent 10 }}
 {{- end }}
 {{- end }}
-{{- if not (.Values.monitoring.rules.disabled.LokiCanaryLatency | default false) }}
+{{- if not (.disabled.LokiCanaryLatency | default (not .configs.LokiCanaryLatency.enabled)) }}
+  {{- with .configs.LokiCanaryLatency }}
   - name: "loki_canaries_alerts"
     rules:
       - alert: "LokiCanaryLatency"
         annotations:
           message: |
             {{`{{`}} $labels.job {{`}}`}} is experiencing {{`{{`}} printf "%.2f" $value {{`}}`}}s 99th percentile latency.
         expr: |
-          histogram_quantile(0.99, sum(rate(loki_canary_response_latency_seconds_bucket[5m])) by (le, namespace, job)) > 5
-        for: "15m"
+          histogram_quantile(0.99, sum(rate(loki_canary_response_latency_seconds_bucket[5m])) by (le, namespace, job)) > {{ .threshold }}
+        for: {{ .for }}
         labels:
-          severity: "warning"
-{{- if .Values.monitoring.rules.additionalRuleLabels }}
-{{ toYaml .Values.monitoring.rules.additionalRuleLabels | indent 10 }}
+          severity: {{ .severity }}
+  {{- end }}
+{{- if .additionalRuleLabels }}
+{{ toYaml .additionalRuleLabels | indent 10 }}
+{{- end }}
 {{- end }}
 {{- end }}
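For reference, the rewritten template looks up per-alert settings under .Values.monitoring.rules.configs (enabled, lookbackPeriod, threshold, for, severity), alongside the existing disabled and additionalRuleLabels keys. A minimal values sketch that would satisfy the new conditions is shown below; the defaults simply mirror the previously hard-coded numbers and are assumptions for illustration, not values taken from the chart's shipped values.yaml:

monitoring:
  rules:
    additionalRuleLabels: {}
    disabled: {}                 # legacy per-alert switch, still honored by the .disabled check
    configs:
      LokiRequestErrors:
        enabled: true
        lookbackPeriod: 2m       # assumed default, mirrors the old hard-coded [2m]
        threshold: 10            # assumed default, mirrors the old "> 10"
        for: 15m
        severity: critical
      LokiRequestPanics:
        enabled: true
        lookbackPeriod: 10m      # mirrors the old [10m]
        threshold: 0             # mirrors the old "> 0"
        severity: critical
      LokiRequestLatency:
        enabled: true
        threshold: 1             # seconds, mirrors the old "> 1"
        for: 15m
        severity: critical
      LokiTooManyCompactorsRunning:
        enabled: true
        for: 5m
        severity: warning
      LokiCanaryLatency:
        enabled: true
        threshold: 5             # seconds, mirrors the old "> 5"
        for: 15m
        severity: warning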