feat(helm): Allow configurable severity and thresholds for alerts #13730

Status: Open · wants to merge 1 commit into base: main
6 changes: 5 additions & 1 deletion production/helm/loki/CHANGELOG.md
@@ -13,6 +13,10 @@ Entries should include a reference to the pull request that introduced the change

[//]: # (<AUTOMATED_UPDATES_LOCATOR> : do not remove this line. This locator is used by the CI pipeline to automatically create a changelog entry for each new Loki release. Add other chart versions and respective changelog entries bellow this line.)

## 6.12.1

- [ENHANCEMENT] Allow configuration of alert rule severities and thresholds

## 6.12.0

- [ENHANCEMENT] Replace Bloom Compactor component with Bloom Planner and Bloom Builder. These are the new components to build bloom blocks.
@@ -50,7 +54,7 @@ Entries should include a reference to the pull request that introduced the change
## 6.7.3

- [BUGFIX] Removed Helm test binary

## 6.7.2

- [BUGFIX] Fix imagePullSecrets for statefulset-results-cache
2 changes: 1 addition & 1 deletion production/helm/loki/Chart.yaml
@@ -3,7 +3,7 @@ name: loki
description: Helm chart for Grafana Loki and Grafana Enterprise Logs supporting both simple, scalable and distributed modes.
type: application
appVersion: 3.1.1
version: 6.12.0
version: 6.12.1
home: https://grafana.github.io/helm-charts
sources:
- https://github.com/grafana/loki
72 changes: 42 additions & 30 deletions production/helm/loki/src/alerts.yaml.tpl
@@ -2,77 +2,89 @@
groups:
- name: "loki_alerts"
rules:
{{- if not (.Values.monitoring.rules.disabled.LokiRequestErrors | default false) }}
{{- with .Values.monitoring.rules }}
{{- if not (.disabled.LokiRequestErrors | default (not .configs.LokiRequestErrors.enabled)) }}
{{- with .configs.LokiRequestErrors }}
- alert: "LokiRequestErrors"
annotations:
message: |
{{`{{`}} $labels.job {{`}}`}} {{`{{`}} $labels.route {{`}}`}} is experiencing {{`{{`}} printf "%.2f" $value {{`}}`}}% errors.
expr: |
100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route)
100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[{{ .lookbackPeriod }}])) by (namespace, job, route)
/
sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route)
> 10
for: "15m"
sum(rate(loki_request_duration_seconds_count[{{ .lookbackPeriod }}])) by (namespace, job, route)
> {{ .threshold }}
for: {{ .for }}
labels:
severity: "critical"
{{- if .Values.monitoring.rules.additionalRuleLabels }}
{{ toYaml .Values.monitoring.rules.additionalRuleLabels | indent 10 }}
severity: {{ .severity }}
{{- end }}
{{- if .additionalRuleLabels }}
{{ toYaml .additionalRuleLabels | indent 10 }}
{{- end }}
{{- end }}
{{- if not (.Values.monitoring.rules.disabled.LokiRequestPanics | default false) }}
{{- if not (.disabled.LokiRequestPanics | default (not .configs.LokiRequestPanics.enabled)) }}
{{- with .configs.LokiRequestPanics }}
- alert: "LokiRequestPanics"
annotations:
message: |
{{`{{`}} $labels.job {{`}}`}} is experiencing {{`{{`}} printf "%.2f" $value {{`}}`}}% increase of panics.
expr: |
sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
sum(increase(loki_panic_total[{{ .lookbackPeriod }}])) by (namespace, job) > {{ .threshold }}
labels:
severity: "critical"
{{- if .Values.monitoring.rules.additionalRuleLabels }}
{{ toYaml .Values.monitoring.rules.additionalRuleLabels | indent 10 }}
severity: {{ .severity }}
{{- end }}
{{- if .additionalRuleLabels }}
{{ toYaml .additionalRuleLabels | indent 10 }}
{{- end }}
{{- end }}
{{- if not (.Values.monitoring.rules.disabled.LokiRequestLatency | default false) }}
{{- if not (.disabled.LokiRequestLatency | default (not .configs.LokiRequestLatency.enabled)) }}
{{- with .configs.LokiRequestLatency }}
- alert: "LokiRequestLatency"
annotations:
message: |
{{`{{`}} $labels.job {{`}}`}} {{`{{`}} $labels.route {{`}}`}} is experiencing {{`{{`}} printf "%.2f" $value {{`}}`}}s 99th percentile latency.
expr: |
namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*"} > 1
for: "15m"
namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*"} > {{ .threshold }}
for: {{ .for }}
labels:
severity: "critical"
{{- if .Values.monitoring.rules.additionalRuleLabels }}
{{ toYaml .Values.monitoring.rules.additionalRuleLabels | indent 10 }}
severity: {{ .severity }}
{{- end }}
{{- if .additionalRuleLabels }}
{{ toYaml .additionalRuleLabels | indent 10 }}
{{- end }}
{{- end }}
{{- if not (.Values.monitoring.rules.disabled.LokiTooManyCompactorsRunning | default false) }}
{{- if not (.disabled.LokiTooManyCompactorsRunning | default (not .configs.LokiTooManyCompactorsRunning.enabled)) }}
{{- with .configs.LokiTooManyCompactorsRunning }}
- alert: "LokiTooManyCompactorsRunning"
annotations:
message: |
{{`{{`}} $labels.cluster {{`}}`}} {{`{{`}} $labels.namespace {{`}}`}} has had {{`{{`}} printf "%.0f" $value {{`}}`}} compactors running for more than 5m. Only one compactor should run at a time.
expr: |
sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1
for: "5m"
for: {{ .for }}
labels:
severity: "warning"
{{- if .Values.monitoring.rules.additionalRuleLabels }}
{{ toYaml .Values.monitoring.rules.additionalRuleLabels | indent 10 }}
severity: {{ .severity }}
{{- end }}
{{- if .additionalRuleLabels }}
{{ toYaml .additionalRuleLabels | indent 10 }}
{{- end }}
{{- end }}
{{- if not (.Values.monitoring.rules.disabled.LokiCanaryLatency | default false) }}
{{- if not (.disabled.LokiCanaryLatency | default (not .configs.LokiCanaryLatency.enabled)) }}
{{- with .configs.LokiCanaryLatency }}
- name: "loki_canaries_alerts"
rules:
- alert: "LokiCanaryLatency"
annotations:
message: |
{{`{{`}} $labels.job {{`}}`}} is experiencing {{`{{`}} printf "%.2f" $value {{`}}`}}s 99th percentile latency.
expr: |
histogram_quantile(0.99, sum(rate(loki_canary_response_latency_seconds_bucket[5m])) by (le, namespace, job)) > 5
for: "15m"
histogram_quantile(0.99, sum(rate(loki_canary_response_latency_seconds_bucket[5m])) by (le, namespace, job)) > {{ .threshold }}
for: {{ .for }}
labels:
severity: "warning"
{{- if .Values.monitoring.rules.additionalRuleLabels }}
{{ toYaml .Values.monitoring.rules.additionalRuleLabels | indent 10 }}
severity: {{ .severity }}
{{- end }}
{{- if .additionalRuleLabels }}
{{ toYaml .additionalRuleLabels | indent 10 }}
{{- end }}
{{- end }}
{{- end }}
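
For orientation: the new guard keeps the deprecated `disabled` map working. An explicit `disabled.LokiRequestErrors: true` still suppresses the rule; when it is unset, the rule is emitted only while `configs.LokiRequestErrors.enabled` is true. With the chart defaults added to `values.yaml` below (threshold 10, lookbackPeriod 2m, for 15m, severity critical), the LokiRequestErrors rule should render roughly as follows — a sketch of the expected output, not an excerpt from the chart:

# Sketch: LokiRequestErrors as rendered with the default configs values
- alert: "LokiRequestErrors"
  annotations:
    message: |
      {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
  expr: |
    100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route)
      /
    sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route)
      > 10
  for: 15m
  labels:
    severity: critical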
31 changes: 31 additions & 0 deletions production/helm/loki/values.yaml
@@ -3344,9 +3344,40 @@ monitoring:
# -- Specify which individual alerts should be disabled
# -- Instead of turning off each alert one by one, set the .monitoring.rules.alerting value to false instead.
# -- If you disable all the alerts and keep .monitoring.rules.alerting set to true, the chart will fail to render.
#
# -- DEPRECATED: use monitoring.rules.configs.*.enabled instead
disabled: {}
# LokiRequestErrors: true
# LokiRequestPanics: true

configs:
LokiRequestErrors:
enabled: true
for: 15m
lookbackPeriod: 2m
severity: critical
threshold: 10
LokiRequestPanics:
enabled: true
lookbackPeriod: 10m
severity: critical
threshold: 0
LokiRequestLatency:
enabled: true
for: 15m
severity: critical
threshold: 1
LokiTooManyCompactorsRunning:
enabled: true
for: 5m
severity: warning
LokiCanaryLatency:
enabled: true
for: 15m
lookbackPeriod: 5m
severity: warning
threshold: 5

# -- Alternative namespace to create PrometheusRule resources in
namespace: null
# -- Additional annotations for the rules PrometheusRule resource
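
As a usage illustration (the file name and command are hypothetical; the keys come from the new configs block above), an override file could tune or disable individual alerts:

# my-values.yaml -- hypothetical override, applied with: helm upgrade loki grafana/loki -f my-values.yaml
monitoring:
  rules:
    configs:
      LokiRequestErrors:
        severity: warning    # downgrade from the default critical
        threshold: 25        # alert only above a 25% 5xx error rate
        lookbackPeriod: 5m
        for: 30m
      LokiCanaryLatency:
        enabled: false       # replaces the deprecated disabled.LokiCanaryLatency: true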