diff --git a/production/helm/loki/CHANGELOG.md b/production/helm/loki/CHANGELOG.md index 1b68e6a5b1ca8..efc64ff9c3f85 100644 --- a/production/helm/loki/CHANGELOG.md +++ b/production/helm/loki/CHANGELOG.md @@ -13,6 +13,10 @@ Entries should include a reference to the pull request that introduced the chang [//]: # (<AUTOMATED_UPDATES_LOCATOR> : do not remove this line. This locator is used by the CI pipeline to automatically create a changelog entry for each new Loki release. Add other chart versions and respective changelog entries bellow this line.) +## 6.12.1 + +- [ENHANCEMENT] Allow configuration of alert rule severities and thresholds + ## 6.12.0 - [ENHANCEMENT] Replace Bloom Compactor component with Bloom Planner and Bloom Builder. These are the new components to build bloom blocks. @@ -50,7 +54,7 @@ Entries should include a reference to the pull request that introduced the chang ## 6.7.3 - [BUGFIX] Removed Helm test binary - + ## 6.7.2 - [BUGFIX] Fix imagePullSecrets for statefulset-results-cache diff --git a/production/helm/loki/Chart.yaml b/production/helm/loki/Chart.yaml index dcef3406eaac2..b282e580f908a 100644 --- a/production/helm/loki/Chart.yaml +++ b/production/helm/loki/Chart.yaml @@ -3,7 +3,7 @@ name: loki description: Helm chart for Grafana Loki and Grafana Enterprise Logs supporting both simple, scalable and distributed modes. type: application appVersion: 3.1.1 -version: 6.12.0 +version: 6.12.1 home: https://grafana.github.io/helm-charts sources: - https://github.com/grafana/loki diff --git a/production/helm/loki/src/alerts.yaml.tpl b/production/helm/loki/src/alerts.yaml.tpl index 144e263f7061f..d8cc70eb98914 100644 --- a/production/helm/loki/src/alerts.yaml.tpl +++ b/production/helm/loki/src/alerts.yaml.tpl @@ -2,65 +2,75 @@ groups: - name: "loki_alerts" rules: -{{- if not (.Values.monitoring.rules.disabled.LokiRequestErrors | default false) }} +{{- with .Values.monitoring.rules }} +{{- if not (.disabled.LokiRequestErrors | default (not .configs.LokiRequestErrors.enabled)) }} + {{- with .configs.LokiRequestErrors }} - alert: "LokiRequestErrors" annotations: message: | {{`{{`}} $labels.job {{`}}`}} {{`{{`}} $labels.route {{`}}`}} is experiencing {{`{{`}} printf "%.2f" $value {{`}}`}}% errors. expr: | - 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route) + 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[{{ .lookbackPeriod }}])) by (namespace, job, route) / - sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route) - > 10 - for: "15m" + sum(rate(loki_request_duration_seconds_count[{{ .lookbackPeriod }}])) by (namespace, job, route) + > {{ .threshold }} + for: {{ .for }} labels: - severity: "critical" -{{- if .Values.monitoring.rules.additionalRuleLabels }} -{{ toYaml .Values.monitoring.rules.additionalRuleLabels | indent 10 }} + severity: {{ .severity }} + {{- end }} +{{- if .additionalRuleLabels }} +{{ toYaml .additionalRuleLabels | indent 10 }} {{- end }} {{- end }} -{{- if not (.Values.monitoring.rules.disabled.LokiRequestPanics | default false) }} +{{- if not (.disabled.LokiRequestPanics | default (not .configs.LokiRequestPanics.enabled)) }} + {{- with .configs.LokiRequestPanics }} - alert: "LokiRequestPanics" annotations: message: | {{`{{`}} $labels.job {{`}}`}} is experiencing {{`{{`}} printf "%.2f" $value {{`}}`}}% increase of panics. 
expr: | - sum(increase(loki_panic_total[10m])) by (namespace, job) > 0 + sum(increase(loki_panic_total[{{ .lookbackPeriod }}])) by (namespace, job) > {{ .threshold }} labels: - severity: "critical" -{{- if .Values.monitoring.rules.additionalRuleLabels }} -{{ toYaml .Values.monitoring.rules.additionalRuleLabels | indent 10 }} + severity: {{ .severity }} + {{- end }} +{{- if .additionalRuleLabels }} +{{ toYaml .additionalRuleLabels | indent 10 }} {{- end }} {{- end }} -{{- if not (.Values.monitoring.rules.disabled.LokiRequestLatency | default false) }} +{{- if not (.disabled.LokiRequestLatency | default (not .configs.LokiRequestLatency.enabled)) }} + {{- with .configs.LokiRequestLatency }} - alert: "LokiRequestLatency" annotations: message: | {{`{{`}} $labels.job {{`}}`}} {{`{{`}} $labels.route {{`}}`}} is experiencing {{`{{`}} printf "%.2f" $value {{`}}`}}s 99th percentile latency. expr: | - namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*"} > 1 - for: "15m" + namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*"} > {{ .threshold }} + for: {{ .for }} labels: - severity: "critical" -{{- if .Values.monitoring.rules.additionalRuleLabels }} -{{ toYaml .Values.monitoring.rules.additionalRuleLabels | indent 10 }} + severity: {{ .severity }} + {{- end }} +{{- if .additionalRuleLabels }} +{{ toYaml .additionalRuleLabels | indent 10 }} {{- end }} {{- end }} -{{- if not (.Values.monitoring.rules.disabled.LokiTooManyCompactorsRunning | default false) }} +{{- if not (.disabled.LokiTooManyCompactorsRunning | default (not .configs.LokiTooManyCompactorsRunning.enabled)) }} + {{- with .configs.LokiTooManyCompactorsRunning }} - alert: "LokiTooManyCompactorsRunning" annotations: message: | {{`{{`}} $labels.cluster {{`}}`}} {{`{{`}} $labels.namespace {{`}}`}} has had {{`{{`}} printf "%.0f" $value {{`}}`}} compactors running for more than 5m. Only one compactor should run at a time. expr: | sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1 - for: "5m" + for: {{ .for }} labels: - severity: "warning" -{{- if .Values.monitoring.rules.additionalRuleLabels }} -{{ toYaml .Values.monitoring.rules.additionalRuleLabels | indent 10 }} + severity: {{ .severity }} + {{- end }} +{{- if .additionalRuleLabels }} +{{ toYaml .additionalRuleLabels | indent 10 }} {{- end }} {{- end }} -{{- if not (.Values.monitoring.rules.disabled.LokiCanaryLatency | default false) }} +{{- if not (.disabled.LokiCanaryLatency | default (not .configs.LokiCanaryLatency.enabled)) }} + {{- with .configs.LokiCanaryLatency }} - name: "loki_canaries_alerts" rules: - alert: "LokiCanaryLatency" @@ -68,11 +78,13 @@ groups: message: | {{`{{`}} $labels.job {{`}}`}} is experiencing {{`{{`}} printf "%.2f" $value {{`}}`}}s 99th percentile latency. 
         expr: |
-          histogram_quantile(0.99, sum(rate(loki_canary_response_latency_seconds_bucket[5m])) by (le, namespace, job)) > 5
-        for: "15m"
+          histogram_quantile(0.99, sum(rate(loki_canary_response_latency_seconds_bucket[{{ .lookbackPeriod }}])) by (le, namespace, job)) > {{ .threshold }}
+        for: {{ .for }}
         labels:
-          severity: "warning"
-{{- if .Values.monitoring.rules.additionalRuleLabels }}
-{{ toYaml .Values.monitoring.rules.additionalRuleLabels | indent 10 }}
+          severity: {{ .severity }}
+  {{- end }}
+{{- if .additionalRuleLabels }}
+{{ toYaml .additionalRuleLabels | indent 10 }}
+{{- end }}
 {{- end }}
 {{- end }}
diff --git a/production/helm/loki/values.yaml b/production/helm/loki/values.yaml
index ed65339cb33ad..8778810c8103b 100644
--- a/production/helm/loki/values.yaml
+++ b/production/helm/loki/values.yaml
@@ -3344,9 +3344,41 @@ monitoring:
     # -- Specify which individual alerts should be disabled
     # -- Instead of turning off each alert one by one, set the .monitoring.rules.alerting value to false instead.
     # -- If you disable all the alerts and keep .monitoring.rules.alerting set to true, the chart will fail to render.
+    #
+    # -- DEPRECATED: use monitoring.rules.configs.*.enabled instead
     disabled: {}
     # LokiRequestErrors: true
     # LokiRequestPanics: true
+
+    # -- Per-alert rule configuration: enable or disable each alert and tune its severity, threshold, evaluation duration (for) and lookback period
+    configs:
+      LokiRequestErrors:
+        enabled: true
+        for: 15m
+        lookbackPeriod: 2m
+        severity: critical
+        threshold: 10
+      LokiRequestPanics:
+        enabled: true
+        lookbackPeriod: 10m
+        severity: critical
+        threshold: 0
+      LokiRequestLatency:
+        enabled: true
+        for: 15m
+        severity: critical
+        threshold: 1
+      LokiTooManyCompactorsRunning:
+        enabled: true
+        for: 5m
+        severity: warning
+      LokiCanaryLatency:
+        enabled: true
+        for: 15m
+        lookbackPeriod: 5m
+        severity: warning
+        threshold: 5
+
     # -- Alternative namespace to create PrometheusRule resources in
     namespace: null
     # -- Additional annotations for the rules PrometheusRule resource
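
For reference, a user-side values override exercising the new settings might look like the sketch below. The alert names and keys mirror the chart defaults added in this change; the specific override values (warning severity, a 25% threshold, a 10m duration, disabling the panics alert) are purely illustrative:

monitoring:
  rules:
    configs:
      LokiRequestErrors:
        enabled: true
        for: 10m
        lookbackPeriod: 5m
        severity: warning
        threshold: 25
      LokiRequestPanics:
        enabled: false

Note that the deprecated monitoring.rules.disabled map still takes precedence: because of the "| default (not .configs.<alert>.enabled)" expression in the template, setting disabled.LokiRequestErrors to true suppresses the rule even when configs.LokiRequestErrors.enabled is true.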
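
With the chart defaults, the LokiRequestErrors alert should render to roughly the rule below (a sketch only: the surrounding group structure, final indentation, and any additionalRuleLabels are omitted):

- alert: "LokiRequestErrors"
  annotations:
    message: |
      {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
  expr: |
    100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route)
      /
    sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route)
      > 10
  for: 15m
  labels:
    severity: critical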