Skip to content

Commit 7847d49

Browse files
Add obsctl reloader alerting rules (#603)
* Add obsctl-reloader alert rules * Refactor obsctl-reloader alert rules * Fix typo * Prefix alerts for easy eye grepping
1 parent 3bbcaf0 commit 7847d49

5 files changed

+149
-2
lines changed

jsonnetfile.lock.json

+2-2
Original file line numberDiff line numberDiff line change
@@ -395,8 +395,8 @@
395395
"subdir": "jsonnet/lib"
396396
}
397397
},
398-
"version": "c720483113f66880d5d318adf707baf788e7fcfa",
399-
"sum": "NGOtOzgw5pgFCt9+wxrDMTzRcymPxICzarVQPwf7Upk="
398+
"version": "1df7a85a21606d7e1c42262a386dc6b377eb18b7",
399+
"sum": "x00LDrH1x0wQWO95LiFaoUuFIPMWb1Acaem6ARPwaEk="
400400
},
401401
{
402402
"source": {

observability/config.libsonnet

+5
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,11 @@ local var = import 'utils.jsonnet';
4040
instance_name_filter: var.instance_name_filter,
4141
},
4242
},
43+
// Overrides for the obsctl-reloader alerts library; prometheusrules.jsonnet adds
// this object onto the imported alerts.libsonnet before rendering.
obsctlReloader: {
44+
_config+:: {
45+
// Label matcher identifying the obsctl-reloader job; it appears verbatim inside
// the metric selectors of the rendered alert expressions.
obsctlReloaderSelector: 'job="rules-obsctl-reloader"',
46+
},
47+
},
4348
alertmanager: (import 'github.com/prometheus/alertmanager/doc/alertmanager-mixin/config.libsonnet') {
4449
title: 'Alertmanager / Overview',
4550
_config+:: {

observability/prometheusrules.jsonnet

+8
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,8 @@ local appSREOverwrites(environment) = {
9393
std.startsWith(name, 'rhobs-mst') && environment == 'stage' then '92520ea4d6976f30d1618164e186ef9b'
9494
else if
9595
std.startsWith(name, 'gubernator') then 'no-dashboard'
96+
else if
97+
std.startsWith(name, 'obsctl-reloader') then 'no-dashboard'
9698
else if
9799
std.startsWith(name, 'alertmanager') then 'alertmanager-overview'
98100
else error 'no dashboard id for group %s' % name,
@@ -466,3 +468,9 @@ local renderAlerts(name, environment, mixin) = {
466468
'observatorium-http-traffic-stage.prometheusrules': renderAlerts('observatorium-http-traffic-stage', 'stage', httpTrafficMonitoringAlerts),
467469
'observatorium-http-traffic-production.prometheusrules': renderAlerts('observatorium-http-traffic-production', 'production', httpTrafficMonitoringAlerts),
468470
}
471+
472+
// Extra output files for the obsctl-reloader alerts, composed onto the object above.
{
473+
// Upstream alert definitions combined with this repository's config.obsctlReloader overrides.
local obsctlReloader = (import 'github.com/rhobs/obsctl-reloader/jsonnet/lib/alerts.libsonnet') + config.obsctlReloader,
474+
// One rendered PrometheusRule file per environment ('stage' and 'production').
'observatorium-obsctl-reloader-stage.prometheusrules': renderAlerts('obsctl-reloader-stage', 'stage', obsctlReloader),
475+
'observatorium-obsctl-reloader-production.prometheusrules': renderAlerts('obsctl-reloader-production', 'production', obsctlReloader),
476+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
---
2+
$schema: /openshift/prometheus-rule-1.yml
3+
apiVersion: monitoring.coreos.com/v1
4+
kind: PrometheusRule
5+
metadata:
6+
labels:
7+
prometheus: app-sre
8+
role: alert-rules
9+
name: obsctl-reloader-production
10+
spec:
11+
groups:
12+
- name: obsctl-reloader.rules
13+
rules:
14+
- alert: ObsCtlRulesStoreServerError
15+
annotations:
16+
dashboard: https://grafana.app-sre.devshift.net/d/no-dashboard/obsctl-reloader.rules?orgId=1&refresh=10s&var-datasource={{$externalLabels.cluster}}-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
17+
description: Failed to send rules from tenant {{ $labels.tenant }} to store {{ $value | humanizePercentage }} of the time with a 5xx or 4xx status code.
18+
message: Failed to send rules from tenant {{ $labels.tenant }} to store {{ $value | humanizePercentage }} of the time with a 5xx or 4xx status code.
19+
runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#obsctlrulesstoreservererror
20+
summary: Failing to send rules to Observatorium.
# Share of rule-store operations that returned a 4xx/5xx status; fires above 10% for 10m.
# - humanizePercentage already appends "%", so the annotations carry no literal "%".
# - "or" binds looser than ">", so the former "... or vector(0) > 0.10" never applied
#   the threshold to the ratio; the fallback is dropped because 0 can never exceed it.
# - The denominator is aggregated to a single label-less series, so the division must
#   match on() with group_left() to keep the numerator's labels (e.g. tenant).
# NOTE(review): sum_over_time over a *_total series sums raw samples; if this metric
# is a counter, rate()/increase() is probably what is intended - confirm upstream.
21+
expr: |
22+
(
23+
sum_over_time(obsctl_reloader_prom_rules_store_ops_total{status_code=~"5..|4..", job="rules-obsctl-reloader"}[5m])
24+
/ on() group_left()
25+
sum(sum_over_time(obsctl_reloader_prom_rules_store_ops_total{job="rules-obsctl-reloader"}[5m]))
26+
)
27+
> 0.10
28+
for: 10m
29+
labels:
30+
service: telemeter
31+
severity: critical
32+
- alert: ObsCtlRulesSetFailure
33+
annotations:
34+
dashboard: https://grafana.app-sre.devshift.net/d/no-dashboard/obsctl-reloader.rules?orgId=1&refresh=10s&var-datasource={{$externalLabels.cluster}}-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
35+
description: obsctl-reloader is failing to set rules for tenant {{ $labels.tenant }} before reaching Observatorium {{ $value | humanizePercentage }} of the time due to {{ $labels.reason }}.
36+
message: obsctl-reloader is failing to set rules for tenant {{ $labels.tenant }} before reaching Observatorium {{ $value | humanizePercentage }} of the time due to {{ $labels.reason }}.
37+
runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#obsctlrulessetfailure
38+
summary: Failing to set rules due to issue before talking to Observatorium.
# Share of rule-set attempts that failed for a reason other than "rules_store_error";
# fires above 10% for 10m.
# - humanizePercentage already appends "%", so the annotations carry no literal "%".
# - "or" binds looser than ">", so the former "... or vector(0) > 0.10" never applied
#   the threshold to the ratio; the fallback is dropped because 0 can never exceed it.
# NOTE(review): the numerator is filtered on `reason`; if the denominator metric has no
# `reason` label the division needs `ignoring(reason) group_left` - confirm upstream.
39+
expr: |
40+
(
41+
sum_over_time(obsctl_reloader_prom_rule_set_failures_total{reason!="rules_store_error", job="rules-obsctl-reloader"}[5m])
42+
/
43+
sum_over_time(obsctl_reloader_prom_rule_set_total{job="rules-obsctl-reloader"}[5m])
44+
)
45+
> 0.10
46+
for: 10m
47+
labels:
48+
service: telemeter
49+
severity: medium
50+
- alert: ObsCtlFetchRulesFailed
51+
annotations:
52+
dashboard: https://grafana.app-sre.devshift.net/d/no-dashboard/obsctl-reloader.rules?orgId=1&refresh=10s&var-datasource={{$externalLabels.cluster}}-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
53+
description: obsctl-reloader is failing to fetch rules via the PrometheusRule CRD in the local cluster.
54+
message: obsctl-reloader is failing to fetch rules via the PrometheusRule CRD in the local cluster.
55+
runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#obsctlfetchrulesfailed
56+
summary: Failing to fetch rules from the local cluster.
# Share of rule fetches that failed; fires above 20% for 5m.
# "or" binds looser than ">", so the former "... or vector(0) > 0.20" never applied the
# threshold to the ratio; the fallback is dropped because 0 can never exceed it.
57+
expr: |
58+
(
59+
sum_over_time(obsctl_reloader_prom_rule_fetch_failures_total{job="rules-obsctl-reloader"}[5m])
60+
/
61+
sum_over_time(obsctl_reloader_prom_rule_fetches_total{job="rules-obsctl-reloader"}[5m])
62+
)
63+
> 0.20
64+
for: 5m
65+
labels:
66+
service: telemeter
67+
severity: critical
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
---
2+
$schema: /openshift/prometheus-rule-1.yml
3+
apiVersion: monitoring.coreos.com/v1
4+
kind: PrometheusRule
5+
metadata:
6+
labels:
7+
prometheus: app-sre
8+
role: alert-rules
9+
name: obsctl-reloader-stage
10+
spec:
11+
groups:
12+
- name: obsctl-reloader.rules
13+
rules:
14+
- alert: ObsCtlRulesStoreServerError
15+
annotations:
16+
dashboard: https://grafana.app-sre.devshift.net/d/no-dashboard/obsctl-reloader.rules?orgId=1&refresh=10s&var-datasource={{$externalLabels.cluster}}-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
17+
description: Failed to send rules from tenant {{ $labels.tenant }} to store {{ $value | humanizePercentage }} of the time with a 5xx or 4xx status code.
18+
message: Failed to send rules from tenant {{ $labels.tenant }} to store {{ $value | humanizePercentage }} of the time with a 5xx or 4xx status code.
19+
runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#obsctlrulesstoreservererror
20+
summary: Failing to send rules to Observatorium.
# Share of rule-store operations that returned a 4xx/5xx status; fires above 10% for 10m.
# - humanizePercentage already appends "%", so the annotations carry no literal "%".
# - "or" binds looser than ">", so the former "... or vector(0) > 0.10" never applied
#   the threshold to the ratio; the fallback is dropped because 0 can never exceed it.
# - The denominator is aggregated to a single label-less series, so the division must
#   match on() with group_left() to keep the numerator's labels (e.g. tenant).
# NOTE(review): sum_over_time over a *_total series sums raw samples; if this metric
# is a counter, rate()/increase() is probably what is intended - confirm upstream.
21+
expr: |
22+
(
23+
sum_over_time(obsctl_reloader_prom_rules_store_ops_total{status_code=~"5..|4..", job="rules-obsctl-reloader"}[5m])
24+
/ on() group_left()
25+
sum(sum_over_time(obsctl_reloader_prom_rules_store_ops_total{job="rules-obsctl-reloader"}[5m]))
26+
)
27+
> 0.10
28+
for: 10m
29+
labels:
30+
service: telemeter
31+
severity: high
32+
- alert: ObsCtlRulesSetFailure
33+
annotations:
34+
dashboard: https://grafana.app-sre.devshift.net/d/no-dashboard/obsctl-reloader.rules?orgId=1&refresh=10s&var-datasource={{$externalLabels.cluster}}-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
35+
description: obsctl-reloader is failing to set rules for tenant {{ $labels.tenant }} before reaching Observatorium {{ $value | humanizePercentage }} of the time due to {{ $labels.reason }}.
36+
message: obsctl-reloader is failing to set rules for tenant {{ $labels.tenant }} before reaching Observatorium {{ $value | humanizePercentage }} of the time due to {{ $labels.reason }}.
37+
runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#obsctlrulessetfailure
38+
summary: Failing to set rules due to issue before talking to Observatorium.
# Share of rule-set attempts that failed for a reason other than "rules_store_error";
# fires above 10% for 10m.
# - humanizePercentage already appends "%", so the annotations carry no literal "%".
# - "or" binds looser than ">", so the former "... or vector(0) > 0.10" never applied
#   the threshold to the ratio; the fallback is dropped because 0 can never exceed it.
# NOTE(review): the numerator is filtered on `reason`; if the denominator metric has no
# `reason` label the division needs `ignoring(reason) group_left` - confirm upstream.
39+
expr: |
40+
(
41+
sum_over_time(obsctl_reloader_prom_rule_set_failures_total{reason!="rules_store_error", job="rules-obsctl-reloader"}[5m])
42+
/
43+
sum_over_time(obsctl_reloader_prom_rule_set_total{job="rules-obsctl-reloader"}[5m])
44+
)
45+
> 0.10
46+
for: 10m
47+
labels:
48+
service: telemeter
49+
severity: medium
50+
- alert: ObsCtlFetchRulesFailed
51+
annotations:
52+
dashboard: https://grafana.app-sre.devshift.net/d/no-dashboard/obsctl-reloader.rules?orgId=1&refresh=10s&var-datasource={{$externalLabels.cluster}}-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
53+
description: obsctl-reloader is failing to fetch rules via the PrometheusRule CRD in the local cluster.
54+
message: obsctl-reloader is failing to fetch rules via the PrometheusRule CRD in the local cluster.
55+
runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#obsctlfetchrulesfailed
56+
summary: Failing to fetch rules from the local cluster.
# Share of rule fetches that failed; fires above 20% for 5m.
# "or" binds looser than ">", so the former "... or vector(0) > 0.20" never applied the
# threshold to the ratio; the fallback is dropped because 0 can never exceed it.
57+
expr: |
58+
(
59+
sum_over_time(obsctl_reloader_prom_rule_fetch_failures_total{job="rules-obsctl-reloader"}[5m])
60+
/
61+
sum_over_time(obsctl_reloader_prom_rule_fetches_total{job="rules-obsctl-reloader"}[5m])
62+
)
63+
> 0.20
64+
for: 5m
65+
labels:
66+
service: telemeter
67+
severity: high

0 commit comments

Comments
 (0)