red-hat-storage · aruniiird · Oct 22, 2024
diff --git a/metrics/deploy/prometheus-ocs-rules.yaml b/metrics/deploy/prometheus-ocs-rules.yaml
@@ -407,13 +407,22 @@ spec:
     - alert: MDSCPUUsageHigh
       annotations:
+        # The leading-dash trim markers on the template actions below strip the
+        # whitespace/newline in front of the action, so the rendered description
+        # has no stray line break and the rendered runbook_url no trailing space.
         description: |-
-          Ceph metadata server pod ({{ $labels.pod }}) has high cpu usage.
-          Please consider increasing the CPU request for the {{ $labels.pod }} pod as described in the runbook.
+          Ceph metadata server pod ({{ $labels.pod }}) has high cpu usage
+          {{- if query "rate(ceph_mds_request[6h]) >= 1000" }} and cannot cope
+          with the current rate of mds requests. Please consider Horizontal
+          scaling, by adding another MDS pod{{ else }}. Please consider Vertical
+          scaling, by adding more resources to the existing MDS pod{{ end }}.
+          Please see 'runbook_url' for more details.
         message: Ceph metadata server pod ({{ $labels.pod }}) has high cpu usage
-        runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/openshift-container-storage-operator/CephMdsCpuUsageHigh.md
+        runbook_url: '{{ if query "rate(ceph_mds_request[6h]) >= 1000" }}https://github.com/openshift/runbooks/blob/master/alerts/openshift-container-storage-operator/CephMdsCpuUsageHighNeedsHorizontalScaling.md
+          {{- else }}https://github.com/openshift/runbooks/blob/master/alerts/openshift-container-storage-operator/CephMdsCpuUsageHighNeedsVerticalScaling.md
+          {{- end }}'
         severity_level: warning
       expr: |
-        pod:container_cpu_usage:sum{pod=~"rook-ceph-mds.*"}/ on(pod) kube_pod_resource_limit{resource='cpu',pod=~"rook-ceph-mds.*"} > 0.67
+        label_replace(pod:container_cpu_usage:sum{pod=~"rook-ceph-mds.*"}/ on(pod, namespace) kube_pod_resource_limit{resource='cpu',pod=~"rook-ceph-mds.*"}, "ceph_daemon", "mds.$1", "pod", "rook-ceph-mds-(.*)-(.*)") + on (ceph_daemon, namespace) group_left(managedBy) (0 * (ceph_mds_metadata ==1)) > 0.67
       for: 6h
       labels:
         severity: warning