
Commit 31ffd65

Suma R authored and committed
TFA fix to ignore bluestore slowness alert during health checks
Signed-off-by: Suma R <[email protected]>
1 parent 0d0ff63 commit 31ffd65

File tree: 4 files changed, +98 -79 lines changed

tests/cephfs/lib/cephfs_common_lib.py (new file, +59)

@@ -0,0 +1,59 @@
+"""
+This is cephfs utilsV1 extension to include further common reusable methods for FS regression testing
+
+"""
+
+import datetime
+import time
+
+from tests.cephfs.cephfs_utilsV1 import FsUtils
+from utility.log import Log
+
+log = Log(__name__)
+
+
+class CephFSCommonUtils(FsUtils):
+    def __init__(self, ceph_cluster):
+        """
+        FS Utility V2 object
+        Args:
+            ceph_cluster (ceph.ceph.Ceph): ceph cluster
+        """
+        self.ceph_cluster = ceph_cluster
+        super().__init__(ceph_cluster)
+
+    def wait_for_healthy_ceph(self, client, wait_time):
+        """
+        This method will run ceph status and if its not HEALTH_OK, will wait for wait_time for it to be healthy
+        Args:
+        Required:
+        Client : Client object to run command
+        wait_time : Time to wait for HEALTH_OK, in seconds
+
+        Returns 0 if healthy, 1 if unhealthy even after wait_time
+        """
+        ceph_healthy = 0
+        end_time = datetime.datetime.now() + datetime.timedelta(seconds=wait_time)
+        while ceph_healthy == 0 and (datetime.datetime.now() < end_time):
+            try:
+                self.get_ceph_health_status(client)
+                ceph_healthy = 1
+            except Exception as ex:
+                log.info(ex)
+                out, rc = client.exec_command(sudo=True, cmd="ceph health detail")
+                if "experiencing slow operations in BlueStore" in str(out):
+                    log.info("Ignoring the known warning for Bluestore Slow ops")
+                    ceph_healthy = 1
+                else:
+                    log.info(
+                        f"Wait for sometime to check if Cluster health can be OK, current state : {ex}"
+                    )
+                    time.sleep(5)
+
+        if ceph_healthy == 0:
+            client.exec_command(
+                sudo=True,
+                cmd="ceph fs status;ceph -s;ceph health detail",
+            )
+            return 1
+        return 0
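
For orientation, a minimal usage sketch of the new helper (hypothetical, not part of this commit): ensure_cluster_healthy and its arguments are illustrative placeholders, but the call pattern mirrors the updated call sites in the test diffs below.

# Hypothetical sketch: wait_for_healthy_ceph returns 0 once the cluster reports
# HEALTH_OK (or only the known BlueStore slow-ops warning remains) and 1 if it
# is still unhealthy after wait_time_secs, so a truthy return means failure.
from tests.cephfs.lib.cephfs_common_lib import CephFSCommonUtils


def ensure_cluster_healthy(ceph_cluster, client, wait_time_secs=300):
    cephfs_common_utils = CephFSCommonUtils(ceph_cluster)
    if cephfs_common_utils.wait_for_healthy_ceph(client, wait_time_secs):
        raise AssertionError(
            f"Cluster health is not OK even after waiting for {wait_time_secs}secs"
        )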

tests/cephfs/snapshot_clone/cg_snap_system_test.py (+11 -19)

@@ -7,6 +7,7 @@
 from threading import Thread

 from tests.cephfs.cephfs_utilsV1 import FsUtils as FsUtilsv1
+from tests.cephfs.lib.cephfs_common_lib import CephFSCommonUtils
 from tests.cephfs.snapshot_clone.cephfs_cg_io import CG_snap_IO
 from tests.cephfs.snapshot_clone.cg_snap_utils import CG_Snap_Utils
 from utility.log import Log
@@ -29,12 +30,13 @@ def run(ceph_cluster, **kw):
     cancel,reset,include,exclude

     Clean Up:
-    1.
+    1. Umount mountpoints, Destroy subvolumes and subvolumegroups.

     """
     try:
         fs_util_v1 = FsUtilsv1(ceph_cluster)
         cg_snap_util = CG_Snap_Utils(ceph_cluster)
+        cephfs_common_utils = CephFSCommonUtils(ceph_cluster)
         cg_snap_io = CG_snap_IO(ceph_cluster)
         config = kw.get("config")
         clients = ceph_cluster.get_ceph_objects("client")
@@ -110,7 +112,11 @@ def run(ceph_cluster, **kw):
         )
         crash_status_before = fs_util_v1.get_crash_ls_new(client1)
         log.info(f"Crash status before Test: {crash_status_before}")
-        fs_util_v1.get_ceph_health_status(client1)
+        wait_time_secs = 300
+        if cephfs_common_utils.wait_for_healthy_ceph(client1, wait_time_secs):
+            assert (
+                False
+            ), f"Cluster health is not OK even after waiting for {wait_time_secs}secs"
         sv_mixed_list = []
         qs_cnt -= 1
         for i in range(0, qs_cnt):
@@ -161,25 +167,11 @@ def run(ceph_cluster, **kw):
         log.info("Clean Up in progess")
         crash_status_after = fs_util_v1.get_crash_ls_new(client1)
         log.info(f"Crash status after Test: {crash_status_after}")
-        health_wait = 300
-        end_time = datetime.datetime.now() + datetime.timedelta(seconds=health_wait)
-        health_ok = 0
-        wait_time = 0
-        while (datetime.datetime.now() < end_time) and health_ok == 0:
-            try:
-                fs_util_v1.get_ceph_health_status(client1)
-                health_ok = 1
-            except Exception as ex:
-                log.info(
-                    f"Wait for sometime to check if Cluster health can be OK, current state : {ex}"
-                )
-                time.sleep(10)
-                wait_time += 10
-        if health_ok == 0:
+        wait_time_secs = 300
+        if cephfs_common_utils.wait_for_healthy_ceph(client1, wait_time_secs):
             assert (
                 False
-            ), f"Cluster health is not OK even after waiting for {health_wait}secs"
-        log.info(f"Cluster Health is OK in {wait_time}secs")
+            ), f"Cluster health is not OK even after waiting for {wait_time_secs}secs"

         if len(crash_status_after) > len(crash_status_before):
             assert False, "Post test validation failed, please check crash report above"

tests/cephfs/snapshot_clone/cg_snap_test.py (+18 -47)

@@ -14,6 +14,7 @@
 from ceph.ceph import CommandFailed
 from tests.cephfs.cephfs_utilsV1 import FsUtils as FsUtilsv1
 from tests.cephfs.cephfs_volume_management import wait_for_process
+from tests.cephfs.lib.cephfs_common_lib import CephFSCommonUtils
 from tests.cephfs.snapshot_clone.cephfs_cg_io import CG_snap_IO
 from tests.cephfs.snapshot_clone.cg_snap_utils import CG_Snap_Utils
 from utility.log import Log
@@ -106,12 +107,14 @@ def run(ceph_cluster, **kw):
     Run QS IO validation tool on selected quiesce set
     1.Run multiple parallel quiesce calls to same set
     2.Create snapshots when quiesced, wait for sometime and release quiesce
-    Clean Up:
+
+    Clean Up: Umount mountpoints, destroy subvolumes and subvolumegroup

     """
     try:
         test_data = kw.get("test_data")
         fs_util_v1 = FsUtilsv1(ceph_cluster, test_data=test_data)
+        cephfs_common_utils = CephFSCommonUtils(ceph_cluster)
         erasure = (
             FsUtilsv1.get_custom_config_value(test_data, "erasure")
             if test_data
@@ -237,22 +240,17 @@ def run(ceph_cluster, **kw):
         crash_status_before = fs_util_v1.get_crash_ls_new(client1)

         log.info(f"Crash status before Test: {crash_status_before}")
-        end_time = datetime.datetime.now() + datetime.timedelta(seconds=300)
-        ceph_healthy = 0
-        while (datetime.datetime.now() < end_time) and (ceph_healthy == 0):
-            try:
-                fs_util_v1.get_ceph_health_status(client1)
-                ceph_healthy = 1
-            except Exception as ex:
-                log.info(ex)
-                time.sleep(5)
-        if ceph_healthy == 0:
-            assert False, "Ceph Cluster remains unhealthy even after 5mins"
+        wait_time_secs = 300
+        if cephfs_common_utils.wait_for_healthy_ceph(client1, wait_time_secs):
+            assert (
+                False
+            ), f"Cluster health is not OK even after waiting for {wait_time_secs}secs"
         cg_test_params = {
             "ceph_cluster": ceph_cluster,
             "fs_name": default_fs,
             "fs_util": fs_util_v1,
             "cg_snap_util": cg_snap_util,
+            "cephfs_common_utils": cephfs_common_utils,
             "cg_snap_io": cg_snap_io,
             "clients": qs_clients,
             "mgr_node": mgr_node,
@@ -305,11 +303,7 @@ def run(ceph_cluster, **kw):
     finally:
         log.info("Clean Up in progess")
         wait_time_secs = 300
-        if wait_for_healthy_ceph(client1, fs_util_v1, wait_time_secs) == 0:
-            client1.exec_command(
-                sudo=True,
-                cmd="ceph fs status;ceph status -s;ceph health detail",
-            )
+        if cephfs_common_utils.wait_for_healthy_ceph(client1, wait_time_secs):
             assert (
                 False
             ), f"Cluster health is not OK even after waiting for {wait_time_secs}secs"
@@ -2143,6 +2137,7 @@ def cg_snap_interop_1(cg_test_params):
     qs_clients = [client, client1]
     qs_sets = cg_test_params["qs_sets"]
     cg_snap_util = cg_test_params["cg_snap_util"]
+    cephfs_common_utils = cg_test_params["cephfs_common_utils"]
     cg_snap_io = cg_test_params["cg_snap_io"]
     fs_util = cg_test_params["fs_util"]

@@ -2164,13 +2159,13 @@ def cg_snap_interop_1(cg_test_params):
     cmd += '"'
     log.info("Adding 7 MDS to cluster")
     out, rc = client.exec_command(sudo=True, cmd=cmd)
-    if wait_for_healthy_ceph(client1, fs_util, 300) == 0:
+    if cephfs_common_utils.wait_for_healthy_ceph(client, 300):
         return 1
     client.exec_command(
         sudo=True,
         cmd=f"ceph fs set {fs_name} max_mds 4",
     )
-    if wait_for_healthy_ceph(client1, fs_util, 300) == 0:
+    if cephfs_common_utils.wait_for_healthy_ceph(client, 300):
         return 1
     test_fail = 0
     qs_set = random.choice(qs_sets)
@@ -2256,7 +2251,7 @@
         )
         time.sleep(10)

-        if wait_for_healthy_ceph(client1, fs_util, 300) == 0:
+        if cephfs_common_utils.wait_for_healthy_ceph(client, 300):
            log.error("Ceph cluster is not healthy after MDS failover")
            return 1
        log.info("Verify quiesce lifecycle can suceed after mds failover")
@@ -2302,7 +2297,7 @@

        log.info("MDS failover when quiesced: quiesce state is CANCELED")
        time.sleep(10)
-        if wait_for_healthy_ceph(client1, fs_util, 300) == 0:
+        if cephfs_common_utils.wait_for_healthy_ceph(client, 300):
            log.error("Ceph cluster is not healthy after MDS failover")
            return 1
        log.info("Verify quiesce lifecycle can suceed after mds failover")
@@ -2345,7 +2340,7 @@
        log.error(f"qs set {qs_id_val} not reached RELEASED state")

        time.sleep(10)
-        if wait_for_healthy_ceph(client1, fs_util, 300) == 0:
+        if cephfs_common_utils.wait_for_healthy_ceph(client, 300):
            log.error("Ceph cluster is not healthy after MDS failover")
            return 1
        log.info("Verify quiesce lifecycle can suceed after mds failover")
@@ -2389,13 +2384,9 @@
         sudo=True,
         cmd=f"ceph fs set {fs_name} max_mds 2",
     )
-    if wait_for_healthy_ceph(client1, fs_util, 300) == 0:
+    if cephfs_common_utils.wait_for_healthy_ceph(client1, 300):
         log.error("Ceph cluster is not healthy after max_mds set to 2")
         test_fail += 1
-    client.exec_command(
-        sudo=True,
-        cmd=f"ceph fs status {fs_name};ceph status -s;ceph health detail",
-    )
     if cg_test_io_status.value == 1:
         log.error(
             f"CG IO test exits with failure during quiesce test on qs_set-{qs_id_val}"
@@ -3071,23 +3062,3 @@ def wait_for_two_active_mds(client1, fs_name, max_wait_time=180, retry_interval=
         time.sleep(retry_interval)  # Retry after the specified interval

     return False
-
-
-def wait_for_healthy_ceph(client1, fs_util, wait_time_secs):
-    # Returns 1 if healthy, 0 if unhealthy
-    ceph_healthy = 0
-    end_time = datetime.datetime.now() + datetime.timedelta(seconds=wait_time_secs)
-    while ceph_healthy == 0 and (datetime.datetime.now() < end_time):
-        try:
-            fs_util.get_ceph_health_status(client1)
-            ceph_healthy = 1
-        except Exception as ex:
-            log.info(ex)
-            log.info(
-                f"Wait for sometime to check if Cluster health can be OK, current state : {ex}"
-            )
-            time.sleep(5)
-
-    if ceph_healthy == 0:
-        return 0
-    return 1
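
Note on the return convention when reading the call sites above: the removed module-level wait_for_healthy_ceph returned 1 when healthy and 0 when unhealthy, so its callers tested for == 0, whereas CephFSCommonUtils.wait_for_healthy_ceph returns 0 on HEALTH_OK and 1 on timeout, so the updated callers simply treat a truthy return value as failure.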

tests/cephfs/snapshot_clone/snap_schedule_retention_vol_subvol.py (+10 -13)

@@ -1,4 +1,3 @@
-import datetime
 import json
 import random
 import re
@@ -13,6 +12,7 @@
 from ceph.ceph import CommandFailed
 from tests.cephfs.cephfs_utilsV1 import FsUtils as FsUtilsv1
 from tests.cephfs.cephfs_volume_management import wait_for_process
+from tests.cephfs.lib.cephfs_common_lib import CephFSCommonUtils
 from tests.cephfs.snapshot_clone.cephfs_snap_utils import SnapUtils
 from utility.log import Log
 from utility.utils import get_ceph_version_from_cluster
@@ -26,6 +26,7 @@ def run(ceph_cluster, **kw):
     Verify .snap across kernel, fuse and nfs mounts(future-ready) for snaps created by schedule.

     Type - Functional
+
     Workflow1 - snap_sched_vol: Verify Snapshot schedule on volume.Validate snaphots in .snap across
     mount types - kernel,fuse,nfs
     Steps:
@@ -34,9 +35,11 @@ def run(ceph_cluster, **kw):
     3. Verify scheduled snapshtos are getting created.
     4. Validate snapshot schedule by checking if snapshots are created as per schedule.
     5. Verify snap list across all mount types - kernel,nfs,fuse
+
     Workflow2 - snap_sched_subvol: Verify Snapshot schedule on subvolume.Validate snaphots in .snap across
     mount types - kernel,fuse,nfs
     Steps : Repeat workflow1 steps on subvolume
+
     Workflow3 - snap_retention_vol: Verify Snapshot Retention on volume
     Steps:
     1. Create Snapshot schedule on ceph FS volume. Verify snapshot schedule created and active
@@ -86,6 +89,7 @@ def run(ceph_cluster, **kw):
     try:
         test_data = kw.get("test_data")
         fs_util_v1 = FsUtilsv1(ceph_cluster, test_data=test_data)
+        cephfs_common_utils = CephFSCommonUtils(ceph_cluster)
         erasure = (
             FsUtilsv1.get_custom_config_value(test_data, "erasure")
             if test_data
@@ -205,18 +209,11 @@ def run(ceph_cluster, **kw):
             log.info(
                 f"Verify Ceph Status is healthy before starting test {test_case_name}"
             )
-            ceph_healthy = 0
-            end_time = datetime.datetime.now() + datetime.timedelta(seconds=300)
-            while (datetime.datetime.now() < end_time) and (ceph_healthy == 0):
-                try:
-                    fs_util_v1.get_ceph_health_status(client1)
-                    ceph_healthy = 1
-                except Exception as ex:
-                    log.info(ex)
-                    log.info("Wait for few secs and recheck ceph status")
-                    time.sleep(5)
-            if ceph_healthy == 0:
-                assert False, "Ceph remains unhealthy even after wait for 300secs"
+            wait_time_secs = 300
+            if cephfs_common_utils.wait_for_healthy_ceph(client1, wait_time_secs):
+                assert (
+                    False
+                ), f"Cluster health is not OK even after waiting for {wait_time_secs}secs"
             cleanup_params = run_snap_test(snap_test_params)
             log.info(f"post_test_params:{cleanup_params}")
             snap_test_params["export_created"] = cleanup_params["export_created"]
