diff --git a/tests/cephfs/lib/cephfs_common_lib.py b/tests/cephfs/lib/cephfs_common_lib.py
new file mode 100644
index 00000000000..c312af19c79
--- /dev/null
+++ b/tests/cephfs/lib/cephfs_common_lib.py
@@ -0,0 +1,60 @@
+"""
+CephFS utilsV1 extension that adds further common reusable methods for FS regression testing.
+
+"""
+
+import datetime
+import time
+
+from tests.cephfs.cephfs_utilsV1 import FsUtils
+from utility.log import Log
+
+log = Log(__name__)
+
+
+class CephFSCommonUtils(FsUtils):
+    def __init__(self, ceph_cluster):
+        """
+        FS Utility V2 object
+        Args:
+            ceph_cluster (ceph.ceph.Ceph): ceph cluster
+        """
+        self.ceph_cluster = ceph_cluster
+        super().__init__(ceph_cluster)
+
+    def wait_for_healthy_ceph(self, client, wait_time):
+        """
+        Run ceph status and, if it is not HEALTH_OK, wait up to wait_time seconds for it to become healthy.
+        Args:
+        Required:
+        client : Client object to run commands
+        wait_time : Time to wait for HEALTH_OK, in seconds
+
+        Returns 0 if healthy, 1 if unhealthy even after wait_time
+        """
+        ceph_healthy = 0
+        end_time = datetime.datetime.now() + datetime.timedelta(seconds=wait_time)
+        while ceph_healthy == 0 and (datetime.datetime.now() < end_time):
+            try:
+                self.get_ceph_health_status(client)
+                ceph_healthy = 1
+            except Exception as ex:
+                log.info(ex)
+                out, rc = client.exec_command(sudo=True, cmd="ceph health detail")
+                if "experiencing slow operations in BlueStore" in str(out):
+                    log.info("Ignoring the known warning for BlueStore slow ops")
+                    ceph_healthy = 1
+                else:
+                    log.info(
+                        "Waiting to see if cluster health can become OK, current state : %s",
+                        ex,
+                    )
+                    time.sleep(5)
+
+        if ceph_healthy == 0:
+            client.exec_command(
+                sudo=True,
+                cmd="ceph fs status;ceph -s;ceph health detail",
+            )
+            return 1
+        return 0
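
For reference, a minimal usage sketch of the new helper (illustrative only, not part of this
patch): the run() skeleton, the client selection and the 300-second timeout below are
assumptions; CephFSCommonUtils, wait_for_healthy_ceph() and get_ceph_objects("client") come
from the code in this change.

    from tests.cephfs.lib.cephfs_common_lib import CephFSCommonUtils
    from utility.log import Log

    log = Log(__name__)

    def run(ceph_cluster, **kw):
        """Illustrative caller: gate a test on cluster health before starting."""
        cephfs_common_utils = CephFSCommonUtils(ceph_cluster)
        client = ceph_cluster.get_ceph_objects("client")[0]
        # wait_for_healthy_ceph() returns 0 once the cluster reports HEALTH_OK within
        # the timeout and 1 if it stays unhealthy, so a truthy value means failure.
        if cephfs_common_utils.wait_for_healthy_ceph(client, 300):
            log.error("Cluster health is not OK even after waiting for 300secs")
            return 1
        return 0
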
""" try: fs_util_v1 = FsUtilsv1(ceph_cluster) cg_snap_util = CG_Snap_Utils(ceph_cluster) + cephfs_common_utils = CephFSCommonUtils(ceph_cluster) cg_snap_io = CG_snap_IO(ceph_cluster) config = kw.get("config") clients = ceph_cluster.get_ceph_objects("client") @@ -110,7 +112,9 @@ def run(ceph_cluster, **kw): ) crash_status_before = fs_util_v1.get_crash_ls_new(client1) log.info(f"Crash status before Test: {crash_status_before}") - fs_util_v1.get_ceph_health_status(client1) + wait_time_secs = 300 + if cephfs_common_utils.wait_for_healthy_ceph(client1, wait_time_secs): + assert False, "Cluster health is not OK even after waiting for sometime" sv_mixed_list = [] qs_cnt -= 1 for i in range(0, qs_cnt): @@ -161,25 +165,9 @@ def run(ceph_cluster, **kw): log.info("Clean Up in progess") crash_status_after = fs_util_v1.get_crash_ls_new(client1) log.info(f"Crash status after Test: {crash_status_after}") - health_wait = 300 - end_time = datetime.datetime.now() + datetime.timedelta(seconds=health_wait) - health_ok = 0 - wait_time = 0 - while (datetime.datetime.now() < end_time) and health_ok == 0: - try: - fs_util_v1.get_ceph_health_status(client1) - health_ok = 1 - except Exception as ex: - log.info( - f"Wait for sometime to check if Cluster health can be OK, current state : {ex}" - ) - time.sleep(10) - wait_time += 10 - if health_ok == 0: - assert ( - False - ), f"Cluster health is not OK even after waiting for {health_wait}secs" - log.info(f"Cluster Health is OK in {wait_time}secs") + wait_time_secs = 300 + if cephfs_common_utils.wait_for_healthy_ceph(client1, wait_time_secs): + assert False, "Cluster health is not OK even after waiting for sometime" if len(crash_status_after) > len(crash_status_before): assert False, "Post test validation failed, please check crash report above" diff --git a/tests/cephfs/snapshot_clone/cg_snap_test.py b/tests/cephfs/snapshot_clone/cg_snap_test.py index 8c91bf47035..131661c9423 100644 --- a/tests/cephfs/snapshot_clone/cg_snap_test.py +++ b/tests/cephfs/snapshot_clone/cg_snap_test.py @@ -14,6 +14,7 @@ from ceph.ceph import CommandFailed from tests.cephfs.cephfs_utilsV1 import FsUtils as FsUtilsv1 from tests.cephfs.cephfs_volume_management import wait_for_process +from tests.cephfs.lib.cephfs_common_lib import CephFSCommonUtils from tests.cephfs.snapshot_clone.cephfs_cg_io import CG_snap_IO from tests.cephfs.snapshot_clone.cg_snap_utils import CG_Snap_Utils from utility.log import Log @@ -106,12 +107,14 @@ def run(ceph_cluster, **kw): Run QS IO validation tool on selected quiesce set 1.Run multiple parallel quiesce calls to same set 2.Create snapshots when quiesced, wait for sometime and release quiesce - Clean Up: + + Clean Up: Umount mountpoints, destroy subvolumes and subvolumegroup """ try: test_data = kw.get("test_data") fs_util_v1 = FsUtilsv1(ceph_cluster, test_data=test_data) + cephfs_common_utils = CephFSCommonUtils(ceph_cluster) erasure = ( FsUtilsv1.get_custom_config_value(test_data, "erasure") if test_data @@ -237,22 +240,15 @@ def run(ceph_cluster, **kw): crash_status_before = fs_util_v1.get_crash_ls_new(client1) log.info(f"Crash status before Test: {crash_status_before}") - end_time = datetime.datetime.now() + datetime.timedelta(seconds=300) - ceph_healthy = 0 - while (datetime.datetime.now() < end_time) and (ceph_healthy == 0): - try: - fs_util_v1.get_ceph_health_status(client1) - ceph_healthy = 1 - except Exception as ex: - log.info(ex) - time.sleep(5) - if ceph_healthy == 0: - assert False, "Ceph Cluster remains unhealthy even after 5mins" + 
+        wait_time_secs = 300
+        if cephfs_common_utils.wait_for_healthy_ceph(client1, wait_time_secs):
+            assert False, "Cluster health is not OK even after waiting for some time"
         cg_test_params = {
             "ceph_cluster": ceph_cluster,
             "fs_name": default_fs,
             "fs_util": fs_util_v1,
             "cg_snap_util": cg_snap_util,
+            "cephfs_common_utils": cephfs_common_utils,
             "cg_snap_io": cg_snap_io,
             "clients": qs_clients,
             "mgr_node": mgr_node,
@@ -305,11 +301,7 @@ def run(ceph_cluster, **kw):
     finally:
         log.info("Clean Up in progess")
         wait_time_secs = 300
-        if wait_for_healthy_ceph(client1, fs_util_v1, wait_time_secs) == 0:
-            client1.exec_command(
-                sudo=True,
-                cmd="ceph fs status;ceph status -s;ceph health detail",
-            )
+        if cephfs_common_utils.wait_for_healthy_ceph(client1, wait_time_secs):
             assert (
                 False
             ), f"Cluster health is not OK even after waiting for {wait_time_secs}secs"
@@ -2143,6 +2135,7 @@ def cg_snap_interop_1(cg_test_params):
     qs_clients = [client, client1]
     qs_sets = cg_test_params["qs_sets"]
     cg_snap_util = cg_test_params["cg_snap_util"]
+    cephfs_common_utils = cg_test_params["cephfs_common_utils"]
     cg_snap_io = cg_test_params["cg_snap_io"]
     fs_util = cg_test_params["fs_util"]
 
@@ -2164,13 +2157,13 @@ def cg_snap_interop_1(cg_test_params):
         cmd += '"'
     log.info("Adding 7 MDS to cluster")
    out, rc = client.exec_command(sudo=True, cmd=cmd)
-    if wait_for_healthy_ceph(client1, fs_util, 300) == 0:
+    if cephfs_common_utils.wait_for_healthy_ceph(client, 300):
         return 1
     client.exec_command(
         sudo=True,
         cmd=f"ceph fs set {fs_name} max_mds 4",
     )
-    if wait_for_healthy_ceph(client1, fs_util, 300) == 0:
+    if cephfs_common_utils.wait_for_healthy_ceph(client, 300):
         return 1
     test_fail = 0
     qs_set = random.choice(qs_sets)
@@ -2256,7 +2249,7 @@ def cg_snap_interop_1(cg_test_params):
             )
             time.sleep(10)
 
-    if wait_for_healthy_ceph(client1, fs_util, 300) == 0:
+    if cephfs_common_utils.wait_for_healthy_ceph(client, 300):
         log.error("Ceph cluster is not healthy after MDS failover")
         return 1
     log.info("Verify quiesce lifecycle can suceed after mds failover")
@@ -2302,7 +2295,7 @@ def cg_snap_interop_1(cg_test_params):
         log.info("MDS failover when quiesced: quiesce state is CANCELED")
         time.sleep(10)
 
-    if wait_for_healthy_ceph(client1, fs_util, 300) == 0:
+    if cephfs_common_utils.wait_for_healthy_ceph(client, 300):
         log.error("Ceph cluster is not healthy after MDS failover")
         return 1
     log.info("Verify quiesce lifecycle can suceed after mds failover")
@@ -2345,7 +2338,7 @@ def cg_snap_interop_1(cg_test_params):
            log.error(f"qs set {qs_id_val} not reached RELEASED state")
     time.sleep(10)
 
-    if wait_for_healthy_ceph(client1, fs_util, 300) == 0:
+    if cephfs_common_utils.wait_for_healthy_ceph(client, 300):
         log.error("Ceph cluster is not healthy after MDS failover")
         return 1
     log.info("Verify quiesce lifecycle can suceed after mds failover")
@@ -2389,13 +2382,9 @@ def cg_snap_interop_1(cg_test_params):
         sudo=True,
         cmd=f"ceph fs set {fs_name} max_mds 2",
     )
-    if wait_for_healthy_ceph(client1, fs_util, 300) == 0:
+    if cephfs_common_utils.wait_for_healthy_ceph(client, 300):
         log.error("Ceph cluster is not healthy after max_mds set to 2")
         test_fail += 1
-    client.exec_command(
-        sudo=True,
-        cmd=f"ceph fs status {fs_name};ceph status -s;ceph health detail",
-    )
     if cg_test_io_status.value == 1:
         log.error(
             f"CG IO test exits with failure during quiesce test on qs_set-{qs_id_val}"
@@ -3071,23 +3060,3 @@ def wait_for_two_active_mds(client1, fs_name, max_wait_time=180, retry_interval=
         time.sleep(retry_interval)  # Retry after the specified interval
 
     return False
-
-
-def wait_for_healthy_ceph(client1, fs_util, wait_time_secs):
-    # Returns 1 if healthy, 0 if unhealthy
-    ceph_healthy = 0
-    end_time = datetime.datetime.now() + datetime.timedelta(seconds=wait_time_secs)
-    while ceph_healthy == 0 and (datetime.datetime.now() < end_time):
-        try:
-            fs_util.get_ceph_health_status(client1)
-            ceph_healthy = 1
-        except Exception as ex:
-            log.info(ex)
-            log.info(
-                f"Wait for sometime to check if Cluster health can be OK, current state : {ex}"
-            )
-            time.sleep(5)
-
-    if ceph_healthy == 0:
-        return 0
-    return 1
diff --git a/tests/cephfs/snapshot_clone/snap_schedule_retention_vol_subvol.py b/tests/cephfs/snapshot_clone/snap_schedule_retention_vol_subvol.py
index 2bc7064f745..9b60ba2644d 100644
--- a/tests/cephfs/snapshot_clone/snap_schedule_retention_vol_subvol.py
+++ b/tests/cephfs/snapshot_clone/snap_schedule_retention_vol_subvol.py
@@ -1,4 +1,3 @@
-import datetime
 import json
 import random
 import re
@@ -13,6 +12,7 @@
 from ceph.ceph import CommandFailed
 from tests.cephfs.cephfs_utilsV1 import FsUtils as FsUtilsv1
 from tests.cephfs.cephfs_volume_management import wait_for_process
+from tests.cephfs.lib.cephfs_common_lib import CephFSCommonUtils
 from tests.cephfs.snapshot_clone.cephfs_snap_utils import SnapUtils
 from utility.log import Log
 from utility.utils import get_ceph_version_from_cluster
@@ -26,6 +26,7 @@ def run(ceph_cluster, **kw):
     Verify .snap across kernel, fuse and nfs mounts(future-ready) for snaps created by schedule.
 
     Type - Functional
+
     Workflow1 - snap_sched_vol: Verify Snapshot schedule on volume.Validate snaphots in .snap across mount types - kernel,fuse,nfs
     Steps:
@@ -34,9 +35,11 @@ def run(ceph_cluster, **kw):
     3. Verify scheduled snapshtos are getting created.
     4. Validate snapshot schedule by checking if snapshots are created as per schedule.
     5. Verify snap list across all mount types - kernel,nfs,fuse
+
     Workflow2 - snap_sched_subvol: Verify Snapshot schedule on subvolume.Validate snaphots in .snap across mount types - kernel,fuse,nfs
     Steps : Repeat workflow1 steps on subvolume
+
     Workflow3 - snap_retention_vol: Verify Snapshot Retention on volume
     Steps:
     1. Create Snapshot schedule on ceph FS volume. Verify snapshot schedule created and active
@@ -86,6 +89,7 @@ def run(ceph_cluster, **kw):
     try:
         test_data = kw.get("test_data")
         fs_util_v1 = FsUtilsv1(ceph_cluster, test_data=test_data)
+        cephfs_common_utils = CephFSCommonUtils(ceph_cluster)
         erasure = (
             FsUtilsv1.get_custom_config_value(test_data, "erasure")
             if test_data
@@ -205,18 +209,9 @@ def run(ceph_cluster, **kw):
         log.info(
             f"Verify Ceph Status is healthy before starting test {test_case_name}"
         )
-        ceph_healthy = 0
-        end_time = datetime.datetime.now() + datetime.timedelta(seconds=300)
-        while (datetime.datetime.now() < end_time) and (ceph_healthy == 0):
-            try:
-                fs_util_v1.get_ceph_health_status(client1)
-                ceph_healthy = 1
-            except Exception as ex:
-                log.info(ex)
-                log.info("Wait for few secs and recheck ceph status")
-                time.sleep(5)
-        if ceph_healthy == 0:
-            assert False, "Ceph remains unhealthy even after wait for 300secs"
+        wait_time_secs = 300
+        if cephfs_common_utils.wait_for_healthy_ceph(client1, wait_time_secs):
+            assert False, "Cluster health is not OK even after waiting for some time"
         cleanup_params = run_snap_test(snap_test_params)
         log.info(f"post_test_params:{cleanup_params}")
         snap_test_params["export_created"] = cleanup_params["export_created"]
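
A note on the call-site changes above: the removed module-level wait_for_healthy_ceph()
returned 1 for healthy and 0 for unhealthy, so callers tested "== 0", whereas the new
CephFSCommonUtils.wait_for_healthy_ceph() returns 0 for healthy and 1 for unhealthy, so
callers now test the return value directly. A small sketch of the flipped convention; the
ensure_cluster_healthy() wrapper is hypothetical, only the two call patterns come from the
diff itself.

    def ensure_cluster_healthy(cephfs_common_utils, client):
        """Hypothetical wrapper illustrating the new return convention."""
        # Old helper (removed above) returned 1 for healthy, 0 for unhealthy,
        # so call sites looked like:
        #     if wait_for_healthy_ceph(client1, fs_util, 300) == 0:
        #         return 1
        # The new method returns 0 for healthy and 1 for unhealthy, so a truthy
        # return value signals failure:
        if cephfs_common_utils.wait_for_healthy_ceph(client, 300):
            return 1
        return 0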