TFA fix to ignore bluestore slowness alert during health checks #4536

Merged 1 commit on Mar 12, 2025
60 changes: 60 additions & 0 deletions tests/cephfs/lib/cephfs_common_lib.py
@@ -0,0 +1,60 @@
"""
This is cephfs utilsV1 extension to include further common reusable methods for FS regression testing

"""

import datetime
import time

from tests.cephfs.cephfs_utilsV1 import FsUtils
from utility.log import Log

log = Log(__name__)


class CephFSCommonUtils(FsUtils):
def __init__(self, ceph_cluster):
"""
FS Utility V2 object
Args:
ceph_cluster (ceph.ceph.Ceph): ceph cluster
"""
self.ceph_cluster = ceph_cluster
super().__init__(ceph_cluster)

def wait_for_healthy_ceph(self, client, wait_time):
"""
This method will run ceph status and if its not HEALTH_OK, will wait for wait_time for it to be healthy
Args:
Required:
Client : Client object to run command
wait_time : Time to wait for HEALTH_OK, in seconds

Returns 0 if healthy, 1 if unhealthy even after wait_time
"""
ceph_healthy = 0
end_time = datetime.datetime.now() + datetime.timedelta(seconds=wait_time)
while ceph_healthy == 0 and (datetime.datetime.now() < end_time):
try:
self.get_ceph_health_status(client)
ceph_healthy = 1
except Exception as ex:
log.info(ex)
out, rc = client.exec_command(sudo=True, cmd="ceph health detail")
if "experiencing slow operations in BlueStore" in str(out):
log.info("Ignoring the known warning for Bluestore Slow ops")
ceph_healthy = 1
else:
log.info(
"Wait for sometime to check if Cluster health can be OK, current state : %s",
ex,
)
time.sleep(5)

if ceph_healthy == 0:
client.exec_command(
sudo=True,
cmd="ceph fs status;ceph -s;ceph health detail",
)
return 1
return 0
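
For reference, a minimal usage sketch of the new helper (hypothetical caller, not part of this diff; it mirrors the call sites updated in the test modules below):

# Hypothetical caller (illustration only): gate a test on cluster health,
# tolerating the known BlueStore slow-ops warning.
from tests.cephfs.lib.cephfs_common_lib import CephFSCommonUtils


def run(ceph_cluster, **kw):
    cephfs_common_utils = CephFSCommonUtils(ceph_cluster)
    client1 = ceph_cluster.get_ceph_objects("client")[0]
    wait_time_secs = 300
    # Returns 0 if HEALTH_OK (or only the BlueStore slow-ops warning remains),
    # 1 if the cluster is still unhealthy after wait_time_secs.
    if cephfs_common_utils.wait_for_healthy_ceph(client1, wait_time_secs):
        return 1
    # ... actual test steps go here ...
    return 0
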
30 changes: 9 additions & 21 deletions tests/cephfs/snapshot_clone/cg_snap_system_test.py
@@ -7,6 +7,7 @@
from threading import Thread

from tests.cephfs.cephfs_utilsV1 import FsUtils as FsUtilsv1
from tests.cephfs.lib.cephfs_common_lib import CephFSCommonUtils
from tests.cephfs.snapshot_clone.cephfs_cg_io import CG_snap_IO
from tests.cephfs.snapshot_clone.cg_snap_utils import CG_Snap_Utils
from utility.log import Log
@@ -29,12 +30,13 @@ def run(ceph_cluster, **kw):
cancel,reset,include,exclude

Clean Up:
1.
1. Umount mountpoints, Destroy subvolumes and subvolumegroups.

"""
try:
fs_util_v1 = FsUtilsv1(ceph_cluster)
cg_snap_util = CG_Snap_Utils(ceph_cluster)
cephfs_common_utils = CephFSCommonUtils(ceph_cluster)
cg_snap_io = CG_snap_IO(ceph_cluster)
config = kw.get("config")
clients = ceph_cluster.get_ceph_objects("client")
@@ -110,7 +112,9 @@ def run(ceph_cluster, **kw):
)
crash_status_before = fs_util_v1.get_crash_ls_new(client1)
log.info(f"Crash status before Test: {crash_status_before}")
fs_util_v1.get_ceph_health_status(client1)
wait_time_secs = 300
if cephfs_common_utils.wait_for_healthy_ceph(client1, wait_time_secs):
assert False, "Cluster health is not OK even after waiting for sometime"
sv_mixed_list = []
qs_cnt -= 1
for i in range(0, qs_cnt):
@@ -161,25 +165,9 @@ def run(ceph_cluster, **kw):
log.info("Clean Up in progess")
crash_status_after = fs_util_v1.get_crash_ls_new(client1)
log.info(f"Crash status after Test: {crash_status_after}")
health_wait = 300
end_time = datetime.datetime.now() + datetime.timedelta(seconds=health_wait)
health_ok = 0
wait_time = 0
while (datetime.datetime.now() < end_time) and health_ok == 0:
try:
fs_util_v1.get_ceph_health_status(client1)
health_ok = 1
except Exception as ex:
log.info(
f"Wait for sometime to check if Cluster health can be OK, current state : {ex}"
)
time.sleep(10)
wait_time += 10
if health_ok == 0:
assert (
False
), f"Cluster health is not OK even after waiting for {health_wait}secs"
log.info(f"Cluster Health is OK in {wait_time}secs")
wait_time_secs = 300
if cephfs_common_utils.wait_for_healthy_ceph(client1, wait_time_secs):
assert False, "Cluster health is not OK even after waiting for some time"

if len(crash_status_after) > len(crash_status_before):
assert False, "Post test validation failed, please check crash report above"
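
Pieced together from the hunks above, the cleanup block in cg_snap_system_test.py now reads roughly as follows (indentation approximated):

# Cleanup: re-check crash reports and cluster health via the shared helper
log.info("Clean Up in progress")
crash_status_after = fs_util_v1.get_crash_ls_new(client1)
log.info(f"Crash status after Test: {crash_status_after}")
wait_time_secs = 300
if cephfs_common_utils.wait_for_healthy_ceph(client1, wait_time_secs):
    assert False, "Cluster health is not OK even after waiting for some time"

if len(crash_status_after) > len(crash_status_before):
    assert False, "Post test validation failed, please check crash report above"
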
63 changes: 16 additions & 47 deletions tests/cephfs/snapshot_clone/cg_snap_test.py
@@ -14,6 +14,7 @@
from ceph.ceph import CommandFailed
from tests.cephfs.cephfs_utilsV1 import FsUtils as FsUtilsv1
from tests.cephfs.cephfs_volume_management import wait_for_process
from tests.cephfs.lib.cephfs_common_lib import CephFSCommonUtils
from tests.cephfs.snapshot_clone.cephfs_cg_io import CG_snap_IO
from tests.cephfs.snapshot_clone.cg_snap_utils import CG_Snap_Utils
from utility.log import Log
@@ -106,12 +107,14 @@ def run(ceph_cluster, **kw):
Run QS IO validation tool on selected quiesce set
1. Run multiple parallel quiesce calls to the same set
2. Create snapshots when quiesced, wait for some time and release quiesce
Clean Up:

Clean Up: Umount mountpoints, destroy subvolumes and subvolumegroup

"""
try:
test_data = kw.get("test_data")
fs_util_v1 = FsUtilsv1(ceph_cluster, test_data=test_data)
cephfs_common_utils = CephFSCommonUtils(ceph_cluster)
erasure = (
FsUtilsv1.get_custom_config_value(test_data, "erasure")
if test_data
@@ -237,22 +240,15 @@ def run(ceph_cluster, **kw):
crash_status_before = fs_util_v1.get_crash_ls_new(client1)

log.info(f"Crash status before Test: {crash_status_before}")
end_time = datetime.datetime.now() + datetime.timedelta(seconds=300)
ceph_healthy = 0
while (datetime.datetime.now() < end_time) and (ceph_healthy == 0):
try:
fs_util_v1.get_ceph_health_status(client1)
ceph_healthy = 1
except Exception as ex:
log.info(ex)
time.sleep(5)
if ceph_healthy == 0:
assert False, "Ceph Cluster remains unhealthy even after 5mins"
wait_time_secs = 300
if cephfs_common_utils.wait_for_healthy_ceph(client1, wait_time_secs):
assert False, "Cluster health is not OK even after waiting for some time"
cg_test_params = {
"ceph_cluster": ceph_cluster,
"fs_name": default_fs,
"fs_util": fs_util_v1,
"cg_snap_util": cg_snap_util,
"cephfs_common_utils": cephfs_common_utils,
"cg_snap_io": cg_snap_io,
"clients": qs_clients,
"mgr_node": mgr_node,
@@ -305,11 +301,7 @@ def run(ceph_cluster, **kw):
finally:
log.info("Clean Up in progess")
wait_time_secs = 300
if wait_for_healthy_ceph(client1, fs_util_v1, wait_time_secs) == 0:
client1.exec_command(
sudo=True,
cmd="ceph fs status;ceph status -s;ceph health detail",
)
if cephfs_common_utils.wait_for_healthy_ceph(client1, wait_time_secs):
assert (
False
), f"Cluster health is not OK even after waiting for {wait_time_secs}secs"
@@ -2143,6 +2135,7 @@ def cg_snap_interop_1(cg_test_params):
qs_clients = [client, client1]
qs_sets = cg_test_params["qs_sets"]
cg_snap_util = cg_test_params["cg_snap_util"]
cephfs_common_utils = cg_test_params["cephfs_common_utils"]
cg_snap_io = cg_test_params["cg_snap_io"]
fs_util = cg_test_params["fs_util"]

@@ -2164,13 +2157,13 @@
cmd += '"'
log.info("Adding 7 MDS to cluster")
out, rc = client.exec_command(sudo=True, cmd=cmd)
if wait_for_healthy_ceph(client1, fs_util, 300) == 0:
if cephfs_common_utils.wait_for_healthy_ceph(client, 300):
return 1
client.exec_command(
sudo=True,
cmd=f"ceph fs set {fs_name} max_mds 4",
)
if wait_for_healthy_ceph(client1, fs_util, 300) == 0:
if cephfs_common_utils.wait_for_healthy_ceph(client, 300):
return 1
test_fail = 0
qs_set = random.choice(qs_sets)
@@ -2256,7 +2249,7 @@
)
time.sleep(10)

if wait_for_healthy_ceph(client1, fs_util, 300) == 0:
if cephfs_common_utils.wait_for_healthy_ceph(client, 300):
log.error("Ceph cluster is not healthy after MDS failover")
return 1
log.info("Verify quiesce lifecycle can suceed after mds failover")
@@ -2302,7 +2295,7 @@

log.info("MDS failover when quiesced: quiesce state is CANCELED")
time.sleep(10)
if wait_for_healthy_ceph(client1, fs_util, 300) == 0:
if cephfs_common_utils.wait_for_healthy_ceph(client, 300):
log.error("Ceph cluster is not healthy after MDS failover")
return 1
log.info("Verify quiesce lifecycle can suceed after mds failover")
@@ -2345,7 +2338,7 @@
log.error(f"qs set {qs_id_val} not reached RELEASED state")

time.sleep(10)
if wait_for_healthy_ceph(client1, fs_util, 300) == 0:
if cephfs_common_utils.wait_for_healthy_ceph(client, 300):
log.error("Ceph cluster is not healthy after MDS failover")
return 1
log.info("Verify quiesce lifecycle can suceed after mds failover")
@@ -2389,13 +2382,9 @@
sudo=True,
cmd=f"ceph fs set {fs_name} max_mds 2",
)
if wait_for_healthy_ceph(client1, fs_util, 300) == 0:
if cephfs_common_utils.wait_for_healthy_ceph(client, 300):
log.error("Ceph cluster is not healthy after max_mds set to 2")
test_fail += 1
client.exec_command(
sudo=True,
cmd=f"ceph fs status {fs_name};ceph status -s;ceph health detail",
)
if cg_test_io_status.value == 1:
log.error(
f"CG IO test exits with failure during quiesce test on qs_set-{qs_id_val}"
@@ -3071,23 +3060,3 @@ def wait_for_two_active_mds(client1, fs_name, max_wait_time=180, retry_interval=
time.sleep(retry_interval) # Retry after the specified interval

return False


def wait_for_healthy_ceph(client1, fs_util, wait_time_secs):
# Returns 1 if healthy, 0 if unhealthy
ceph_healthy = 0
end_time = datetime.datetime.now() + datetime.timedelta(seconds=wait_time_secs)
while ceph_healthy == 0 and (datetime.datetime.now() < end_time):
try:
fs_util.get_ceph_health_status(client1)
ceph_healthy = 1
except Exception as ex:
log.info(ex)
log.info(
f"Wait for sometime to check if Cluster health can be OK, current state : {ex}"
)
time.sleep(5)

if ceph_healthy == 0:
return 0
return 1
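
In cg_snap_test.py the shared helper reaches the nested test functions through the cg_test_params dict; roughly, reassembled from the hunks above (fragment, indentation approximated):

# In run(), the shared utils object rides along in the params dict ...
cg_test_params = {
    "ceph_cluster": ceph_cluster,
    "fs_name": default_fs,
    "fs_util": fs_util_v1,
    "cg_snap_util": cg_snap_util,
    "cephfs_common_utils": cephfs_common_utils,
    "cg_snap_io": cg_snap_io,
    "clients": qs_clients,
    "mgr_node": mgr_node,
    # ... remaining keys unchanged ...
}

# ... and in cg_snap_interop_1() it is pulled back out, replacing the deleted
# module-local wait_for_healthy_ceph() helper:
cephfs_common_utils = cg_test_params["cephfs_common_utils"]
if cephfs_common_utils.wait_for_healthy_ceph(client, 300):
    return 1
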
@@ -1,4 +1,3 @@
import datetime
import json
import random
import re
@@ -13,6 +12,7 @@
from ceph.ceph import CommandFailed
from tests.cephfs.cephfs_utilsV1 import FsUtils as FsUtilsv1
from tests.cephfs.cephfs_volume_management import wait_for_process
from tests.cephfs.lib.cephfs_common_lib import CephFSCommonUtils
from tests.cephfs.snapshot_clone.cephfs_snap_utils import SnapUtils
from utility.log import Log
from utility.utils import get_ceph_version_from_cluster
@@ -26,6 +26,7 @@ def run(ceph_cluster, **kw):
Verify .snap across kernel, fuse and nfs mounts (future-ready) for snaps created by schedule.

Type - Functional

Workflow1 - snap_sched_vol: Verify Snapshot schedule on volume. Validate snapshots in .snap across
mount types - kernel,fuse,nfs
Steps:
@@ -34,9 +35,11 @@
3. Verify scheduled snapshots are getting created.
4. Validate snapshot schedule by checking if snapshots are created as per schedule.
5. Verify snap list across all mount types - kernel,nfs,fuse

Workflow2 - snap_sched_subvol: Verify Snapshot schedule on subvolume. Validate snapshots in .snap across
mount types - kernel,fuse,nfs
Steps : Repeat workflow1 steps on subvolume

Workflow3 - snap_retention_vol: Verify Snapshot Retention on volume
Steps:
1. Create Snapshot schedule on ceph FS volume. Verify snapshot schedule created and active
@@ -86,6 +89,7 @@ def run(ceph_cluster, **kw):
try:
test_data = kw.get("test_data")
fs_util_v1 = FsUtilsv1(ceph_cluster, test_data=test_data)
cephfs_common_utils = CephFSCommonUtils(ceph_cluster)
erasure = (
FsUtilsv1.get_custom_config_value(test_data, "erasure")
if test_data
@@ -205,18 +209,9 @@ def run(ceph_cluster, **kw):
log.info(
f"Verify Ceph Status is healthy before starting test {test_case_name}"
)
ceph_healthy = 0
end_time = datetime.datetime.now() + datetime.timedelta(seconds=300)
while (datetime.datetime.now() < end_time) and (ceph_healthy == 0):
try:
fs_util_v1.get_ceph_health_status(client1)
ceph_healthy = 1
except Exception as ex:
log.info(ex)
log.info("Wait for few secs and recheck ceph status")
time.sleep(5)
if ceph_healthy == 0:
assert False, "Ceph remains unhealthy even after wait for 300secs"
wait_time_secs = 300
if cephfs_common_utils.wait_for_healthy_ceph(client1, wait_time_secs):
assert False, "Cluster health is not OK even after waiting for some time"
cleanup_params = run_snap_test(snap_test_params)
log.info(f"post_test_params:{cleanup_params}")
snap_test_params["export_created"] = cleanup_params["export_created"]
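
For context, the snapshot-schedule workflows listed in the docstring above boil down to mgr snap_schedule commands along these lines (illustrative sketch, not part of this PR; the paths, interval and retention values are made up, and exact command syntax can vary across Ceph releases):

# Illustrative only: schedule hourly snapshots on the volume root, keep the
# last 5 hourly snapshots, then confirm the schedule is active.
client1.exec_command(sudo=True, cmd="ceph fs snap-schedule add / 1h")
client1.exec_command(sudo=True, cmd="ceph fs snap-schedule retention add / h 5")
out, rc = client1.exec_command(sudo=True, cmd="ceph fs snap-schedule status /")
log.info(f"snap-schedule status: {out}")
# Scheduled snapshots then appear under <mountpoint>/.snap on kernel, fuse and nfs mounts.
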