
Commit 31ffd65

Suma R authored and committed
TFA fix to ignore bluestore slowness alert during health checks
Signed-off-by: Suma R <[email protected]>
1 parent 0d0ff63 commit 31ffd65

File tree: 4 files changed, +98 -79 lines changed

tests/cephfs/lib/cephfs_common_lib.py (new file, +59)

@@ -0,0 +1,59 @@
+"""
+This is cephfs utilsV1 extension to include further common reusable methods for FS regression testing
+
+"""
+
+import datetime
+import time
+
+from tests.cephfs.cephfs_utilsV1 import FsUtils
+from utility.log import Log
+
+log = Log(__name__)
+
+
+class CephFSCommonUtils(FsUtils):
+    def __init__(self, ceph_cluster):
+        """
+        FS Utility V2 object
+        Args:
+            ceph_cluster (ceph.ceph.Ceph): ceph cluster
+        """
+        self.ceph_cluster = ceph_cluster
+        super().__init__(ceph_cluster)
+
+    def wait_for_healthy_ceph(self, client, wait_time):
+        """
+        This method will run ceph status and if its not HEALTH_OK, will wait for wait_time for it to be healthy
+        Args:
+        Required:
+        Client : Client object to run command
+        wait_time : Time to wait for HEALTH_OK, in seconds
+
+        Returns 0 if healthy, 1 if unhealthy even after wait_time
+        """
+        ceph_healthy = 0
+        end_time = datetime.datetime.now() + datetime.timedelta(seconds=wait_time)
+        while ceph_healthy == 0 and (datetime.datetime.now() < end_time):
+            try:
+                self.get_ceph_health_status(client)
+                ceph_healthy = 1
+            except Exception as ex:
+                log.info(ex)
+                out, rc = client.exec_command(sudo=True, cmd="ceph health detail")
+                if "experiencing slow operations in BlueStore" in str(out):
+                    log.info("Ignoring the known warning for Bluestore Slow ops")
+                    ceph_healthy = 1
+                else:
+                    log.info(
+                        f"Wait for sometime to check if Cluster health can be OK, current state : {ex}"
+                    )
+                    time.sleep(5)
+
+        if ceph_healthy == 0:
+            client.exec_command(
+                sudo=True,
+                cmd="ceph fs status;ceph -s;ceph health detail",
+            )
+            return 1
+        return 0
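
For orientation, a minimal usage sketch of the new helper (hypothetical, not part of this commit): ensure_cluster_healthy and its arguments are illustrative placeholders, but the call pattern mirrors the updated call sites in the test diffs below.

# Hypothetical sketch: wait_for_healthy_ceph returns 0 once the cluster reports
# HEALTH_OK (or only the known BlueStore slow-ops warning remains) and 1 if it
# is still unhealthy after wait_time_secs, so a truthy return means failure.
from tests.cephfs.lib.cephfs_common_lib import CephFSCommonUtils


def ensure_cluster_healthy(ceph_cluster, client, wait_time_secs=300):
    cephfs_common_utils = CephFSCommonUtils(ceph_cluster)
    if cephfs_common_utils.wait_for_healthy_ceph(client, wait_time_secs):
        raise AssertionError(
            f"Cluster health is not OK even after waiting for {wait_time_secs}secs"
        )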

tests/cephfs/snapshot_clone/cg_snap_system_test.py (+11 -19)

@@ -7,6 +7,7 @@
 from threading import Thread

 from tests.cephfs.cephfs_utilsV1 import FsUtils as FsUtilsv1
+from tests.cephfs.lib.cephfs_common_lib import CephFSCommonUtils
 from tests.cephfs.snapshot_clone.cephfs_cg_io import CG_snap_IO
 from tests.cephfs.snapshot_clone.cg_snap_utils import CG_Snap_Utils
 from utility.log import Log
@@ -29,12 +30,13 @@ def run(ceph_cluster, **kw):
     cancel,reset,include,exclude

     Clean Up:
-    1.
+    1. Umount mountpoints, Destroy subvolumes and subvolumegroups.

     """
     try:
         fs_util_v1 = FsUtilsv1(ceph_cluster)
         cg_snap_util = CG_Snap_Utils(ceph_cluster)
+        cephfs_common_utils = CephFSCommonUtils(ceph_cluster)
         cg_snap_io = CG_snap_IO(ceph_cluster)
         config = kw.get("config")
         clients = ceph_cluster.get_ceph_objects("client")
@@ -110,7 +112,11 @@ def run(ceph_cluster, **kw):
         )
         crash_status_before = fs_util_v1.get_crash_ls_new(client1)
         log.info(f"Crash status before Test: {crash_status_before}")
-        fs_util_v1.get_ceph_health_status(client1)
+        wait_time_secs = 300
+        if cephfs_common_utils.wait_for_healthy_ceph(client1, wait_time_secs):
+            assert (
+                False
+            ), f"Cluster health is not OK even after waiting for {wait_time_secs}secs"
         sv_mixed_list = []
         qs_cnt -= 1
         for i in range(0, qs_cnt):
@@ -161,25 +167,11 @@ def run(ceph_cluster, **kw):
         log.info("Clean Up in progess")
         crash_status_after = fs_util_v1.get_crash_ls_new(client1)
         log.info(f"Crash status after Test: {crash_status_after}")
-        health_wait = 300
-        end_time = datetime.datetime.now() + datetime.timedelta(seconds=health_wait)
-        health_ok = 0
-        wait_time = 0
-        while (datetime.datetime.now() < end_time) and health_ok == 0:
-            try:
-                fs_util_v1.get_ceph_health_status(client1)
-                health_ok = 1
-            except Exception as ex:
-                log.info(
-                    f"Wait for sometime to check if Cluster health can be OK, current state : {ex}"
-                )
-                time.sleep(10)
-                wait_time += 10
-        if health_ok == 0:
+        wait_time_secs = 300
+        if cephfs_common_utils.wait_for_healthy_ceph(client1, wait_time_secs):
             assert (
                 False
-            ), f"Cluster health is not OK even after waiting for {health_wait}secs"
-        log.info(f"Cluster Health is OK in {wait_time}secs")
+            ), f"Cluster health is not OK even after waiting for {wait_time_secs}secs"

         if len(crash_status_after) > len(crash_status_before):
             assert False, "Post test validation failed, please check crash report above"

tests/cephfs/snapshot_clone/cg_snap_test.py (+18 -47)

@@ -14,6 +14,7 @@
 from ceph.ceph import CommandFailed
 from tests.cephfs.cephfs_utilsV1 import FsUtils as FsUtilsv1
 from tests.cephfs.cephfs_volume_management import wait_for_process
+from tests.cephfs.lib.cephfs_common_lib import CephFSCommonUtils
 from tests.cephfs.snapshot_clone.cephfs_cg_io import CG_snap_IO
 from tests.cephfs.snapshot_clone.cg_snap_utils import CG_Snap_Utils
 from utility.log import Log
@@ -106,12 +107,14 @@ def run(ceph_cluster, **kw):
     Run QS IO validation tool on selected quiesce set
     1.Run multiple parallel quiesce calls to same set
     2.Create snapshots when quiesced, wait for sometime and release quiesce
-    Clean Up:
+
+    Clean Up: Umount mountpoints, destroy subvolumes and subvolumegroup

     """
     try:
         test_data = kw.get("test_data")
         fs_util_v1 = FsUtilsv1(ceph_cluster, test_data=test_data)
+        cephfs_common_utils = CephFSCommonUtils(ceph_cluster)
         erasure = (
             FsUtilsv1.get_custom_config_value(test_data, "erasure")
             if test_data
@@ -237,22 +240,17 @@ def run(ceph_cluster, **kw):
         crash_status_before = fs_util_v1.get_crash_ls_new(client1)

         log.info(f"Crash status before Test: {crash_status_before}")
-        end_time = datetime.datetime.now() + datetime.timedelta(seconds=300)
-        ceph_healthy = 0
-        while (datetime.datetime.now() < end_time) and (ceph_healthy == 0):
-            try:
-                fs_util_v1.get_ceph_health_status(client1)
-                ceph_healthy = 1
-            except Exception as ex:
-                log.info(ex)
-                time.sleep(5)
-        if ceph_healthy == 0:
-            assert False, "Ceph Cluster remains unhealthy even after 5mins"
+        wait_time_secs = 300
+        if cephfs_common_utils.wait_for_healthy_ceph(client1, wait_time_secs):
+            assert (
+                False
+            ), f"Cluster health is not OK even after waiting for {wait_time_secs}secs"
         cg_test_params = {
             "ceph_cluster": ceph_cluster,
             "fs_name": default_fs,
             "fs_util": fs_util_v1,
             "cg_snap_util": cg_snap_util,
+            "cephfs_common_utils": cephfs_common_utils,
             "cg_snap_io": cg_snap_io,
             "clients": qs_clients,
             "mgr_node": mgr_node,
@@ -305,11 +303,7 @@ def run(ceph_cluster, **kw):
     finally:
         log.info("Clean Up in progess")
         wait_time_secs = 300
-        if wait_for_healthy_ceph(client1, fs_util_v1, wait_time_secs) == 0:
-            client1.exec_command(
-                sudo=True,
-                cmd="ceph fs status;ceph status -s;ceph health detail",
-            )
+        if cephfs_common_utils.wait_for_healthy_ceph(client1, wait_time_secs):
             assert (
                 False
             ), f"Cluster health is not OK even after waiting for {wait_time_secs}secs"
@@ -2143,6 +2137,7 @@ def cg_snap_interop_1(cg_test_params):
     qs_clients = [client, client1]
     qs_sets = cg_test_params["qs_sets"]
     cg_snap_util = cg_test_params["cg_snap_util"]
+    cephfs_common_utils = cg_test_params["cephfs_common_utils"]
     cg_snap_io = cg_test_params["cg_snap_io"]
     fs_util = cg_test_params["fs_util"]

@@ -2164,13 +2159,13 @@ def cg_snap_interop_1(cg_test_params):
     cmd += '"'
     log.info("Adding 7 MDS to cluster")
     out, rc = client.exec_command(sudo=True, cmd=cmd)
-    if wait_for_healthy_ceph(client1, fs_util, 300) == 0:
+    if cephfs_common_utils.wait_for_healthy_ceph(client, 300):
         return 1
     client.exec_command(
         sudo=True,
         cmd=f"ceph fs set {fs_name} max_mds 4",
     )
-    if wait_for_healthy_ceph(client1, fs_util, 300) == 0:
+    if cephfs_common_utils.wait_for_healthy_ceph(client, 300):
         return 1
     test_fail = 0
     qs_set = random.choice(qs_sets)
@@ -2256,7 +2251,7 @@
         )
         time.sleep(10)

-        if wait_for_healthy_ceph(client1, fs_util, 300) == 0:
+        if cephfs_common_utils.wait_for_healthy_ceph(client, 300):
            log.error("Ceph cluster is not healthy after MDS failover")
            return 1
        log.info("Verify quiesce lifecycle can suceed after mds failover")
@@ -2302,7 +2297,7 @@

        log.info("MDS failover when quiesced: quiesce state is CANCELED")
        time.sleep(10)
-        if wait_for_healthy_ceph(client1, fs_util, 300) == 0:
+        if cephfs_common_utils.wait_for_healthy_ceph(client, 300):
            log.error("Ceph cluster is not healthy after MDS failover")
            return 1
        log.info("Verify quiesce lifecycle can suceed after mds failover")
@@ -2345,7 +2340,7 @@
        log.error(f"qs set {qs_id_val} not reached RELEASED state")

        time.sleep(10)
-        if wait_for_healthy_ceph(client1, fs_util, 300) == 0:
+        if cephfs_common_utils.wait_for_healthy_ceph(client, 300):
            log.error("Ceph cluster is not healthy after MDS failover")
            return 1
        log.info("Verify quiesce lifecycle can suceed after mds failover")
@@ -2389,13 +2384,9 @@
         sudo=True,
         cmd=f"ceph fs set {fs_name} max_mds 2",
     )
-    if wait_for_healthy_ceph(client1, fs_util, 300) == 0:
+    if cephfs_common_utils.wait_for_healthy_ceph(client1, 300):
         log.error("Ceph cluster is not healthy after max_mds set to 2")
         test_fail += 1
-    client.exec_command(
-        sudo=True,
-        cmd=f"ceph fs status {fs_name};ceph status -s;ceph health detail",
-    )
     if cg_test_io_status.value == 1:
         log.error(
             f"CG IO test exits with failure during quiesce test on qs_set-{qs_id_val}"
@@ -3071,23 +3062,3 @@ def wait_for_two_active_mds(client1, fs_name, max_wait_time=180, retry_interval=
         time.sleep(retry_interval)  # Retry after the specified interval

     return False
-
-
-def wait_for_healthy_ceph(client1, fs_util, wait_time_secs):
-    # Returns 1 if healthy, 0 if unhealthy
-    ceph_healthy = 0
-    end_time = datetime.datetime.now() + datetime.timedelta(seconds=wait_time_secs)
-    while ceph_healthy == 0 and (datetime.datetime.now() < end_time):
-        try:
-            fs_util.get_ceph_health_status(client1)
-            ceph_healthy = 1
-        except Exception as ex:
-            log.info(ex)
-            log.info(
-                f"Wait for sometime to check if Cluster health can be OK, current state : {ex}"
-            )
-            time.sleep(5)
-
-    if ceph_healthy == 0:
-        return 0
-    return 1
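
Note on the return convention when reading the call sites above: the removed module-level wait_for_healthy_ceph returned 1 when healthy and 0 when unhealthy, so its callers tested for == 0, whereas CephFSCommonUtils.wait_for_healthy_ceph returns 0 on HEALTH_OK and 1 on timeout, so the updated callers simply treat a truthy return value as failure.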

tests/cephfs/snapshot_clone/snap_schedule_retention_vol_subvol.py (+10 -13)

@@ -1,4 +1,3 @@
-import datetime
 import json
 import random
 import re
@@ -13,6 +12,7 @@
 from ceph.ceph import CommandFailed
 from tests.cephfs.cephfs_utilsV1 import FsUtils as FsUtilsv1
 from tests.cephfs.cephfs_volume_management import wait_for_process
+from tests.cephfs.lib.cephfs_common_lib import CephFSCommonUtils
 from tests.cephfs.snapshot_clone.cephfs_snap_utils import SnapUtils
 from utility.log import Log
 from utility.utils import get_ceph_version_from_cluster
@@ -26,6 +26,7 @@ def run(ceph_cluster, **kw):
     Verify .snap across kernel, fuse and nfs mounts(future-ready) for snaps created by schedule.

     Type - Functional
+
     Workflow1 - snap_sched_vol: Verify Snapshot schedule on volume.Validate snaphots in .snap across
     mount types - kernel,fuse,nfs
     Steps:
@@ -34,9 +35,11 @@ def run(ceph_cluster, **kw):
     3. Verify scheduled snapshtos are getting created.
     4. Validate snapshot schedule by checking if snapshots are created as per schedule.
     5. Verify snap list across all mount types - kernel,nfs,fuse
+
     Workflow2 - snap_sched_subvol: Verify Snapshot schedule on subvolume.Validate snaphots in .snap across
     mount types - kernel,fuse,nfs
     Steps : Repeat workflow1 steps on subvolume
+
     Workflow3 - snap_retention_vol: Verify Snapshot Retention on volume
     Steps:
     1. Create Snapshot schedule on ceph FS volume. Verify snapshot schedule created and active
@@ -86,6 +89,7 @@ def run(ceph_cluster, **kw):
     try:
         test_data = kw.get("test_data")
         fs_util_v1 = FsUtilsv1(ceph_cluster, test_data=test_data)
+        cephfs_common_utils = CephFSCommonUtils(ceph_cluster)
         erasure = (
             FsUtilsv1.get_custom_config_value(test_data, "erasure")
             if test_data
@@ -205,18 +209,11 @@ def run(ceph_cluster, **kw):
             log.info(
                 f"Verify Ceph Status is healthy before starting test {test_case_name}"
             )
-            ceph_healthy = 0
-            end_time = datetime.datetime.now() + datetime.timedelta(seconds=300)
-            while (datetime.datetime.now() < end_time) and (ceph_healthy == 0):
-                try:
-                    fs_util_v1.get_ceph_health_status(client1)
-                    ceph_healthy = 1
-                except Exception as ex:
-                    log.info(ex)
-                    log.info("Wait for few secs and recheck ceph status")
-                    time.sleep(5)
-            if ceph_healthy == 0:
-                assert False, "Ceph remains unhealthy even after wait for 300secs"
+            wait_time_secs = 300
+            if cephfs_common_utils.wait_for_healthy_ceph(client1, wait_time_secs):
+                assert (
+                    False
+                ), f"Cluster health is not OK even after waiting for {wait_time_secs}secs"
             cleanup_params = run_snap_test(snap_test_params)
             log.info(f"post_test_params:{cleanup_params}")
             snap_test_params["export_created"] = cleanup_params["export_created"]
