Validate the resiliency of the ODF + OpenShift Virtualization system in case of Worker node failure #11552
base: master
@@ -0,0 +1,152 @@
import logging
import random

import pytest

from ocs_ci.framework import config
from ocs_ci.framework.pytest_customization.marks import (
    magenta_squad,
    workloads,
    ignore_leftovers,
)
from ocs_ci.framework.testlib import E2ETest
from ocs_ci.ocs import constants, node
from ocs_ci.ocs.resources import pod
from ocs_ci.ocs.resources.pod import wait_for_pods_to_be_running
from ocs_ci.utility.utils import TimeoutSampler, ceph_health_check
from ocs_ci.ocs.exceptions import ResourceWrongStatusException

log = logging.getLogger(__name__)


@magenta_squad
@workloads
@ignore_leftovers
Review comment: What is the leftover here?
@pytest.mark.polarion_id("OCS-")
Review comment: Please create a test case in Polarion and add the ID here.
class TestVmWorkerNodeResiliency(E2ETest):
Review comment: You are doing a single worker node failure; please rephrase it accordingly.
""" | ||
Test case for ensuring that both OpenShift Virtualization | ||
and ODF can recover from a worker node failure that hosts critical pods | ||
(such as OpenShift Virtualization VMs, OSD pods, or mon pods) | ||
""" | ||
|
||
short_nw_fail_time = 300 | ||
Review comment: You are stopping and starting the node. This constant can be removed.

    def test_vm_worker_node_failure(
        self, setup_cnv, nodes, project_factory, multi_cnv_workload
    ):
        """
        Test case to ensure that both OpenShift Virtualization and ODF
        can recover from a worker node failure that
        hosts critical pods (such as OpenShift Virtualization VMs,
        OSD pods, or mon pods)
        """
Review comment: Add test steps.
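One possible way to spell out the steps in the docstring, a sketch only, summarizing what the body below already does plus the checks requested in this review:

        Test Steps:
            1. Deploy CNV VMs on both the default and aggregate storage classes.
            2. Confirm all ODF and CNV pods are running and Ceph is healthy.
            3. Pick a worker node that runs OSD pods (and hosts a VM) and stop/start it.
            4. Wait for the node to return to Ready and for all pods to recover.
            5. Compare VM states before and after, verify VMs from the failed node
               were rescheduled, and re-check Ceph health and data integrity.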

        odf_namespace = constants.OPENSHIFT_STORAGE_NAMESPACE
        cnv_namespace = constants.CNV_NAMESPACE

        proj_obj = project_factory()
        vm_objs_def, vm_objs_aggr, sc_objs_def, sc_objs_aggr = multi_cnv_workload(
            namespace=proj_obj.namespace
        )
        vm_list = vm_objs_def + vm_objs_aggr

        log.info(f"Total VMs to process: {len(vm_list)}")

        initial_vm_states = {
            vm_obj.name: [vm_obj.printableStatus(), vm_obj.get_vmi_instance().node()]
            for vm_obj in vm_objs_def + vm_objs_aggr
        }
        log.info(f"Initial VM states: {initial_vm_states}")

        sample = TimeoutSampler(
            timeout=600,
            sleep=10,
            func=wait_for_pods_to_be_running,
            namespace=odf_namespace,
        )
        assert sample.wait_for_func_status(
            result=True
        ), f"Not all pods are running in {odf_namespace} before node failure"

        sample = TimeoutSampler(
            timeout=600,
            sleep=10,
            func=wait_for_pods_to_be_running,
            namespace=cnv_namespace,
        )
        assert sample.wait_for_func_status(
            result=True
        ), f"Not all pods are running in {cnv_namespace} before node failure"

        ceph_health_check(tries=80)
Review comment on lines +62 to +82: As discussed, this will be taken care of at the start of the test run by the framework. It can be removed.

        worker_nodes = node.get_osd_running_nodes()
        node_name = random.sample(worker_nodes, 1)
        node_name = node_name[0]
Review comment on lines +84 to +86: How are you making sure that the randomly selected node has a VM running on it?
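One way to address this would be to restrict the random choice to OSD nodes that also host at least one of the test VMs, reusing the mapping already collected in initial_vm_states. A rough sketch (it assumes get_vmi_instance().node() returns plain node names, as the existing state-collection code suggests):

        # Sketch: prefer an OSD node that also hosts at least one test VM
        vm_nodes = {states[1] for states in initial_vm_states.values()}
        candidate_nodes = list(vm_nodes.intersection(worker_nodes)) or worker_nodes
        node_name = random.choice(candidate_nodes)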

        log.info(f"Attempting to restart node: {node_name}")
        node_obj = node.get_node_objs([node_name])
        if config.ENV_DATA["platform"].lower() == constants.GCP_PLATFORM:
            nodes.restart_nodes_by_stop_and_start(node_obj, force=False)
Review comment on lines +90 to +91: GCP is already handled; wouldn't the restart run according to the platform anyway?
        else:
            nodes.restart_nodes_by_stop_and_start(node_obj)

        log.info(f"Waiting for node {node_name} to return to Ready state")
        try:
            node.wait_for_nodes_status(
                node_names=[node_name],
                status=constants.NODE_READY,
            )
            log.info("Verifying all pods are running after node recovery")
            if not pod.wait_for_pods_to_be_running(timeout=720):
                raise ResourceWrongStatusException(
                    "Not all pods returned to running state after node recovery"
                )
        except ResourceWrongStatusException as e:
            log.error(
                f"Pods did not return to running state, attempting node restart: {e}"
            )
Review comment: Why do you need to restart the node again?
            nodes.restart_nodes(node.get_node_objs([node_name]))
Review comment: Don't we need to check the pod status again after the restart?
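A minimal sketch of that re-check, reusing only helpers already imported in this file:

            # Sketch: after the second restart, wait for the node and re-verify the pods
            node.wait_for_nodes_status(
                node_names=[node_name],
                status=constants.NODE_READY,
            )
            assert pod.wait_for_pods_to_be_running(
                timeout=720
            ), "Pods did not reach Running state even after restarting the node again"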

        ceph_health_check(tries=80)

        log.info("Performing post-failure health checks for ODF and CNV namespaces")
        sample = TimeoutSampler(
            timeout=600,
            sleep=10,
            func=wait_for_pods_to_be_running,
            namespace=odf_namespace,
        )
        assert sample.wait_for_func_status(
            result=True
        ), f"Not all pods are running in {odf_namespace} after node failure and recovery"

        sample = TimeoutSampler(
            timeout=600,
            sleep=10,
            func=wait_for_pods_to_be_running,
            namespace=cnv_namespace,
        )
        assert sample.wait_for_func_status(
            result=True
        ), f"Not all pods are running in {cnv_namespace} after node failure and recovery"

        final_vm_states = {
            vm_obj.name: [vm_obj.printableStatus(), vm_obj.get_vmi_instance().node()]
            for vm_obj in vm_objs_def + vm_objs_aggr
        }
        log.info(f"Final VM states: {final_vm_states}")
Review comment: Please add code to check data integrity after recovery.
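A possible shape for the data-integrity check, a sketch only: it assumes run_dd_io and cal_md5sum_vm from ocs_ci.helpers.cnv_helpers are available (as in other CNV tests) and imported at module level, and that a checksum baseline (md5_before) is collected before the node is stopped; the file path is illustrative:

        # Sketch (assumptions): run_dd_io / cal_md5sum_vm come from
        # ocs_ci.helpers.cnv_helpers; md5_before would be built right after the VMs
        # are created and before the node failure, e.g.:
        #     md5_before = {
        #         vm_obj.name: run_dd_io(vm_obj=vm_obj, file_path="/file.txt", verify=True)
        #         for vm_obj in vm_list
        #     }
        for vm_obj in vm_list:
            md5_after = cal_md5sum_vm(vm_obj=vm_obj, file_path="/file.txt")
            assert md5_before[vm_obj.name] == md5_after, (
                f"Data integrity check failed for VM {vm_obj.name} after node recovery"
            )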
        for vm_name in initial_vm_states:
            assert initial_vm_states[vm_name][0] == final_vm_states[vm_name][0], (
                f"VM {vm_name}: State mismatch. Initial: {initial_vm_states[vm_name][0]}, "
                f"Final: {final_vm_states[vm_name][0]}"
            )
            if initial_vm_states[vm_name][1] == node_name:
                assert initial_vm_states[vm_name][1] != final_vm_states[vm_name][1], (
                    f"VM {vm_name}: Rescheduling failed. VM was initially scheduled"
                    f" on node {node_name} and is still on the same node"
                )
Review comment: Also write some IO after node recovery.
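A short sketch for the post-recovery IO, under the same run_dd_io assumption as above (the file path is illustrative):

        # Sketch: write fresh data into each VM after recovery and verify the write
        for vm_obj in vm_list:
            run_dd_io(vm_obj=vm_obj, file_path="/post_recovery_io.txt", verify=True)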
        ceph_health_check(tries=80)
Review comment: You are already checking this at line 112; why check it again here?
Review comment: Teardown code is missing, please add it.
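A possible minimal teardown, a sketch using only helpers already imported in this file; it assumes wait_for_nodes_status can be called without node_names to cover all cluster nodes:

    @pytest.fixture(autouse=True)
    def teardown(self, request):
        """
        Best-effort cleanup: make sure all nodes are back in Ready state
        and the Ceph cluster is healthy after the test.
        """

        def finalizer():
            # Assumption: calling wait_for_nodes_status without node_names
            # checks all cluster nodes (default behavior of ocs_ci.ocs.node)
            node.wait_for_nodes_status(status=constants.NODE_READY)
            ceph_health_check(tries=80)

        request.addfinalizer(finalizer)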