Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Update strategy for ODF upgrade testing #10840

Draft
wants to merge 4 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions ocs_ci/framework/conf/default_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,10 @@ UPGRADE:
ocp_upgrade_path: "quay.io/openshift-release-dev/ocp-release"
ocp_arch: "x86_64"
upgrade_logging_channel: "4.17"
# None value means that value in Rook operator config is used.
# Otherwise it is changed to the provided value before ODF upgrade.
csi_rbd_plugin_update_strategy_max_unavailable: null
csi_cephfs_plugin_update_strategy_max_unavailable: null

# This section stores secret and uploaded from home dir or s3
# for entry into this section, please email ecosystem team
Expand Down
95 changes: 94 additions & 1 deletion ocs_ci/ocs/ocs_upgrade.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from ocs_ci.ocs.node import get_nodes
from ocs_ci.ocs.resources.catalog_source import CatalogSource, disable_specific_source
from ocs_ci.ocs.resources.csv import CSV, check_all_csvs_are_succeeded
from ocs_ci.ocs.resources.daemonset import DaemonSet
from ocs_ci.ocs.resources.install_plan import wait_for_install_plan_and_approve
from ocs_ci.ocs.resources.pod import get_noobaa_pods, verify_pods_upgraded
from ocs_ci.ocs.resources.packagemanifest import (
Expand Down Expand Up @@ -567,6 +568,7 @@ def set_upgrade_images(self):

def run_ocs_upgrade(
operation=None,
upgrade_stats=None,
*operation_args,
**operation_kwargs,
):
Expand All @@ -575,6 +577,8 @@ def run_ocs_upgrade(

Args:
operation: (function): Function to run
upgrade_stats: (dict): Dictionary in which statistics gathered during
the upgrade can be stored
operation_args: (iterable): Function's arguments
operation_kwargs: (map): Function's keyword arguments

Expand All @@ -598,6 +602,21 @@ def run_ocs_upgrade(
f"{upgrade_ocs.version_before_upgrade}"
)

# Update values CSI_RBD_PLUGIN_UPDATE_STRATEGY_MAX_UNAVAILABLE and CSI_CEPHFS_PLUGIN_UPDATE_STRATEGY_MAX_UNAVAILABLE
# in rook-ceph-operator-config configmap
set_update_strategy()
if upgrade_stats:
cephfs_daemonset = DaemonSet(
resource_name="csi-cephfsplugin",
namespace=config.ENV_DATA["cluster_namespace"],
)
rbd_daemonset = DaemonSet(
resource_name="csi-rbdplugin",
namespace=config.ENV_DATA["cluster_namespace"],
)
upgrade_stats["odf_upgrade"]["rbd_max_unavailable"] = 0
upgrade_stats["odf_upgrade"]["cephfs_max_unavailable"] = 0

# create external cluster object
if config.DEPLOYMENT["external_mode"]:
host, user, password, ssh_key = get_external_cluster_client()
Expand Down Expand Up @@ -638,6 +657,7 @@ def run_ocs_upgrade(
upgrade_ocs.set_upgrade_images()
live_deployment = config.DEPLOYMENT["live_deployment"]
disable_addon = config.DEPLOYMENT.get("ibmcloud_disable_addon")
start_time = time.time()
if (
config.ENV_DATA["platform"] == constants.IBMCLOUD_PLATFORM
and live_deployment
Expand Down Expand Up @@ -721,12 +741,44 @@ def run_ocs_upgrade(
channel=channel,
csv_name_pre_upgrade=csv_name_pre_upgrade,
):
if upgrade_stats:
rbd_daemonset_status = rbd_daemonset.get_status()
cephfs_daemonset_status = cephfs_daemonset.get_status()
rbd_unavailable = (
rbd_daemonset_status["desiredNumberScheduled"]
- rbd_daemonset_status["numberReady"]
)
cephfs_unavailable = (
cephfs_daemonset_status["desiredNumberScheduled"]
- cephfs_daemonset_status["numberReady"]
)
if (
rbd_unavailable
> upgrade_stats["odf_upgrade"]["rbd_max_unavailable"]
):
upgrade_stats["odf_upgrade"][
"rbd_max_unavailable"
] = rbd_unavailable
if (
cephfs_unavailable
> upgrade_stats["odf_upgrade"]["cephfs_max_unavailable"]
):
upgrade_stats["odf_upgrade"][
"cephfs_max_unavailable"
] = cephfs_unavailable
log.debug(f"rbd daemonset status: {rbd_daemonset_status}")
log.debug(f"cephfs daemonset status: {cephfs_daemonset_status}")
try:
if sample:
log.info("Upgrade success!")
break
except TimeoutException:
raise TimeoutException("No new CSV found after upgrade!")
stop_time = time.time()
time_taken = stop_time - start_time
log.info(f"Upgrade took {time_taken} seconds to complete")
if upgrade_stats:
upgrade_stats["odf_upgrade"]["upgrade_time"] = time_taken
old_image = upgrade_ocs.get_images_post_upgrade(
channel, pre_upgrade_images, upgrade_version
)
Expand Down Expand Up @@ -813,7 +865,7 @@ def ocs_odf_upgrade_ui():
Pass proper versions and upgrade_ui.yaml while running this function for validation to pass

"""

set_update_strategy()
login_ui()
val_obj = ValidationUI()
pagenav_obj = ValidationUI()
Expand Down Expand Up @@ -858,3 +910,44 @@ def ocs_odf_upgrade_ui():
val_obj.take_screenshot()
pagenav_obj.odf_overview_ui()
pagenav_obj.odf_storagesystems_ui()


def set_update_strategy(rbd_max_unavailable=None, cephfs_max_unavailable=None):
    """
    Update rook-ceph-operator-config configmap with parameters:
    CSI_RBD_PLUGIN_UPDATE_STRATEGY_MAX_UNAVAILABLE and
    CSI_CEPHFS_PLUGIN_UPDATE_STRATEGY_MAX_UNAVAILABLE.
    If values are not provided as parameters of this function then values are
    taken from ocs-ci config. If the values are not set in ocs-ci config or
    function parameters then the configmap is left unchanged.

    Args:
        rbd_max_unavailable (int, str): Value of
            CSI_RBD_PLUGIN_UPDATE_STRATEGY_MAX_UNAVAILABLE to be updated
            in rook-ceph-operator-config configmap.
        cephfs_max_unavailable (int, str): Value of
            CSI_CEPHFS_PLUGIN_UPDATE_STRATEGY_MAX_UNAVAILABLE to be updated
            in rook-ceph-operator-config configmap.

    """
    rbd_max = rbd_max_unavailable or config.ENV_DATA.get(
        "csi_rbd_plugin_update_strategy_max_unavailable"
    )
    cephfs_max = cephfs_max_unavailable or config.ENV_DATA.get(
        "csi_cephfs_plugin_update_strategy_max_unavailable"
    )
    # This is a module-level function, so the namespace comes from ocs-ci
    # config, not from any instance attribute.
    namespace = config.ENV_DATA["cluster_namespace"]
    if rbd_max:
        # Literal braces in an f-string must be doubled ({{ }}); the outer
        # single quotes keep the JSON payload as one shell argument.
        config_map_patch = (
            f'\'{{"data": {{"CSI_RBD_PLUGIN_UPDATE_STRATEGY_MAX_UNAVAILABLE": "{rbd_max}"}}}}\''
        )
        exec_cmd(
            f"oc patch configmap -n {namespace} "
            f"{constants.ROOK_OPERATOR_CONFIGMAP} -p {config_map_patch}"
        )
        log.info(
            f"CSI_RBD_PLUGIN_UPDATE_STRATEGY_MAX_UNAVAILABLE is set to {rbd_max}"
        )
    if cephfs_max:
        config_map_patch = (
            f'\'{{"data": {{"CSI_CEPHFS_PLUGIN_UPDATE_STRATEGY_MAX_UNAVAILABLE": "{cephfs_max}"}}}}\''
        )
        exec_cmd(
            f"oc patch configmap -n {namespace} "
            f"{constants.ROOK_OPERATOR_CONFIGMAP} -p {config_map_patch}"
        )
        log.info(
            f"CSI_CEPHFS_PLUGIN_UPDATE_STRATEGY_MAX_UNAVAILABLE is set to {cephfs_max}"
        )
35 changes: 35 additions & 0 deletions ocs_ci/ocs/resources/daemonset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
"""
DaemonSet related functionalities
"""
import logging

from ocs_ci.ocs.ocp import OCP

log = logging.getLogger(__name__)


class DaemonSet(OCP):
    """
    Representation of a DaemonSet resource, providing convenience
    accessors for commonly inspected sections of the resource data.
    """

    def get_status(self):
        """
        Get information related to resource status.

        Returns:
            dict: DaemonSet resource status

        """
        return self.get()["status"]

    def get_update_strategy(self):
        """
        Get information related to update strategy.

        Returns:
            dict: DaemonSet resource update strategy

        """
        return self.get()["spec"]["updateStrategy"]
11 changes: 11 additions & 0 deletions tests/functional/upgrade/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -654,3 +654,14 @@ def fs_md5(fs_pod):
)
log.info(f"Ceph FS md5: {md5}")
return md5


@pytest.fixture(scope="session")
def upgrade_stats():
    """
    Provide a session-scoped dictionary for statistics gathered during
    a performed upgrade.

    Returns:
        dict: Statistics gathered during the performed upgrade, keyed by
            upgrade type ("odf_upgrade" and "ocp_upgrade").

    """
    return {upgrade_type: {} for upgrade_type in ("odf_upgrade", "ocp_upgrade")}
39 changes: 39 additions & 0 deletions tests/functional/upgrade/test_configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,14 @@

import pytest

from ocs_ci.framework import config
from ocs_ci.framework.pytest_customization.marks import (
pre_upgrade,
post_upgrade,
brown_squad,
)
from ocs_ci.ocs import constants
from ocs_ci.ocs.ocp import OCP
from ocs_ci.ocs.resources import pod

log = logging.getLogger(__name__)
Expand Down Expand Up @@ -58,3 +61,39 @@ def test_crush_map_unchanged(pre_upgrade_crush_map):
upgrade.
"""
pre_upgrade_crush_map == get_crush_map()


@post_upgrade
@pytest.mark.polarion_id()
def test_max_unavaialable_rbd(upgrade_stats):
    """
    Test that the number of unavailable RBD daemonset plugin pods during ODF
    upgrade corresponds to the value set in rook-ceph-operator-config configmap.

    Args:
        upgrade_stats (dict): Statistics gathered during the performed upgrade

    """
    configmap = OCP(
        kind=constants.CONFIGMAP,
        namespace=config.ENV_DATA["cluster_namespace"],
        resource_name=constants.ROOK_OPERATOR_CONFIGMAP,
    ).get()
    config_value = configmap.get("data").get(
        "CSI_RBD_PLUGIN_UPDATE_STRATEGY_MAX_UNAVAILABLE"
    )
    assert config_value is not None, (
        "CSI_RBD_PLUGIN_UPDATE_STRATEGY_MAX_UNAVAILABLE is not set in "
        f"{constants.ROOK_OPERATOR_CONFIGMAP} configmap"
    )
    # The configmap stores the value as a string while the gathered statistic
    # is an int; the stats key must match the one stored by run_ocs_upgrade
    # ("rbd_max_unavailable").
    assert int(config_value) == upgrade_stats["odf_upgrade"]["rbd_max_unavailable"]


@post_upgrade
@pytest.mark.polarion_id()
def test_max_unavaialable_cephfs(upgrade_stats):
    """
    Test that the number of unavailable CephFS daemonset plugin pods during ODF
    upgrade corresponds to the value set in rook-ceph-operator-config configmap.

    Args:
        upgrade_stats (dict): Statistics gathered during the performed upgrade

    """
    configmap = OCP(
        kind=constants.CONFIGMAP,
        namespace=config.ENV_DATA["cluster_namespace"],
        resource_name=constants.ROOK_OPERATOR_CONFIGMAP,
    ).get()
    config_value = configmap.get("data").get(
        "CSI_CEPHFS_PLUGIN_UPDATE_STRATEGY_MAX_UNAVAILABLE"
    )
    assert config_value is not None, (
        "CSI_CEPHFS_PLUGIN_UPDATE_STRATEGY_MAX_UNAVAILABLE is not set in "
        f"{constants.ROOK_OPERATOR_CONFIGMAP} configmap"
    )
    # The configmap stores the value as a string while the gathered statistic
    # is an int; the stats key must match the one stored by run_ocs_upgrade
    # ("cephfs_max_unavailable").
    assert int(config_value) == upgrade_stats["odf_upgrade"]["cephfs_max_unavailable"]
20 changes: 12 additions & 8 deletions tests/functional/upgrade/test_upgrade.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,46 +28,50 @@ def finalizer():

@purple_squad
@pytest.mark.polarion_id("OCS-1579")
def test_worker_node_abrupt_shutdown(teardown):
def test_worker_node_abrupt_shutdown(teardown, upgrade_stats):
"""
Test OCS upgrade with disruption of shutting down worker node,
for 5.5 minutes

"""
log.info("Starting disruptive function: test_worker_node_abrupt_shutdown")
run_ocs_upgrade(operation=worker_node_shutdown, abrupt=True)
run_ocs_upgrade(
operation=worker_node_shutdown, abrupt=True, upgrade_stats=upgrade_stats
)


@purple_squad
@pytest.mark.polarion_id("OCS-1575")
def test_worker_node_permanent_shutdown(teardown):
def test_worker_node_permanent_shutdown(teardown, upgrade_stats):
"""
Test OCS upgrade with disruption of shutting down worker node

"""
log.info("Starting disruptive function: test_worker_node_permanent_shutdown")
run_ocs_upgrade(operation=worker_node_shutdown, abrupt=False)
run_ocs_upgrade(
operation=worker_node_shutdown, abrupt=False, upgrade_stats=upgrade_stats
)


@purple_squad
@pytest.mark.polarion_id("OCS-1558")
def test_osd_reboot(teardown):
def test_osd_reboot(teardown, upgrade_stats):
"""
OCS Upgrade with node reboot: with 1 OSD going down and back up while upgrade is running

"""

log.info("Starting disruptive function: test_osd_reboot")
run_ocs_upgrade(operation=osd_node_reboot)
run_ocs_upgrade(operation=osd_node_reboot, upgrade_stats=upgrade_stats)


@purple_squad
@ocs_upgrade
@polarion_id(get_polarion_id(upgrade=True))
def test_upgrade():
def test_upgrade(upgrade_stats):
"""
Tests upgrade procedure of OCS cluster

"""

run_ocs_upgrade()
run_ocs_upgrade(upgrade_stats=upgrade_stats)
Loading