test_smallfile_workload[4-5000-22-5-33-CephBlockPool] is failing with error smallfile-client-1-benchmark-bbfd9296-sg7bb Failed to run - (Failed) #10798

Open
pintojoy opened this issue Nov 4, 2024 · 0 comments
pintojoy commented Nov 4, 2024

self = <test_small_file_workload.TestSmallFileWorkload object at 0x7f243c4f0910>
file_size = 4, files = 5000, threads = 22, samples = 5, clients = 33
interface = 'CephBlockPool'

@pytest.mark.parametrize(
    argnames=["file_size", "files", "threads", "samples", "clients", "interface"],
    argvalues=[
        pytest.param([4, 5000, 22, 5, 33, constants.CEPHBLOCKPOOL]),
        pytest.param([16, 5000, 8, 5, 21, constants.CEPHBLOCKPOOL]),
        pytest.param([4, 2500, 4, 5, 9, constants.CEPHFILESYSTEM]),
        pytest.param([16, 1500, 4, 5, 9, constants.CEPHFILESYSTEM]),
    ],
)
@pytest.mark.polarion_id("OCS-1295")
def test_smallfile_workload(
    self, file_size, files, threads, samples, clients, interface
):
"""
Run SmallFile Workload

Args:
    file_size (int) : the size of the file to be used
    files (int) : number of files to use
    threads (int) : number of threads to be use in the test
    samples (int) : how meany samples to run for each test
    interface (str) : the volume type (rbd / cephfs)

"""
if config.PERF.get("deploy_internal_es"):
    self.es = ElasticSearch()
else:
    if config.PERF.get("internal_es_server") == "":
        self.es = None
        return
    else:
        url = (
            f"{config.PERF.get('internal_es_scheme')}://{config.PERF.get('internal_es_server')}"
            f":{config.PERF.get('internal_es_port')}",
        )
        self.es = {
            "server": config.PERF.get("internal_es_server"),
            "port": config.PERF.get("internal_es_port"),
            "scheme": config.PERF.get("internal_es_scheme"),
            "url": url,
        }
        # verify that the connection to the elasticsearch server is OK
        if not super(TestSmallFileWorkload, self).es_connect():
            self.es = None
            return

# deploy the benchmark-operator
self.deploy_benchmark_operator()

# verify that there is an elasticsearch server for the benchmark
if not self.es:
    log.error("This test must have an Elasticsearch server")
    return False

# Getting the full path for the test logs
self.full_log_path = get_full_test_logs_path(cname=self)
self.results_path = get_full_test_logs_path(cname=self)
self.full_log_path += (
    f"-{file_size}-{files}-{threads}-{samples}-{clients}-{interface}"
)
log.info(f"Logs file path name is : {self.full_log_path}")

# Loading the main template yaml file for the benchmark
log.info("Create resource file for small_files workload")
self.crd_data = templating.load_yaml(constants.SMALLFILE_BENCHMARK_YAML)

# Saving the Original elastic-search IP and PORT - if defined in yaml
self.es_info_backup(self.es)

self.set_storageclass(interface=interface)

# Setting the data set to 40% of the total storage capacity
self.setting_storage_usage(file_size, files, threads, samples, clients)

self.get_env_info()

if not self.run():

tests/cross_functional/performance/io_workload/test_small_file_workload.py:625:

tests/cross_functional/performance/io_workload/test_small_file_workload.py:518: in run
self.wait_for_wl_to_finish(sleep=30)

self = <test_small_file_workload.TestSmallFileWorkload object at 0x7f243c4f0910>
timeout = 18000, sleep = 30

def wait_for_wl_to_finish(self, timeout=18000, sleep=300):
"""
Waiting until the workload is finished and get the test log

Args:
    timeout (int): time in second to wait until the benchmark start
    sleep (int): Sleep interval seconds

Raise:
    exception for too much restarts of the test.
    ResourceWrongStatusException : test Failed / Error
    TimeoutExpiredError : test did not completed on time.

"""
log.info(f"Waiting for {self.client_pod_name} to complete")

Finished = 0
restarts = 0
total_time = timeout
while not Finished and total_time > 0:
    results = run_oc_command(
        "get pod --no-headers -o custom-columns=:metadata.name,:status.phase",
        namespace=benchmark_operator.BMO_NAME,
    )
    (fname, status) = ["", ""]
    for name in results:
        # looking for the pod which runs the benchmark (not the IO)
        # this pod contains `client` in its name, and there is only one
        # such pod; the other pods have `server` in their names.
        (fname, status) = name.split()
        if re.search("client", fname):
            break
        else:
            (fname, status) = ["", ""]

    if fname == "":  # there is no `client` pod !
        err_msg = f"{self.client_pod} Failed to run !!!"
        log.error(err_msg)
        raise Exception(err_msg)

    if not fname == self.client_pod:
        # The client pod name is different from the previous check, so it was restarted
        log.info(
            f"The pod {self.client_pod} was restarted. The new client pod is {fname}"
        )
        )
        self.client_pod = fname
        restarts += 1
        # in case of restarting the benchmark, reset the timeout as well
        total_time = timeout

    if restarts > 3:  # we are tolerating only 3 restarts
        err_msg = f"Too much restarts of the benchmark ({restarts})"
        log.error(err_msg)
        raise Exception(err_msg)

    if status == "Succeeded":
        # Getting the end time of the benchmark - for reporting.
        self.end_time = self.get_time()
        self.test_logs = self.pod_obj.exec_oc_cmd(
            f"logs {self.client_pod}", out_yaml_format=False
        )
        log.info(f"{self.client_pod} completed successfully")
        Finished = 1
    elif (
        status != constants.STATUS_RUNNING
        and status != constants.STATUS_PENDING
    ):
        # if the benchmark pod is not in Running state (and not Completed/Pending),
        # no need to wait for timeout.
        # Note: the pod can be in pending state in case of restart.
        err_msg = f"{self.client_pod} Failed to run - ({status})"
        log.error(err_msg)

        raise exceptions.ResourceWrongStatusException(
            self.client_pod,
            describe_out=err_msg,
            column="Status",
            expected="Succeeded",
            got=status,
        )

E ocs_ci.ocs.exceptions.ResourceWrongStatusException: Resource smallfile-client-1-benchmark-bbfd9296-sg7bb in column Status was in state Failed but expected Succeeded describe output: smallfile-client-1-benchmark-bbfd9296-sg7bb Failed to run - (Failed)
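The exception above only records that the client pod reached the Failed phase; the pod events and logs that explain why are not captured in this traceback. Below is a minimal triage sketch (not part of ocs-ci) for collecting them from the affected cluster before the pods are cleaned up. The namespace value is an assumption for where the benchmark-operator was deployed (benchmark_operator.BMO_NAME in the test), and the pod name is copied from the error message; adjust both as needed.

import subprocess

# Assumed values: NAMESPACE should match the namespace the benchmark-operator was
# deployed to; POD is the failed client pod reported in the exception above.
NAMESPACE = "benchmark-operator"
POD = "smallfile-client-1-benchmark-bbfd9296-sg7bb"


def collect(cmd):
    # Run an oc command and return whatever output it produced, so collection
    # keeps going even if the pod has already been garbage collected.
    result = subprocess.run(cmd, capture_output=True, text=True)
    return result.stdout or result.stderr


for cmd in (
    ["oc", "get", "pod", POD, "-n", NAMESPACE, "-o", "yaml"],
    ["oc", "describe", "pod", POD, "-n", NAMESPACE],
    ["oc", "logs", POD, "-n", NAMESPACE],
):
    print(f"===== {' '.join(cmd)} =====")
    print(collect(cmd))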

pintojoy self-assigned this Nov 4, 2024