From 52398d3c68a9b7b1e3f57a102f33fd39dcbfa6c0 Mon Sep 17 00:00:00 2001
From: nick863 <30440255+nick863@users.noreply.github.com>
Date: Sun, 14 Jul 2024 19:30:23 -0700
Subject: [PATCH] Add local e2e test gate (#3534)

# Description

Mark the e2e tests that can be run without Azure infrastructure as local, and
add a gate which ensures that the Azure packages are not installed when they
run. See work item 3336565.

**Note:** Checked that we have 54 e2e tests: 20 are azure and 34 are local
(the total matches the count before the split).

# All Promptflow Contribution checklist:
- [x] **The pull request does not introduce [breaking changes].**
- [x] **CHANGELOG is updated for new features, bug fixes or other significant changes.**
- [x] **I have read the [contribution guidelines](../CONTRIBUTING.md).**
- [x] **Create an issue and link to the pull request to get dedicated review from promptflow team. Learn more: [suggested workflow](../CONTRIBUTING.md#suggested-workflow).**

## General Guidelines and Best Practices
- [x] Title of the pull request is clear and informative.
- [x] There are a small number of commits, each of which has an informative message. This means that previously merged commits do not appear in the history of the PR. For more information on cleaning up the commits in your PR, [see this page](https://github.com/Azure/azure-powershell/blob/master/documentation/development-docs/cleaning-up-commits.md).

### Testing Guidelines
- [x] Pull request includes test coverage for the included changes.
---
 ...ml => promptflow-evals-e2e-test-azure.yml} | 10 +-
 .../promptflow-evals-e2e-test-local.yml       | 98 +++++++++++++++++++
 .gitignore                                    |  1 +
 scripts/code_qa/assert_local_install.py       | 21 ++++
 .../promptflow/evals/evaluate/_eval_run.py    | 17 +++-
 src/promptflow-evals/tests/evals/conftest.py  | 53 +++++++---
 .../evals/e2etests/test_adv_simulator.py      |  2 +-
 .../evals/e2etests/test_builtin_evaluators.py |  5 +-
 .../tests/evals/e2etests/test_evaluate.py     |  7 +-
 .../evals/e2etests/test_metrics_upload.py     |  8 +-
 10 files changed, 197 insertions(+), 25 deletions(-)
 rename .github/workflows/{promptflow-evals-e2e-test.yml => promptflow-evals-e2e-test-azure.yml} (90%)
 create mode 100644 .github/workflows/promptflow-evals-e2e-test-local.yml
 create mode 100644 scripts/code_qa/assert_local_install.py

diff --git a/.github/workflows/promptflow-evals-e2e-test.yml b/.github/workflows/promptflow-evals-e2e-test-azure.yml
similarity index 90%
rename from .github/workflows/promptflow-evals-e2e-test.yml
rename to .github/workflows/promptflow-evals-e2e-test-azure.yml
index 606d827723f..75cb2c5d422 100644
--- a/.github/workflows/promptflow-evals-e2e-test.yml
+++ b/.github/workflows/promptflow-evals-e2e-test-azure.yml
@@ -1,4 +1,4 @@
-name: promptflow-evals-e2e-test
+name: promptflow-evals-e2e-test-azure
 
 on:
   schedule:
@@ -6,7 +6,7 @@ on:
   pull_request:
     paths:
       - src/promptflow-evals/**
-      - .github/workflows/promptflow-evals-e2e-test.yml
+      - .github/workflows/promptflow-evals-e2e-test-azure.yml
   workflow_dispatch:
 
 env:
@@ -83,10 +83,10 @@ jobs:
           creds: ${{ secrets.PF_EVALS_SP_CREDENTIALS }}
           enable-AzPSSession: true
       - name: run e2e tests
-        id: run_all_e2e_tests
+        id: run_e2e_tests_azure
         run: |
-          poetry run pytest -m e2etest --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml
-          poetry run python ../../scripts/code_qa/report_to_app_insights.py --activity all_e2e_tests_run_times --junit-xml test-results.xml --git-hub-action-run-id ${{ github.run_id }} --git-hub-workflow ${{ github.workflow }} --git-hub-action ${{ github.action }} --git-branch ${{ github.ref }}
+          poetry run pytest -m azuretest --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml
+          poetry run python ../../scripts/code_qa/report_to_app_insights.py --activity e2e_tests_azure --junit-xml test-results.xml --git-hub-action-run-id ${{ github.run_id }} --git-hub-workflow ${{ github.workflow }} --git-hub-action ${{ github.action }} --git-branch ${{ github.ref }}
         working-directory: ${{ env.WORKING_DIRECTORY }}
       - name: upload coverage report
         uses: actions/upload-artifact@v4
diff --git a/.github/workflows/promptflow-evals-e2e-test-local.yml b/.github/workflows/promptflow-evals-e2e-test-local.yml
new file mode 100644
index 00000000000..f5cef2aa4d2
--- /dev/null
+++ b/.github/workflows/promptflow-evals-e2e-test-local.yml
@@ -0,0 +1,98 @@
+name: promptflow-evals-e2e-test-local
+
+on:
+  schedule:
+    - cron: "40 10 * * *" # 2:40 PST every day
+  pull_request:
+    paths:
+      - src/promptflow-evals/**
+      - .github/workflows/promptflow-evals-e2e-test-local.yml
+  workflow_dispatch:
+
+env:
+  IS_IN_CI_PIPELINE: "true"
+  WORKING_DIRECTORY: ${{ github.workspace }}/src/promptflow-evals
+
+jobs:
+  test:
+    strategy:
+      matrix:
+        os: [ubuntu-latest, windows-latest, macos-13]
+        # TODO: Encounter hash mismatch for ubuntu-latest and 3.9 combination during installing promptflow-evals package
+        # https://github.com/microsoft/promptflow/actions/runs/9009397933/job/24753518853?pr=3158
+        # Add 3.9 back after we figure out the issue
+        python-version: ['3.8', '3.9', '3.10', '3.11']
+      fail-fast: false
+    # snok/install-poetry need this to support Windows
+    defaults:
+      run:
+        shell: bash
+    runs-on: ${{ matrix.os }}
+    steps:
+      - uses: actions/checkout@v4
+      - name: set test mode
+        # Always run in replay mode for now until we figure out the test resource to run live mode
+        run: echo "PROMPT_FLOW_TEST_MODE=replay" >> $GITHUB_ENV
+        #run: echo "PROMPT_FLOW_TEST_MODE=$(if [[ "${{ github.event_name }}" == "pull_request" ]]; then echo replay; else echo live; fi)" >> $GITHUB_ENV
+      - uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+      - uses: snok/install-poetry@v1
+      - name: install test dependency group
+        run: poetry install --only test
+        working-directory: ${{ env.WORKING_DIRECTORY }}
+      - name: install promptflow packages in editable mode
+        run: |
+          poetry run pip install -e ../promptflow
+          poetry run pip install -e ../promptflow-core
+          poetry run pip install -e ../promptflow-devkit
+          poetry run pip install -e ../promptflow-tracing
+          poetry run pip install -e ../promptflow-tools
+          poetry run pip install -e ../promptflow-evals
+        working-directory: ${{ env.WORKING_DIRECTORY }}
+      - name: generate end-to-end test config from secret
+        run: echo '${{ secrets.PF_EVALS_E2E_TEST_CONFIG }}' >> connections.json
+        working-directory: ${{ env.WORKING_DIRECTORY }}
+      - name: check azure is not installed
+        run: poetry run pytest ../../scripts/code_qa/assert_local_install.py
+        working-directory: ${{ env.WORKING_DIRECTORY }}
+      - name: run e2e tests
+        id: run_e2e_tests_local
+        run: |
+          poetry run pytest -m localtest tests/evals/e2etests --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml
+          poetry run python ../../scripts/code_qa/report_to_app_insights.py --activity e2e_tests_local --junit-xml test-results.xml --git-hub-action-run-id ${{ github.run_id }} --git-hub-workflow ${{ github.workflow }} --git-hub-action ${{ github.action }} --git-branch ${{ github.ref }}
+        working-directory: ${{ env.WORKING_DIRECTORY }}
+      - name: upload coverage report
+        uses: actions/upload-artifact@v4
+        with:
+          name: report-${{ matrix.os }}-py${{ matrix.python-version }}
+          path: |
+            ${{ env.WORKING_DIRECTORY }}/*.xml
+            ${{ env.WORKING_DIRECTORY }}/htmlcov/
+
+  report:
+    needs: test
+    runs-on: ubuntu-latest
+    permissions:
+      checks: write
+      pull-requests: write
+      contents: read
+      issues: read
+    steps:
+      - uses: actions/download-artifact@v4
+        with:
+          path: artifacts
+      - uses: EnricoMi/publish-unit-test-result-action@v2
+        with:
+          check_name: promptflow-evals test result
+          comment_title: promptflow-evals test result
+          files: "artifacts/**/test-results.xml" # align with `--junit-xml` in pyproject.toml
+      - uses: irongut/CodeCoverageSummary@v1.3.0
+        with:
+          filename: "artifacts/report-ubuntu-latest-py3.11/coverage.xml"
+          badge: true
+          fail_below_min: false
+          format: markdown
+          hide_complexity: true
+          output: both
+          thresholds: 40 80
diff --git a/.gitignore b/.gitignore
index 9ef59b176df..5489d5b6745 100644
--- a/.gitignore
+++ b/.gitignore
@@ -197,6 +197,7 @@ src/promptflow-*/promptflow/__init__.py
 # Eclipse project files
 **/.project
 **/.pydevproject
+**/.settings
 
 # benchmark results
 benchmark/promptflow-serve/test_runner/locust-results/
\ No newline at end of file
diff --git a/scripts/code_qa/assert_local_install.py b/scripts/code_qa/assert_local_install.py
new file mode 100644
index 00000000000..3c9f56bd6d5
--- /dev/null
+++ b/scripts/code_qa/assert_local_install.py
@@ -0,0 +1,21 @@
+"""Tests checking that azure packages are NOT installed."""
+import importlib
+import pytest
+
+
+class TestPackagesNotInstalled:
+    """Test imports."""
+
+    @pytest.mark.parametrize('package', [
+        'promptflow.azure',
+        'azure.ai.ml',
+        'azure.identity',
+        'azure.storage.blob'
+    ])
+    def test_promptflow_azure(self, package):
+        """Test that the azure package is not installed."""
+        try:
+            importlib.import_module(package)
+            assert False, f'Package {package} must be uninstalled for local test.'
+        except (ModuleNotFoundError, ImportError):
+            pass
diff --git a/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py b/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py
index 6142747def4..01898087ebd 100644
--- a/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py
+++ b/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py
@@ -21,10 +21,21 @@
 
 LOGGER = logging.getLogger(__name__)
 
+
+# Handle optional import. The azure libraries are only present if
+# promptflow-azure is installed.
 try:
-    from azure.storage.blob import BlobServiceClient
     from azure.ai.ml.entities._credentials import AccountKeyConfiguration
-except ImportError:
+    from azure.ai.ml.entities._datastore.datastore import Datastore
+    from azure.storage.blob import BlobServiceClient
+except (ModuleNotFoundError, ImportError):
+    # If the modules above cannot be imported, we are running in local
+    # mode and the MLClient passed to the constructor will be None, so
+    # we will never reach the Azure-dependent code.
+
+    # We log the import failure only at debug level because:
+    # - If the project configuration was not provided, this import is not needed.
+    # - If the project configuration was provided, the error will be raised by PFClient.
     LOGGER.debug("promptflow.azure is not installed.")
 
 
@@ -413,7 +424,7 @@ def log_artifact(self, artifact_folder: str, artifact_name: str = EVALUATION_ART
         if response.status_code != 200:
             self._log_warning('register artifact', response)
 
-    def _get_datastore_credential(self, datastore: 'Datastore'):
+    def _get_datastore_credential(self, datastore: "Datastore"):
         # Reference the logic in azure.ai.ml._artifact._artifact_utilities
         # https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ml/azure-ai-ml/azure/ai/ml/_artifacts/_artifact_utilities.py#L103
         credential = datastore.credentials
diff --git a/src/promptflow-evals/tests/evals/conftest.py b/src/promptflow-evals/tests/evals/conftest.py
index 4d263152e7b..e184b334628 100644
--- a/src/promptflow-evals/tests/evals/conftest.py
+++ b/src/promptflow-evals/tests/evals/conftest.py
@@ -4,9 +4,8 @@
 from typing import Dict
 from unittest.mock import patch
 
-import jwt
 import pytest
-from azure.ai.ml._ml_client import MLClient
+
 from pytest_mock import MockerFixture
 
 from promptflow.client import PFClient
@@ -20,8 +19,8 @@
     from promptflow.recording.record_mode import is_in_ci_pipeline, is_live, is_record, is_replay
 except ImportError as e:
     print(f"Failed to import promptflow-recording: {e}")
-    # Run test in empty mode if promptflow-recording is not installed
 
+
     def recording_array_reset():
         pass
 
@@ -37,6 +36,13 @@ def is_record():
     def is_replay():
         return False
 
+# Import of optional packages
+AZURE_INSTALLED = True
+try:
+    import jwt
+    from azure.ai.ml._ml_client import MLClient
+except ImportError:
+    AZURE_INSTALLED = False
 
 PROMPTFLOW_ROOT = Path(__file__) / "../../../.."
 CONNECTION_FILE = (PROMPTFLOW_ROOT / "promptflow-evals/connections.json").resolve().absolute().as_posix()
@@ -147,12 +153,15 @@ def mock_validate_trace_destination():
 @pytest.fixture
 def azure_ml_client(project_scope: Dict):
     """The fixture, returning MLClient"""
-    return MLClient(
-        subscription_id=project_scope["subscription_id"],
-        resource_group_name=project_scope["resource_group_name"],
-        workspace_name=project_scope["project_name"],
-        credential=get_cred(),
-    )
+    if AZURE_INSTALLED:
+        return MLClient(
+            subscription_id=project_scope["subscription_id"],
+            resource_group_name=project_scope["resource_group_name"],
+            workspace_name=project_scope["project_name"],
+            credential=get_cred(),
+        )
+    else:
+        return None
 
 
 @pytest.fixture
@@ -293,6 +302,8 @@ def azure_cred():
 
 @pytest.fixture(scope=package_scope_in_live_mode())
 def user_object_id() -> str:
+    if not AZURE_INSTALLED:
+        return ""
     if pytest.is_replay:
         from promptflow.recording.azure import SanitizedValues
 
@@ -305,6 +316,8 @@ def user_object_id() -> str:
 
 @pytest.fixture(scope=package_scope_in_live_mode())
 def tenant_id() -> str:
+    if not AZURE_INSTALLED:
+        return ""
     if pytest.is_replay:
         from promptflow.recording.azure import SanitizedValues
 
@@ -317,9 +330,12 @@ def tenant_id() -> str:
 
 @pytest.fixture(scope=package_scope_in_live_mode())
 def variable_recorder():
-    from promptflow.recording.azure import VariableRecorder
+    if pytest.is_record or pytest.is_replay:
+        from promptflow.recording.azure import VariableRecorder
 
-    yield VariableRecorder()
+        yield VariableRecorder()
+    else:
+        yield None
 
 
 @pytest.fixture(scope=package_scope_in_live_mode())
@@ -346,3 +362,18 @@ def vcr_recording(request: pytest.FixtureRequest, user_object_id: str, tenant_id
         yield recording
     else:
         yield None
+
+
+def pytest_collection_modifyitems(items):
+    parents = {}
+    for item in items:
+        # Check if parent contains 'localtest' marker and remove it.
+        if any(mark.name == 'localtest' for mark in item.parent.own_markers) or id(item.parent) in parents:
+            if id(item.parent) not in parents:
+                item.parent.own_markers = [
+                    marker for marker in item.own_markers if getattr(marker, 'name', None) != 'localtest']
+                parents[id(item.parent)] = item.parent
+            if not item.get_closest_marker('azuretest'):
+                # If item's parent was marked as 'localtest', mark the child as such, but not if
+                # it was marked as 'azuretest'.
+                item.add_marker(pytest.mark.localtest)
diff --git a/src/promptflow-evals/tests/evals/e2etests/test_adv_simulator.py b/src/promptflow-evals/tests/evals/e2etests/test_adv_simulator.py
index 1faef92a46a..16cd0bab1df 100644
--- a/src/promptflow-evals/tests/evals/e2etests/test_adv_simulator.py
+++ b/src/promptflow-evals/tests/evals/e2etests/test_adv_simulator.py
@@ -6,7 +6,7 @@
 
 
 @pytest.mark.usefixtures("recording_injection")
-@pytest.mark.e2etest
+@pytest.mark.azuretest
 class TestAdvSimulator:
     @pytest.mark.usefixtures("vcr_recording")
     def test_adv_sim_init_with_prod_url(self, azure_cred, project_scope):
diff --git a/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py b/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py
index 17bfb5029cf..e1a305ca388 100644
--- a/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py
+++ b/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py
@@ -11,7 +11,7 @@
 
 
 @pytest.mark.usefixtures("recording_injection", "vcr_recording")
-@pytest.mark.e2etest
+@pytest.mark.localtest
 class TestBuiltInEvaluators:
     def test_individual_evaluator_prompt_based(self, model_config):
         eval_fn = FluencyEvaluator(model_config)
@@ -31,6 +31,7 @@ def test_individual_evaluator_prompt_based_with_dict_input(self, model_config):
         assert score is not None
         assert score["gpt_fluency"] > 0.0
 
+    @pytest.mark.azuretest
     def test_individual_evaluator_service_based(self, project_scope, azure_cred):
         eval_fn = ViolenceEvaluator(project_scope, azure_cred)
         score = eval_fn(
@@ -73,6 +74,7 @@ def test_composite_evaluator_qa(self, model_config, parallel):
         assert score["gpt_similarity"] > 0.0
         assert score["f1_score"] > 0.0
 
+    @pytest.mark.azuretest
     def test_composite_evaluator_content_safety(self, project_scope, azure_cred):
         safety_eval = ContentSafetyEvaluator(project_scope, parallel=False, credential=azure_cred)
         score = safety_eval(
@@ -156,6 +158,7 @@ def test_composite_evaluator_chat(self, model_config, eval_last_turn, parallel):
         assert score["evaluation_per_turn"]["gpt_retrieval"] is not None
         assert len(score["evaluation_per_turn"]["gpt_retrieval"]["score"]) == turn_count
 
+    @pytest.mark.azuretest
     @pytest.mark.parametrize(
         "eval_last_turn, parallel",
         [
diff --git a/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py b/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py
index 16b06daf85f..f57c05e35ce 100644
--- a/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py
+++ b/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py
@@ -6,7 +6,6 @@
 import pandas as pd
 import pytest
 import requests
-from azure.identity import DefaultAzureCredential
 
 from promptflow.evals.evaluate import evaluate
 from promptflow.evals.evaluators import ContentSafetyEvaluator, F1ScoreEvaluator, GroundednessEvaluator
@@ -46,6 +45,7 @@ def question_evaluator(question):
 
 def _get_run_from_run_history(flow_run_id, ml_client, project_scope):
     """Get run info from run history"""
+    from azure.identity import DefaultAzureCredential
     token = "Bearer " + DefaultAzureCredential().get_token("https://management.azure.com/.default").token
     headers = {
         "Authorization": token,
@@ -80,7 +80,7 @@ def _get_run_from_run_history(flow_run_id, ml_client, project_scope):
 
 
 @pytest.mark.usefixtures("recording_injection")
-@pytest.mark.e2etest
+@pytest.mark.localtest
 class TestEvaluate:
     def test_evaluate_with_groundedness_evaluator(self, model_config, data_file):
         # data
@@ -118,6 +118,7 @@ def test_evaluate_with_groundedness_evaluator(self, model_config, data_file):
         assert row_result_df["outputs.f1_score.f1_score"][2] == 1
         assert result["studio_url"] is None
 
+    @pytest.mark.azuretest
     @pytest.mark.skip(reason="Failed in CI pipeline. Pending for investigation.")
     def test_evaluate_with_content_safety_evaluator(self, project_scope, data_file, azure_cred):
         input_data = pd.read_json(data_file, lines=True)
@@ -301,6 +302,7 @@ def test_evaluate_with_evaluator_config(self, questions_file, evaluate_config):
         assert "answer.length" in metrics.keys()
         assert "f1_score.f1_score" in metrics.keys()
 
+    @pytest.mark.azuretest
     def test_evaluate_track_in_cloud(
         self,
         questions_file,
@@ -344,6 +346,7 @@ def test_evaluate_track_in_cloud(
         assert remote_run["runMetadata"]["properties"]["runType"] == "eval_run"
         assert remote_run["runMetadata"]["displayName"] == evaluation_name
 
+    @pytest.mark.azuretest
     def test_evaluate_track_in_cloud_no_target(
         self,
         data_file,
diff --git a/src/promptflow-evals/tests/evals/e2etests/test_metrics_upload.py b/src/promptflow-evals/tests/evals/e2etests/test_metrics_upload.py
index b7dcaf07326..c027ac98a33 100644
--- a/src/promptflow-evals/tests/evals/e2etests/test_metrics_upload.py
+++ b/src/promptflow-evals/tests/evals/e2etests/test_metrics_upload.py
@@ -10,8 +10,12 @@
 from promptflow.evals.evaluate._eval_run import EvalRun
 from promptflow.evals.evaluate._evaluate import evaluate
 from promptflow.evals.evaluators._f1_score._f1_score import F1ScoreEvaluator
-from promptflow.recording.record_mode import is_live
 from promptflow.tracing import _start_trace
+try:
+    from promptflow.recording.record_mode import is_live
+except ModuleNotFoundError:
+    # The file is being imported by the local test
+    pass
 
 
 @pytest.fixture
@@ -38,7 +42,7 @@ def tracking_uri(azure_ml_client, project_scope):
 
 
 @pytest.mark.usefixtures("model_config", "recording_injection", "project_scope")
-@pytest.mark.e2etest
+@pytest.mark.azuretest
 class TestMetricsUpload(object):
     """End to end tests to check how the metrics were uploaded to cloud."""
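
A note for reviewers who want to exercise the gate outside CI: the local workflow runs `scripts/code_qa/assert_local_install.py` before `pytest -m localtest` to prove the Azure-only packages are absent. The snippet below is a minimal sketch of the same kind of check, written with `importlib.util.find_spec` instead of catching the error from an actual import. It is illustrative only and not part of this PR; the file name, helper name, and test name are made up, while the package list mirrors the one in the gate script.

```python
"""Sketch only: assert Azure-only dependencies are absent before `pytest -m localtest`."""
import importlib.util

import pytest

# Same packages the PR's gate script checks.
AZURE_ONLY_PACKAGES = [
    "promptflow.azure",
    "azure.ai.ml",
    "azure.identity",
    "azure.storage.blob",
]


def _is_importable(name: str) -> bool:
    """Return True if `name` resolves to an installed module."""
    try:
        return importlib.util.find_spec(name) is not None
    except ModuleNotFoundError:
        # A missing parent package (e.g. `azure`) also means the package is absent.
        return False


@pytest.mark.parametrize("package", AZURE_ONLY_PACKAGES)
def test_azure_package_absent(package):
    assert not _is_importable(package), (
        f"Package {package} must be uninstalled for the local e2e tests."
    )
```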
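The new `pytest_collection_modifyitems` hook in `conftest.py` exists to push a class-level `localtest` marker down to the individual tests while leaving tests explicitly marked `azuretest` untouched, so that `-m localtest` and `-m azuretest` select disjoint sets. A simplified sketch of that intent (an assumption about equivalent behavior, not the exact code added in this PR) could look like this:

```python
import pytest


def pytest_collection_modifyitems(items):
    """Push a class-level 'localtest' marker down to individual tests."""
    local_parents = set()
    for item in items:
        parent = item.parent
        is_local_parent = (
            id(parent) in local_parents
            or any(m.name == "localtest" for m in getattr(parent, "own_markers", []))
        )
        if not is_local_parent:
            continue
        if id(parent) not in local_parents:
            # Drop the class-level marker so 'azuretest' children are not
            # selected by '-m localtest' through marker inheritance.
            parent.own_markers = [m for m in parent.own_markers if m.name != "localtest"]
            local_parents.add(id(parent))
        if not item.get_closest_marker("azuretest"):
            item.add_marker(pytest.mark.localtest)
```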