From 52398d3c68a9b7b1e3f57a102f33fd39dcbfa6c0 Mon Sep 17 00:00:00 2001
From: nick863 <30440255+nick863@users.noreply.github.com>
Date: Sun, 14 Jul 2024 19:30:23 -0700
Subject: [PATCH] Add local e2e test gate (#3534)

# Description

Mark the e2e tests that can be run without Azure infrastructure as local, and
add a gate which ensures that the Azure packages are not installed when they
run. See work item 3336565.

**Note:** Checked that we have 54 e2e tests: 20 are azure and 34 are local
(the total matches the count before the split).

# All Promptflow Contribution checklist:
- [x] **The pull request does not introduce [breaking changes].**
- [x] **CHANGELOG is updated for new features, bug fixes or other significant changes.**
- [x] **I have read the [contribution guidelines](../CONTRIBUTING.md).**
- [x] **Create an issue and link to the pull request to get dedicated review from promptflow team. Learn more: [suggested workflow](../CONTRIBUTING.md#suggested-workflow).**

## General Guidelines and Best Practices
- [x] Title of the pull request is clear and informative.
- [x] There are a small number of commits, each of which has an informative message. This means that previously merged commits do not appear in the history of the PR. For more information on cleaning up the commits in your PR, [see this page](https://github.com/Azure/azure-powershell/blob/master/documentation/development-docs/cleaning-up-commits.md).

### Testing Guidelines
- [x] Pull request includes test coverage for the included changes.
---
 ...ml => promptflow-evals-e2e-test-azure.yml} | 10 +-
 .../promptflow-evals-e2e-test-local.yml       | 98 +++++++++++++++++++
 .gitignore                                    |  1 +
 scripts/code_qa/assert_local_install.py       | 21 ++++
 .../promptflow/evals/evaluate/_eval_run.py    | 17 +++-
 src/promptflow-evals/tests/evals/conftest.py  | 53 +++++++---
 .../evals/e2etests/test_adv_simulator.py      |  2 +-
 .../evals/e2etests/test_builtin_evaluators.py |  5 +-
 .../tests/evals/e2etests/test_evaluate.py     |  7 +-
 .../evals/e2etests/test_metrics_upload.py     |  8 +-
 10 files changed, 197 insertions(+), 25 deletions(-)
 rename .github/workflows/{promptflow-evals-e2e-test.yml => promptflow-evals-e2e-test-azure.yml} (90%)
 create mode 100644 .github/workflows/promptflow-evals-e2e-test-local.yml
 create mode 100644 scripts/code_qa/assert_local_install.py

diff --git a/.github/workflows/promptflow-evals-e2e-test.yml b/.github/workflows/promptflow-evals-e2e-test-azure.yml
similarity index 90%
rename from .github/workflows/promptflow-evals-e2e-test.yml
rename to .github/workflows/promptflow-evals-e2e-test-azure.yml
index 606d827723f..75cb2c5d422 100644
--- a/.github/workflows/promptflow-evals-e2e-test.yml
+++ b/.github/workflows/promptflow-evals-e2e-test-azure.yml
@@ -1,4 +1,4 @@
-name: promptflow-evals-e2e-test
+name: promptflow-evals-e2e-test-azure
 
 on:
   schedule:
@@ -6,7 +6,7 @@ on:
   pull_request:
     paths:
       - src/promptflow-evals/**
-      - .github/workflows/promptflow-evals-e2e-test.yml
+      - .github/workflows/promptflow-evals-e2e-test-azure.yml
   workflow_dispatch:
 
 env:
@@ -83,10 +83,10 @@ jobs:
           creds: ${{ secrets.PF_EVALS_SP_CREDENTIALS }}
           enable-AzPSSession: true
       - name: run e2e tests
-        id: run_all_e2e_tests
+        id: run_e2e_tests_azure
         run: |
-          poetry run pytest -m e2etest --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml
-          poetry run python ../../scripts/code_qa/report_to_app_insights.py --activity all_e2e_tests_run_times --junit-xml test-results.xml --git-hub-action-run-id ${{ github.run_id }} --git-hub-workflow ${{ github.workflow }} --git-hub-action ${{ github.action }} --git-branch ${{ github.ref }}
+          poetry run pytest -m azuretest --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml
+          poetry run python ../../scripts/code_qa/report_to_app_insights.py --activity e2e_tests_azure --junit-xml test-results.xml --git-hub-action-run-id ${{ github.run_id }} --git-hub-workflow ${{ github.workflow }} --git-hub-action ${{ github.action }} --git-branch ${{ github.ref }}
         working-directory: ${{ env.WORKING_DIRECTORY }}
       - name: upload coverage report
         uses: actions/upload-artifact@v4
diff --git a/.github/workflows/promptflow-evals-e2e-test-local.yml b/.github/workflows/promptflow-evals-e2e-test-local.yml
new file mode 100644
index 00000000000..f5cef2aa4d2
--- /dev/null
+++ b/.github/workflows/promptflow-evals-e2e-test-local.yml
@@ -0,0 +1,98 @@
+name: promptflow-evals-e2e-test-local
+
+on:
+  schedule:
+    - cron: "40 10 * * *" # 2:40 PST every day
+  pull_request:
+    paths:
+      - src/promptflow-evals/**
+      - .github/workflows/promptflow-evals-e2e-test-local.yml
+  workflow_dispatch:
+
+env:
+  IS_IN_CI_PIPELINE: "true"
+  WORKING_DIRECTORY: ${{ github.workspace }}/src/promptflow-evals
+
+jobs:
+  test:
+    strategy:
+      matrix:
+        os: [ubuntu-latest, windows-latest, macos-13]
+        # TODO: Encounter hash mismatch for ubuntu-latest and 3.9 combination during installing promptflow-evals package
+        # https://github.com/microsoft/promptflow/actions/runs/9009397933/job/24753518853?pr=3158
+        # Add 3.9 back after we figure out the issue
+        python-version: ['3.8', '3.9', '3.10', '3.11']
+      fail-fast: false
+    # snok/install-poetry need this to support Windows
+    defaults:
+      run:
+        shell: bash
+    runs-on: ${{ matrix.os }}
+    steps:
+      - uses: actions/checkout@v4
+      - name: set test mode
+        # Always run in replay mode for now until we figure out the test resource to run live mode
+        run: echo "PROMPT_FLOW_TEST_MODE=replay" >> $GITHUB_ENV
+        #run: echo "PROMPT_FLOW_TEST_MODE=$(if [[ "${{ github.event_name }}" == "pull_request" ]]; then echo replay; else echo live; fi)" >> $GITHUB_ENV
+      - uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+      - uses: snok/install-poetry@v1
+      - name: install test dependency group
+        run: poetry install --only test
+        working-directory: ${{ env.WORKING_DIRECTORY }}
+      - name: install promptflow packages in editable mode
+        run: |
+          poetry run pip install -e ../promptflow
+          poetry run pip install -e ../promptflow-core
+          poetry run pip install -e ../promptflow-devkit
+          poetry run pip install -e ../promptflow-tracing
+          poetry run pip install -e ../promptflow-tools
+          poetry run pip install -e ../promptflow-evals
+        working-directory: ${{ env.WORKING_DIRECTORY }}
+      - name: generate end-to-end test config from secret
+        run: echo '${{ secrets.PF_EVALS_E2E_TEST_CONFIG }}' >> connections.json
+        working-directory: ${{ env.WORKING_DIRECTORY }}
+      - name: check azure is not installed
+        run: poetry run pytest ../../scripts/code_qa/assert_local_install.py
+        working-directory: ${{ env.WORKING_DIRECTORY }}
+      - name: run e2e tests
+        id: run_e2e_tests_local
+        run: |
+          poetry run pytest -m localtest tests/evals/e2etests --cov=promptflow --cov-config=pyproject.toml --cov-report=term --cov-report=html --cov-report=xml
+          poetry run python ../../scripts/code_qa/report_to_app_insights.py --activity e2e_tests_local --junit-xml test-results.xml --git-hub-action-run-id ${{ github.run_id }} --git-hub-workflow ${{ github.workflow }} --git-hub-action ${{ github.action }} --git-branch ${{ github.ref }}
+        working-directory: ${{ env.WORKING_DIRECTORY }}
+      - name: upload coverage report
+        uses: actions/upload-artifact@v4
+        with:
+          name: report-${{ matrix.os }}-py${{ matrix.python-version }}
+          path: |
+            ${{ env.WORKING_DIRECTORY }}/*.xml
+            ${{ env.WORKING_DIRECTORY }}/htmlcov/
+
+  report:
+    needs: test
+    runs-on: ubuntu-latest
+    permissions:
+      checks: write
+      pull-requests: write
+      contents: read
+      issues: read
+    steps:
+      - uses: actions/download-artifact@v4
+        with:
+          path: artifacts
+      - uses: EnricoMi/publish-unit-test-result-action@v2
+        with:
+          check_name: promptflow-evals test result
+          comment_title: promptflow-evals test result
+          files: "artifacts/**/test-results.xml" # align with `--junit-xml` in pyproject.toml
+      - uses: irongut/CodeCoverageSummary@v1.3.0
+        with:
+          filename: "artifacts/report-ubuntu-latest-py3.11/coverage.xml"
+          badge: true
+          fail_below_min: false
+          format: markdown
+          hide_complexity: true
+          output: both
+          thresholds: 40 80
diff --git a/.gitignore b/.gitignore
index 9ef59b176df..5489d5b6745 100644
--- a/.gitignore
+++ b/.gitignore
@@ -197,6 +197,7 @@ src/promptflow-*/promptflow/__init__.py
 # Eclipse project files
 **/.project
 **/.pydevproject
+**/.settings
 
 # benchmark results
 benchmark/promptflow-serve/test_runner/locust-results/
\ No newline at end of file
diff --git a/scripts/code_qa/assert_local_install.py b/scripts/code_qa/assert_local_install.py
new file mode 100644
index 00000000000..3c9f56bd6d5
--- /dev/null
+++ b/scripts/code_qa/assert_local_install.py
@@ -0,0 +1,21 @@
+"""Tests checking that azure packages are NOT installed."""
+import importlib
+import pytest
+
+
+class TestPackagesNotInstalled:
+    """Test imports."""
+
+    @pytest.mark.parametrize('package', [
+        'promptflow.azure',
+        'azure.ai.ml',
+        'azure.identity',
+        'azure.storage.blob'
+    ])
+    def test_promptflow_azure(self, package):
+        """Test that the azure package is not installed."""
+        try:
+            importlib.import_module(package)
+            assert False, f'Package {package} must be uninstalled for local test.'
+        except (ModuleNotFoundError, ImportError):
+            pass
diff --git a/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py b/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py
index 6142747def4..01898087ebd 100644
--- a/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py
+++ b/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py
@@ -21,10 +21,21 @@
 
 LOGGER = logging.getLogger(__name__)
 
+
+# Handle optional import. The azure libraries are only present if
+# promptflow-azure is installed.
 try:
-    from azure.storage.blob import BlobServiceClient
     from azure.ai.ml.entities._credentials import AccountKeyConfiguration
-except ImportError:
+    from azure.ai.ml.entities._datastore.datastore import Datastore
+    from azure.storage.blob import BlobServiceClient
+except (ModuleNotFoundError, ImportError):
+    # If the modules above cannot be imported, we are running in local
+    # mode and the MLClient passed to the constructor will be None, so
+    # we will never reach the Azure-dependent code.
+
+    # We log the import failure only at debug level because:
+    # - If the project configuration was not provided, this import is not needed.
+    # - If the project configuration was provided, the error will be raised by PFClient.
     LOGGER.debug("promptflow.azure is not installed.")
 
 
@@ -413,7 +424,7 @@ def log_artifact(self, artifact_folder: str, artifact_name: str = EVALUATION_ART
         if response.status_code != 200:
             self._log_warning('register artifact', response)
 
-    def _get_datastore_credential(self, datastore: 'Datastore'):
+    def _get_datastore_credential(self, datastore: "Datastore"):
         # Reference the logic in azure.ai.ml._artifact._artifact_utilities
         # https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ml/azure-ai-ml/azure/ai/ml/_artifacts/_artifact_utilities.py#L103
         credential = datastore.credentials
diff --git a/src/promptflow-evals/tests/evals/conftest.py b/src/promptflow-evals/tests/evals/conftest.py
index 4d263152e7b..e184b334628 100644
--- a/src/promptflow-evals/tests/evals/conftest.py
+++ b/src/promptflow-evals/tests/evals/conftest.py
@@ -4,9 +4,8 @@
 from typing import Dict
 from unittest.mock import patch
 
-import jwt
 import pytest
-from azure.ai.ml._ml_client import MLClient
+
 from pytest_mock import MockerFixture
 
 from promptflow.client import PFClient
@@ -20,8 +19,8 @@
     from promptflow.recording.record_mode import is_in_ci_pipeline, is_live, is_record, is_replay
 except ImportError as e:
     print(f"Failed to import promptflow-recording: {e}")
-    # Run test in empty mode if promptflow-recording is not installed
 
+
     def recording_array_reset():
         pass
 
@@ -37,6 +36,13 @@ def is_record():
     def is_replay():
         return False
 
+# Import of optional packages
+AZURE_INSTALLED = True
+try:
+    import jwt
+    from azure.ai.ml._ml_client import MLClient
+except ImportError:
+    AZURE_INSTALLED = False
 
 PROMPTFLOW_ROOT = Path(__file__) / "../../../.."
 CONNECTION_FILE = (PROMPTFLOW_ROOT / "promptflow-evals/connections.json").resolve().absolute().as_posix()
@@ -147,12 +153,15 @@ def mock_validate_trace_destination():
 @pytest.fixture
 def azure_ml_client(project_scope: Dict):
     """The fixture, returning MLClient"""
-    return MLClient(
-        subscription_id=project_scope["subscription_id"],
-        resource_group_name=project_scope["resource_group_name"],
-        workspace_name=project_scope["project_name"],
-        credential=get_cred(),
-    )
+    if AZURE_INSTALLED:
+        return MLClient(
+            subscription_id=project_scope["subscription_id"],
+            resource_group_name=project_scope["resource_group_name"],
+            workspace_name=project_scope["project_name"],
+            credential=get_cred(),
+        )
+    else:
+        return None
 
 
 @pytest.fixture
@@ -293,6 +302,8 @@ def azure_cred():
 
 @pytest.fixture(scope=package_scope_in_live_mode())
 def user_object_id() -> str:
+    if not AZURE_INSTALLED:
+        return ""
     if pytest.is_replay:
         from promptflow.recording.azure import SanitizedValues
 
@@ -305,6 +316,8 @@ def user_object_id() -> str:
 
 @pytest.fixture(scope=package_scope_in_live_mode())
 def tenant_id() -> str:
+    if not AZURE_INSTALLED:
+        return ""
     if pytest.is_replay:
         from promptflow.recording.azure import SanitizedValues
 
@@ -317,9 +330,12 @@ def tenant_id() -> str:
 
 @pytest.fixture(scope=package_scope_in_live_mode())
 def variable_recorder():
-    from promptflow.recording.azure import VariableRecorder
+    if pytest.is_record or pytest.is_replay:
+        from promptflow.recording.azure import VariableRecorder
 
-    yield VariableRecorder()
+        yield VariableRecorder()
+    else:
+        yield None
 
 
 @pytest.fixture(scope=package_scope_in_live_mode())
@@ -346,3 +362,18 @@ def vcr_recording(request: pytest.FixtureRequest, user_object_id: str, tenant_id
         yield recording
     else:
         yield None
+
+
+def pytest_collection_modifyitems(items):
+    parents = {}
+    for item in items:
+        # Check if parent contains 'localtest' marker and remove it.
+        if any(mark.name == 'localtest' for mark in item.parent.own_markers) or id(item.parent) in parents:
+            if id(item.parent) not in parents:
+                item.parent.own_markers = [
+                    marker for marker in item.own_markers if getattr(marker, 'name', None) != 'localtest']
+                parents[id(item.parent)] = item.parent
+            if not item.get_closest_marker('azuretest'):
+                # If item's parent was marked as 'localtest', mark the child as such, but not if
+                # it was marked as 'azuretest'.
+                item.add_marker(pytest.mark.localtest)
diff --git a/src/promptflow-evals/tests/evals/e2etests/test_adv_simulator.py b/src/promptflow-evals/tests/evals/e2etests/test_adv_simulator.py
index 1faef92a46a..16cd0bab1df 100644
--- a/src/promptflow-evals/tests/evals/e2etests/test_adv_simulator.py
+++ b/src/promptflow-evals/tests/evals/e2etests/test_adv_simulator.py
@@ -6,7 +6,7 @@
 
 
 @pytest.mark.usefixtures("recording_injection")
-@pytest.mark.e2etest
+@pytest.mark.azuretest
 class TestAdvSimulator:
     @pytest.mark.usefixtures("vcr_recording")
     def test_adv_sim_init_with_prod_url(self, azure_cred, project_scope):
diff --git a/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py b/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py
index 17bfb5029cf..e1a305ca388 100644
--- a/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py
+++ b/src/promptflow-evals/tests/evals/e2etests/test_builtin_evaluators.py
@@ -11,7 +11,7 @@
 
 
 @pytest.mark.usefixtures("recording_injection", "vcr_recording")
-@pytest.mark.e2etest
+@pytest.mark.localtest
 class TestBuiltInEvaluators:
     def test_individual_evaluator_prompt_based(self, model_config):
         eval_fn = FluencyEvaluator(model_config)
@@ -31,6 +31,7 @@ def test_individual_evaluator_prompt_based_with_dict_input(self, model_config):
         assert score is not None
         assert score["gpt_fluency"] > 0.0
 
+    @pytest.mark.azuretest
     def test_individual_evaluator_service_based(self, project_scope, azure_cred):
         eval_fn = ViolenceEvaluator(project_scope, azure_cred)
         score = eval_fn(
@@ -73,6 +74,7 @@ def test_composite_evaluator_qa(self, model_config, parallel):
         assert score["gpt_similarity"] > 0.0
         assert score["f1_score"] > 0.0
 
+    @pytest.mark.azuretest
     def test_composite_evaluator_content_safety(self, project_scope, azure_cred):
         safety_eval = ContentSafetyEvaluator(project_scope, parallel=False, credential=azure_cred)
         score = safety_eval(
@@ -156,6 +158,7 @@ def test_composite_evaluator_chat(self, model_config, eval_last_turn, parallel):
         assert score["evaluation_per_turn"]["gpt_retrieval"] is not None
         assert len(score["evaluation_per_turn"]["gpt_retrieval"]["score"]) == turn_count
 
+    @pytest.mark.azuretest
     @pytest.mark.parametrize(
         "eval_last_turn, parallel",
         [
diff --git a/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py b/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py
index 16b06daf85f..f57c05e35ce 100644
--- a/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py
+++ b/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py
@@ -6,7 +6,6 @@
 import pandas as pd
 import pytest
 import requests
-from azure.identity import DefaultAzureCredential
 
 from promptflow.evals.evaluate import evaluate
 from promptflow.evals.evaluators import ContentSafetyEvaluator, F1ScoreEvaluator, GroundednessEvaluator
@@ -46,6 +45,7 @@ def question_evaluator(question):
 
 def _get_run_from_run_history(flow_run_id, ml_client, project_scope):
     """Get run info from run history"""
+    from azure.identity import DefaultAzureCredential
     token = "Bearer " + DefaultAzureCredential().get_token("https://management.azure.com/.default").token
     headers = {
         "Authorization": token,
@@ -80,7 +80,7 @@ def _get_run_from_run_history(flow_run_id, ml_client, project_scope):
 
 
 @pytest.mark.usefixtures("recording_injection")
-@pytest.mark.e2etest
+@pytest.mark.localtest
 class TestEvaluate:
     def test_evaluate_with_groundedness_evaluator(self, model_config, data_file):
         # data
@@ -118,6 +118,7 @@ def test_evaluate_with_groundedness_evaluator(self, model_config, data_file):
         assert row_result_df["outputs.f1_score.f1_score"][2] == 1
         assert result["studio_url"] is None
 
+    @pytest.mark.azuretest
     @pytest.mark.skip(reason="Failed in CI pipeline. Pending for investigation.")
     def test_evaluate_with_content_safety_evaluator(self, project_scope, data_file, azure_cred):
         input_data = pd.read_json(data_file, lines=True)
@@ -301,6 +302,7 @@ def test_evaluate_with_evaluator_config(self, questions_file, evaluate_config):
         assert "answer.length" in metrics.keys()
         assert "f1_score.f1_score" in metrics.keys()
 
+    @pytest.mark.azuretest
     def test_evaluate_track_in_cloud(
         self,
         questions_file,
@@ -344,6 +346,7 @@ def test_evaluate_track_in_cloud(
         assert remote_run["runMetadata"]["properties"]["runType"] == "eval_run"
         assert remote_run["runMetadata"]["displayName"] == evaluation_name
 
+    @pytest.mark.azuretest
     def test_evaluate_track_in_cloud_no_target(
         self,
         data_file,
diff --git a/src/promptflow-evals/tests/evals/e2etests/test_metrics_upload.py b/src/promptflow-evals/tests/evals/e2etests/test_metrics_upload.py
index b7dcaf07326..c027ac98a33 100644
--- a/src/promptflow-evals/tests/evals/e2etests/test_metrics_upload.py
+++ b/src/promptflow-evals/tests/evals/e2etests/test_metrics_upload.py
@@ -10,8 +10,12 @@
 from promptflow.evals.evaluate._eval_run import EvalRun
 from promptflow.evals.evaluate._evaluate import evaluate
 from promptflow.evals.evaluators._f1_score._f1_score import F1ScoreEvaluator
-from promptflow.recording.record_mode import is_live
 from promptflow.tracing import _start_trace
+try:
+    from promptflow.recording.record_mode import is_live
+except ModuleNotFoundError:
+    # The file is being imported by the local test
+    pass
 
 
 @pytest.fixture
@@ -38,7 +42,7 @@ def tracking_uri(azure_ml_client, project_scope):
 
 
 @pytest.mark.usefixtures("model_config", "recording_injection", "project_scope")
-@pytest.mark.e2etest
+@pytest.mark.azuretest
 class TestMetricsUpload(object):
     """End to end tests to check how the metrics were uploaded to cloud."""
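
A note for reviewers who want to exercise the gate outside CI: the local workflow runs `scripts/code_qa/assert_local_install.py` before `pytest -m localtest` to prove the Azure-only packages are absent. The snippet below is a minimal sketch of the same kind of check, written with `importlib.util.find_spec` instead of catching the error from an actual import. It is illustrative only and not part of this PR; the file name, helper name, and test name are made up, while the package list mirrors the one in the gate script.

```python
"""Sketch only: assert Azure-only dependencies are absent before `pytest -m localtest`."""
import importlib.util

import pytest

# Same packages the PR's gate script checks.
AZURE_ONLY_PACKAGES = [
    "promptflow.azure",
    "azure.ai.ml",
    "azure.identity",
    "azure.storage.blob",
]


def _is_importable(name: str) -> bool:
    """Return True if `name` resolves to an installed module."""
    try:
        return importlib.util.find_spec(name) is not None
    except ModuleNotFoundError:
        # A missing parent package (e.g. `azure`) also means the package is absent.
        return False


@pytest.mark.parametrize("package", AZURE_ONLY_PACKAGES)
def test_azure_package_absent(package):
    assert not _is_importable(package), (
        f"Package {package} must be uninstalled for the local e2e tests."
    )
```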
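The new `pytest_collection_modifyitems` hook in `conftest.py` exists to push a class-level `localtest` marker down to the individual tests while leaving tests explicitly marked `azuretest` untouched, so that `-m localtest` and `-m azuretest` select disjoint sets. A simplified sketch of that intent (an assumption about equivalent behavior, not the exact code added in this PR) could look like this:

```python
import pytest


def pytest_collection_modifyitems(items):
    """Push a class-level 'localtest' marker down to individual tests."""
    local_parents = set()
    for item in items:
        parent = item.parent
        is_local_parent = (
            id(parent) in local_parents
            or any(m.name == "localtest" for m in getattr(parent, "own_markers", []))
        )
        if not is_local_parent:
            continue
        if id(parent) not in local_parents:
            # Drop the class-level marker so 'azuretest' children are not
            # selected by '-m localtest' through marker inheritance.
            parent.own_markers = [m for m in parent.own_markers if m.name != "localtest"]
            local_parents.add(id(parent))
        if not item.get_closest_marker("azuretest"):
            item.add_marker(pytest.mark.localtest)
```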