From fda05abd75a7596fe14953a54cf2ca0b91ba91a3 Mon Sep 17 00:00:00 2001
From: Diondra <16376603+diondrapeck@users.noreply.github.com>
Date: Tue, 23 Jul 2024 20:13:15 -0700
Subject: [PATCH] Add pylint and docstring checks for promptflow-evals PRs (#3547)

# Description

This PR
- Fixes all existing pylint errors in promptflow-evals
- Adds docstrings to all public promptflow-evals methods and classes
- Adds a pylint gate to run on all PRs that make changes to the promptflow-evals module

# All Promptflow Contribution checklist:
- [x] **The pull request does not introduce [breaking changes].**
- [x] **CHANGELOG is updated for new features, bug fixes or other significant changes.**
- [x] **I have read the [contribution guidelines](../CONTRIBUTING.md).**
- [x] **I confirm that all new dependencies are compatible with the MIT license.**
- [ ] **Create an issue and link to the pull request to get dedicated review from the promptflow team. Learn more: [suggested workflow](../CONTRIBUTING.md#suggested-workflow).**

## General Guidelines and Best Practices
- [x] Title of the pull request is clear and informative.
- [x] There are a small number of commits, each of which has an informative message. This means that previously merged commits do not appear in the history of the PR. For more information on cleaning up the commits in your PR, [see this page](https://github.com/Azure/azure-powershell/blob/master/documentation/development-docs/cleaning-up-commits.md).

### Testing Guidelines
- [ ] Pull request includes test coverage for the included changes.
---
 .cspell.json | 4 +-
 .github/workflows/pylint.yml | 32 +++
 .pre-commit-config.yaml | 53 +++++
 pylintrc | 57 ++++++
 .../promptflow/evals/_constants.py | 13 +-
 .../promptflow/evals/_version.py | 4 +-
 .../_batch_run_client/batch_run_context.py | 10 +-
 .../evaluate/_batch_run_client/code_client.py | 16 +-
 .../promptflow/evals/evaluate/_eval_run.py | 186 +++++++++---------
 .../promptflow/evals/evaluate/_evaluate.py | 101 ++++++----
 .../evals/evaluate/_telemetry/__init__.py | 111 +++++++----
 .../promptflow/evals/evaluate/_utils.py | 4 +-
 .../evals/evaluators/_chat/_chat.py | 9 +-
 .../evaluators/_chat/retrieval/_retrieval.py | 10 +-
 .../evals/evaluators/_coherence/_coherence.py | 15 +-
 .../_content_safety/_content_safety.py | 12 +-
 .../_content_safety/_content_safety_chat.py | 11 +-
 .../_content_safety_sub_evaluator_base.py | 17 +-
 .../_content_safety/_hate_unfairness.py | 20 +-
 .../evaluators/_content_safety/_self_harm.py | 18 +-
 .../evaluators/_content_safety/_sexual.py | 17 +-
 .../evaluators/_content_safety/_violence.py | 17 +-
 .../_content_safety/common/constants.py | 4 +
 .../common/evaluate_with_rai_service.py | 136 +++++++++++--
 .../_content_safety/common/utils.py | 9 +-
 .../_content_safety/common/validate_inputs.py | 12 +-
 .../evals/evaluators/_f1_score/_f1_score.py | 29 ++-
 .../evals/evaluators/_fluency/_fluency.py | 15 +-
 .../evaluators/_groundedness/_groundedness.py | 17 +-
 .../promptflow/evals/evaluators/_qa/_qa.py | 20 +-
 .../evals/evaluators/_relevance/_relevance.py | 19 +-
 .../evaluators/_similarity/_similarity.py | 19 +-
 .../evals/synthetic/_conversation/__init__.py | 103 ++++++++--
 .../synthetic/_conversation/_conversation.py | 26 ++-
 .../synthetic/_conversation/constants.py | 1 +
 .../evals/synthetic/_model_tools/__init__.py | 1 +
 .../_model_tools/_async_http_client.py | 55 +++++-
 .../_model_tools/_identity_manager.py | 82 +++++++-
 .../_model_tools/_proxy_completion_model.py | 78 ++++++--
 .../synthetic/_model_tools/_rai_client.py | 35 +++-
.../_model_tools/_template_handler.py | 57 +++++- .../evals/synthetic/adversarial_scenario.py | 5 + .../evals/synthetic/adversarial_simulator.py | 96 +++++---- .../jailbreak_adversarial_simulator.py | 42 ++-- src/promptflow-evals/pyproject.toml | 29 +++ .../apology-prompty/apology.prompty | 10 +- .../LoadSaveEvals/apology-prompty/sample.json | 2 +- .../LoadSaveEvals/apology_dag/apology.py | 3 +- .../LoadSaveEvals/apology_dag/flow.dag.yaml | 1 - .../evaluation_dataset_context.jsonl | 2 +- .../evaluate-target/askwiki/askwiki.py | 69 ++++--- .../evaluators/blocklist/blocklist.py | 1 - src/promptflow-evals/samples/evaluate_chat.py | 4 +- .../answer_length_with_aggregation.py | 1 - .../tests/evals/e2etests/target_fn.py | 2 +- .../evals/e2etests/test_metrics_upload.py | 11 +- .../data/validate_defect_rate_test_data.jsonl | 2 +- .../evals/unittests/data/yeti_questions.jsonl | 2 +- .../test_content_safety_defect_rate.py | 5 +- .../tests/evals/unittests/test_eval_run.py | 21 +- .../tests/evals/unittests/test_evaluate.py | 6 +- .../unittests/test_evaluate_telemetry.py | 58 +++--- .../test_evaluators/apology_dag/apology.py | 3 +- .../test_evaluators/apology_dag/flow.dag.yaml | 1 - .../apology_prompty/apology.prompty | 10 +- .../apology_prompty/sample.json | 2 +- 66 files changed, 1265 insertions(+), 578 deletions(-) create mode 100644 .github/workflows/pylint.yml create mode 100644 pylintrc diff --git a/.cspell.json b/.cspell.json index 812bbfc507d..9890a83f774 100644 --- a/.cspell.json +++ b/.cspell.json @@ -243,7 +243,9 @@ "openaimodelconfiguration", "usecwd", "locustio", - "euap" + "euap", + "rcfile", + "pylintrc" ], "flagWords": [ "Prompt Flow" diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml new file mode 100644 index 00000000000..ff278cea998 --- /dev/null +++ b/.github/workflows/pylint.yml @@ -0,0 +1,32 @@ +name: Pylint + +on: + pull_request: + paths: + - src/promptflow-evals/** + +jobs: + run_pylint: + runs-on: ubuntu-latest + + steps: + - name: checkout code + uses: actions/checkout@v2 + + - name: setup python + uses: actions/setup-python@v2 + with: + python-version: 3.9 + + - uses: snok/install-poetry@v1 + - name: install pylint and azure-pylint-guidelines-checker + working-directory: ${{ env.WORKING_DIRECTORY }} + run: | + set -xe + poetry install -C src/promptflow-evals --with dev + poetry show -C src/promptflow-evals + - name: run pylint + working-directory: ${{ env.WORKING_DIRECTORY }} + run: | + cd src/promptflow-evals + poetry run pylint promptflow/evals --rcfile=../../pylintrc \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e01af824d3c..85f891cdf9a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -44,3 +44,56 @@ repos: # Use black profile for isort to avoid conflicts # see https://github.com/PyCQA/isort/issues/1518 args: ["--profile", "black", --line-length=120] + - repo: local + hooks: + - id: pylint + name: pylint + entry: python + language: system + args: [ -m, pylint, --rcfile=pylintrc, --output-format=parseable ] + types: [python] + - repo: local + hooks: + - id: pylint-dependencies-check + name: pylint-dependencies-check + entry: python + language: system + types: [python] + args: + - "-c" + - | + import os + import sys + + import pkg_resources + + # These are the versions that run in our CI + dependencies = [ + ( + "azure-pylint-guidelines-checker", + "0.3.1", + [ + "--index-url", + "https://pkgs.dev.azure.com/azure-sdk/public/_packaging/azure-sdk-for-python/pypi/simple/", + ], + ), + 
("pylint", "3.0.3", []), + ] + + # Make sure that correct versions are installed + for packagename, required_version, install_args in dependencies: + try: + version = pkg_resources.get_distribution(packagename).version + assert version == required_version + except AssertionError: + print( + f"Version mismatch: Installed version '{version}' of '{packagename}' does not match required version {required_version}" + ) + except pkg_resources.DistributionNotFound: + print(f"Package '{packagename}' is not installed") + else: + continue + + print(f"Please run the following command to install the correct version of {packagename}") + print(f"\tpython -m pip install {packagename}=={required_version} {' '.join(install_args)}") + sys.exit(1) diff --git a/pylintrc b/pylintrc new file mode 100644 index 00000000000..0b8147bab77 --- /dev/null +++ b/pylintrc @@ -0,0 +1,57 @@ +[MASTER] +py-version=3.9 +ignore-patterns=test_*,conftest,setup,.*_utils\.py +ignore-paths=src\promptflow-evals\tests,src\promptflow-evals\samples,promptflow\evals\evaluate\_telemetry,promptflow\evals\evaluate\_batch_run_client\code_client.py,promptflow\evals\evaluate\_batch_run_client\proxy_client.py +reports=no +load-plugins=pylint_guidelines_checker + +[MESSAGES CONTROL] +# For all codes, run 'pylint --list-msgs' or go to 'https://pylint.pycqa.org/en/latest/technical_reference/features.html' +# locally-disabled: Warning locally suppressed using disable-msg +# cyclic-import: because of https://github.com/PyCQA/pylint/issues/850 +# too-many-arguments: Due to the nature of the CLI many commands have large arguments set which reflect in large arguments set in corresponding methods. +# Let's black deal with bad-continuation + +# Added disables from super-with-arguments +disable=useless-object-inheritance,missing-timeout,missing-client-constructor-parameter-kwargs,logging-fstring-interpolation,locally-disabled,fixme,cyclic-import,unnecessary-lambda-assignment,client-method-missing-type-annotations,too-many-arguments,invalid-name,duplicate-code,too-few-public-methods,consider-using-f-string,super-with-arguments,redefined-builtin,import-outside-toplevel,client-suffix-needed,unnecessary-dunder-call,unnecessary-ellipsis,client-paging-methods-use-list,docstring-keyword-should-match-keyword-only,docstring-type-do-not-use-class,client-accepts-api-version-keyword,networking-import-outside-azure-core-transport,protected-access,missing-module-docstring,missing-client-constructor-parameter-credential + +[FORMAT] +max-line-length=120 + +[VARIABLES] +# Tells whether we should check for unused import in __init__ files. +init-import=yes + +[DESIGN] +# Maximum number of locals for function / method body +max-locals=25 +# Maximum number of branch for function / method body +max-branches=20 +# Maximum number of instance attributes for class +max-attributes=10 +# Maximum number of ancestors +max-parents=15 + +[SIMILARITIES] +min-similarity-lines=10 + +[BASIC] +# Naming hints based on PEP 8 (https://www.python.org/dev/peps/pep-0008/#naming-conventions). +# Consider these guidelines and not hard rules. Read PEP 8 for more details. + +# The invalid-name checker must be **enabled** for these hints to be used. 
+include-naming-hint=yes + +module-naming-style=snake_case +const-naming-style=UPPER_CASE +class-naming-style=PascalCase +class-attribute-naming-style=snake_case +attr-naming-style=snake_case +method-naming-style=snake_case +function-naming-style=snake_case +argument-naming-style=snake_case +variable-naming-style=snake_case +inlinevar-naming-style=snake_case + +[TYPECHECK] +generated-members=js.* diff --git a/src/promptflow-evals/promptflow/evals/_constants.py b/src/promptflow-evals/promptflow/evals/_constants.py index 5166f6a464e..b980142cb29 100644 --- a/src/promptflow-evals/promptflow/evals/_constants.py +++ b/src/promptflow-evals/promptflow/evals/_constants.py @@ -1,4 +1,10 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# --------------------------------------------------------- + + class EvaluationMetrics: + """Metrics for model evaluation.""" GPT_GROUNDEDNESS = "gpt_groundedness" GPT_RELEVANCE = "gpt_relevance" GPT_COHERENCE = "gpt_coherence" @@ -14,9 +20,10 @@ class EvaluationMetrics: class Prefixes: - _INPUTS = "inputs." - _OUTPUTS = "outputs." - _TGT_OUTPUTS = "__outputs." + """Column prefixes for inputs and outputs.""" + INPUTS = "inputs." + OUTPUTS = "outputs." + TSG_OUTPUTS = "__outputs." DEFAULT_EVALUATION_RESULTS_FILE_NAME = "evaluation_results.json" diff --git a/src/promptflow-evals/promptflow/evals/_version.py b/src/promptflow-evals/promptflow/evals/_version.py index bc5ffac1fb9..7a1f1b66965 100644 --- a/src/promptflow-evals/promptflow/evals/_version.py +++ b/src/promptflow-evals/promptflow/evals/_version.py @@ -6,7 +6,7 @@ try: __version__ = importlib.metadata.version("promptflow-evals") -except BaseException: - __version__ = '0.0.1.dev0' +except BaseException: # pylint: disable=broad-exception-caught + __version__ = "0.0.1.dev0" VERSION: str = __version__ diff --git a/src/promptflow-evals/promptflow/evals/evaluate/_batch_run_client/batch_run_context.py b/src/promptflow-evals/promptflow/evals/evaluate/_batch_run_client/batch_run_context.py index 1d188c98ce7..7f4a50e1afa 100644 --- a/src/promptflow-evals/promptflow/evals/evaluate/_batch_run_client/batch_run_context.py +++ b/src/promptflow-evals/promptflow/evals/evaluate/_batch_run_client/batch_run_context.py @@ -15,7 +15,15 @@ class BatchRunContext: - def __init__(self, client): + """Context manager for batch run clients. + + :param client: The client to run in the context. 
+ :type client: Union[ + ~promptflow.evals.evaluate.code_client.CodeClient, + ~promptflow.evals.evaluate.proxy_client.ProxyClient + ] + """ + def __init__(self, client) -> None: self.client = client self._is_timeout_set_by_system = False diff --git a/src/promptflow-evals/promptflow/evals/evaluate/_batch_run_client/code_client.py b/src/promptflow-evals/promptflow/evals/evaluate/_batch_run_client/code_client.py index e791d35bdc8..97358439a17 100644 --- a/src/promptflow-evals/promptflow/evals/evaluate/_batch_run_client/code_client.py +++ b/src/promptflow-evals/promptflow/evals/evaluate/_batch_run_client/code_client.py @@ -38,7 +38,7 @@ def get_aggregated_metrics(self): if self.aggregated_metrics is not None else None ) - except Exception as ex: + except Exception as ex: # pylint: disable=broad-exception-caught LOGGER.debug(f"Error calculating metrics for evaluator {self.evaluator_name}, failed with error {str(ex)}") aggregated_metrics = None @@ -107,7 +107,7 @@ def _calculate_aggregations(self, evaluator, run): aggr_func = getattr(evaluator, "__aggregate__") aggregated_output = aggr_func(aggregate_input) return aggregated_output - except Exception as ex: + except Exception as ex: # pylint: disable=broad-exception-caught LOGGER.warning( f"Error calculating aggregations for evaluator {run.evaluator_name}," f" failed with error {str(ex)}" ) @@ -118,11 +118,15 @@ def run(self, flow, data, evaluator_name=None, column_mapping=None, **kwargs): if not isinstance(input_df, pd.DataFrame): try: json_data = load_jsonl(data) - except json.JSONDecodeError: - raise ValueError(f"Failed to parse data as JSON: {data}. Please provide a valid json lines data.") + except json.JSONDecodeError as exc: + raise ValueError( + f"Failed to parse data as JSON: {data}. Please provide a valid json lines data." 
+ ) from exc input_df = pd.DataFrame(json_data) - eval_future = self._thread_pool.submit(self._calculate_metric, flow, input_df, column_mapping, evaluator_name) + eval_future = self._thread_pool.submit( + self._calculate_metric, flow, input_df, column_mapping, evaluator_name + ) # pylint: disable=specify-parameter-names-in-call run = CodeRun(run=eval_future, input_data=data, evaluator_name=evaluator_name, aggregated_metrics=None) aggregation_future = self._thread_pool.submit(self._calculate_aggregations, evaluator=flow, run=run) run.aggregated_metrics = aggregation_future @@ -137,7 +141,7 @@ def get_metrics(self, run): aggregated_metrics = run.get_aggregated_metrics() print("Aggregated metrics") print(aggregated_metrics) - except Exception as ex: + except Exception as ex: # pylint: disable=broad-exception-caught LOGGER.debug(f"Error calculating metrics for evaluator {run.evaluator_name}, failed with error {str(ex)}") return None return aggregated_metrics diff --git a/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py b/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py index 01898087ebd..74bee5c579a 100644 --- a/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py +++ b/src/promptflow-evals/promptflow/evals/evaluate/_eval_run.py @@ -7,17 +7,17 @@ import logging import os import posixpath -import requests import time import uuid from typing import Any, Dict, Optional, Set from urllib.parse import urlparse +import requests from requests.adapters import HTTPAdapter from urllib3.util.retry import Retry -from promptflow.evals._version import VERSION from promptflow._sdk.entities import Run +from promptflow.evals._version import VERSION LOGGER = logging.getLogger(__name__) @@ -50,30 +50,32 @@ class RunInfo: run_name: str @staticmethod - def generate(run_name: Optional[str]) -> 'RunInfo': + def generate(run_name: Optional[str]) -> "RunInfo": """ Generate the new RunInfo instance with the RunID and Experiment ID. **Note:** This code is used when we are in failed state and cannot get a run. + :param run_name: The name of a run. - :type run_name: str + :type run_name: Optional[str] + :return: The RunInfo instance. + :rtype: promptflow.evals.evaluate.RunInfo """ - return RunInfo( - str(uuid.uuid4()), - str(uuid.uuid4()), - run_name or "" - ) + return RunInfo(str(uuid.uuid4()), str(uuid.uuid4()), run_name or "") class RunStatus(enum.Enum): """Run states.""" + NOT_STARTED = 0 STARTED = 1 BROKEN = 2 TERMINATED = 3 -class EvalRun(contextlib.AbstractContextManager): +class EvalRun( + contextlib.AbstractContextManager +): # pylint: disable=too-many-instance-attributes,docstring-missing-param """ The simple singleton run class, used for accessing artifact store. @@ -88,7 +90,7 @@ class EvalRun(contextlib.AbstractContextManager): :param workspace_name: The name of workspace/project used to track run. :type workspace_name: str :param ml_client: The ml client used for authentication into Azure. 
- :type ml_client: MLClient + :type ml_client: azure.ai.ml.MLClient :param promptflow_run: The promptflow run used by the """ @@ -97,22 +99,19 @@ class EvalRun(contextlib.AbstractContextManager): _TIMEOUT = 5 _SCOPE = "https://management.azure.com/.default" - EVALUATION_ARTIFACT = 'instance_results.jsonl' - EVALUATION_ARTIFACT_DUMMY_RUN = 'eval_results.jsonl' - - def __init__(self, - run_name: Optional[str], - tracking_uri: str, - subscription_id: str, - group_name: str, - workspace_name: str, - ml_client: Any, - promptflow_run: Optional[Run] = None, - ): - """ - Constructor - """ - + EVALUATION_ARTIFACT = "instance_results.jsonl" + EVALUATION_ARTIFACT_DUMMY_RUN = "eval_results.jsonl" + + def __init__( + self, + run_name: Optional[str], + tracking_uri: str, + subscription_id: str, + group_name: str, + workspace_name: str, + ml_client: "MLClient", + promptflow_run: Optional[Run] = None, + ) -> None: self._tracking_uri: str = tracking_uri self._subscription_id: str = subscription_id self._resource_group_name: str = group_name @@ -122,6 +121,8 @@ def __init__(self, self._run_name = run_name self._promptflow_run = promptflow_run self._status = RunStatus.NOT_STARTED + self._url_base = None + self.info = None @property def status(self) -> RunStatus: @@ -129,6 +130,7 @@ def status(self) -> RunStatus: Return the run status. :return: The status of the run. + :rtype: promptflow._sdk._constants.RunStatus """ return self._status @@ -136,8 +138,6 @@ def _get_scope(self) -> str: """ Return the scope information for the workspace. - :param workspace_object: The workspace object. - :type workspace_object: azureml.core.workspace.Workspace :return: The scope information for the workspace. :rtype: str """ @@ -153,13 +153,12 @@ def _start_run(self) -> None: """ Start the run, or, if it is not applicable (for example, if tracking is not enabled), mark it as started. """ - self._check_state_and_log('start run', - {v for v in RunStatus if v != RunStatus.NOT_STARTED}, - True) + self._check_state_and_log("start run", {v for v in RunStatus if v != RunStatus.NOT_STARTED}, True) self._status = RunStatus.STARTED if self._tracking_uri is None: - LOGGER.warning("tracking_uri was not provided, " - "The results will be saved locally, but will not be logged to Azure.") + LOGGER.warning( + "A tracking_uri was not provided, The results will be saved locally, but will not be logged to Azure." + ) self._url_base = None self._status = RunStatus.BROKEN self.info = RunInfo.generate(self._run_name) @@ -167,9 +166,7 @@ def _start_run(self) -> None: self._url_base = urlparse(self._tracking_uri).netloc if self._promptflow_run is not None: self.info = RunInfo( - self._promptflow_run.name, - self._promptflow_run._experiment_name, - self._promptflow_run.name + self._promptflow_run.name, self._promptflow_run._experiment_name, self._promptflow_run.name ) else: url = f"https://{self._url_base}/mlflow/v2.0" f"{self._get_scope()}/api/2.0/mlflow/runs/create" @@ -181,36 +178,34 @@ def _start_run(self) -> None: } if self._run_name: body["run_name"] = self._run_name - response = self.request_with_retry( - url=url, - method='POST', - json_dict=body - ) + response = self.request_with_retry(url=url, method="POST", json_dict=body) if response.status_code != 200: self.info = RunInfo.generate(self._run_name) - LOGGER.warning(f"The run failed to start: {response.status_code}: {response.text}." - "The results will be saved locally, but will not be logged to Azure.") + LOGGER.warning( + f"The run failed to start: {response.status_code}: {response.text}." 
+ "The results will be saved locally, but will not be logged to Azure." + ) self._status = RunStatus.BROKEN else: parsed_response = response.json() self.info = RunInfo( - run_id=parsed_response['run']['info']['run_id'], - experiment_id=parsed_response['run']['info']['experiment_id'], - run_name=parsed_response['run']['info']['run_name'] + run_id=parsed_response["run"]["info"]["run_id"], + experiment_id=parsed_response["run"]["info"]["experiment_id"], + run_name=parsed_response["run"]["info"]["run_name"], ) self._status = RunStatus.STARTED def _end_run(self, reason: str) -> None: """ - Tetminate the run. + Terminate the run. - :param reason: One of "FINISHED" "FAILED" and "KILLED" + :param reason: Reason for run termination. Possible values are "FINISHED" "FAILED", and "KILLED" :type reason: str - :raises: ValueError if the run is not in ("FINISHED", "FAILED", "KILLED") + :raises ValueError: Raised if the run is not in ("FINISHED", "FAILED", "KILLED") """ - if not self._check_state_and_log('stop run', - {RunStatus.BROKEN, RunStatus.NOT_STARTED, RunStatus.TERMINATED}, - False): + if not self._check_state_and_log( + "stop run", {RunStatus.BROKEN, RunStatus.NOT_STARTED, RunStatus.TERMINATED}, False + ): return if self._is_promptflow_run: # This run is already finished, we just add artifacts/metrics to it. @@ -233,17 +228,24 @@ def _end_run(self, reason: str) -> None: self._status = RunStatus.TERMINATED def __enter__(self): - """The Context Manager enter call.""" + """The Context Manager enter call. + + :return: The instance of the class. + :rtype: promptflow.evals.evaluate.EvalRun + """ self._start_run() return self - def __exit__(self, exc_type, exc_value, exc_tb): + def __exit__(self, exc_type, exc_value, exc_tb): # pylint: disable=docstring-missing-param """The context manager exit call.""" self._end_run("FINISHED") def get_run_history_uri(self) -> str: """ - Return the run history service URI. + Get the run history service URI. + + :return: The run history service URI. + :rtype: str """ return ( f"https://{self._url_base}" @@ -254,20 +256,27 @@ def get_run_history_uri(self) -> str: def get_artifacts_uri(self) -> str: """ - Returns the url to upload the artifacts. + Gets the URI to upload the artifacts to. + + :return: The URI to upload the artifacts to. + :rtype: str """ return self.get_run_history_uri() + "/artifacts/batch/metadata" def get_metrics_url(self): """ Return the url needed to track the mlflow metrics. + + :return: The url needed to track the mlflow metrics. + :rtype: str """ return f"https://{self._url_base}" "/mlflow/v2.0" f"{self._get_scope()}" f"/api/2.0/mlflow/runs/log-metric" def _get_token(self): # We have to use lazy import because promptflow.azure # is an optional dependency. - from promptflow.azure._utils._token_cache import ArmTokenCache + from promptflow.azure._utils._token_cache import ArmTokenCache # pylint: disable=import-error,no-name-in-module + return ArmTokenCache().get_token(self._ml_client._credential) def request_with_retry( @@ -278,13 +287,14 @@ def request_with_retry( :param url: The url to send the request to. :type url: str - :param auth_token: Azure authentication token - :type auth_token: str or None :param method: The request method to be used. :type method: str :param json_dict: The json dictionary (not serialized) to be sent. - :type json_dict: dict. - :return: The requests.Response object. + :type json_dict: Dict[str, Any] + :param headers: The headers to be sent with the request. 
+ :type headers: Optional[Dict[str, str]] + :return: The response + :rtype: requests.Response """ if headers is None: headers = {} @@ -320,30 +330,26 @@ def _log_warning(self, failed_op: str, response: requests.Response) -> None: f"{response.text=}." ) - def _check_state_and_log( - self, - action: str, - bad_states: Set[RunStatus], - should_raise: bool) -> bool: + def _check_state_and_log(self, action: str, bad_states: Set[RunStatus], should_raise: bool) -> bool: """ Check that the run is in the correct state and log worning if it is not. :param action: Action, which caused this check. For example if it is "log artifact", - the log message will start "Unable to log artifact." + the log message will start "Unable to log artifact." :type action: str :param bad_states: The states, considered invalid for given action. - :type bad_states: set - :param should_raise: Should we raise an error if the bad state has been encountered? + :type bad_states: Set[RunStatus] + :param should_raise: Should we raise an error if the bad state has been encountered :type should_raise: bool :raises: RuntimeError if should_raise is True and invalid state was encountered. - :return: boolean saying if run is in the correct state. + :return: Whether or not run is in the correct state. + :rtype: bool """ if self._status in bad_states: msg = f"Unable to {action} due to Run status={self._status}." if should_raise: raise RuntimeError(msg) - else: - LOGGER.warning(msg) + LOGGER.warning(msg) return False return True @@ -354,10 +360,14 @@ def log_artifact(self, artifact_folder: str, artifact_name: str = EVALUATION_ART **Note:** In the current implementation we are not using the thread pool executor as it is done in azureml-mlflow, instead we are just running upload in cycle as we are not expecting uploading a lot of artifacts. + :param artifact_folder: The folder with artifacts to be uploaded. :type artifact_folder: str + :param artifact_name: The name of the artifact to be uploaded. Defaults to + promptflow.evals.evaluate.EvalRun.EVALUATION_ARTIFACT. + :type artifact_name: str """ - if not self._check_state_and_log('log artifact', {RunStatus.BROKEN, RunStatus.NOT_STARTED}, False): + if not self._check_state_and_log("log artifact", {RunStatus.BROKEN, RunStatus.NOT_STARTED}, False): return # Check if artifact dirrectory is empty or does not exist. if not os.path.isdir(artifact_folder): @@ -370,15 +380,15 @@ def log_artifact(self, artifact_folder: str, artifact_name: str = EVALUATION_ART LOGGER.warning("The run results file was not found, skipping artifacts upload.") return # First we will list the files and the appropriate remote paths for them. - root_upload_path = posixpath.join("promptflow", 'PromptFlowArtifacts', self.info.run_name) - remote_paths = {'paths': []} + root_upload_path = posixpath.join("promptflow", "PromptFlowArtifacts", self.info.run_name) + remote_paths = {"paths": []} local_paths = [] # Go over the artifact folder and upload all artifacts. 
for (root, _, filenames) in os.walk(artifact_folder): upload_path = root_upload_path if root != artifact_folder: rel_path = os.path.relpath(root, artifact_folder) - if rel_path != '.': + if rel_path != ".": upload_path = posixpath.join(root_upload_path, rel_path) for f in filenames: remote_file_path = posixpath.join(upload_path, f) @@ -389,13 +399,10 @@ def log_artifact(self, artifact_folder: str, artifact_name: str = EVALUATION_ART # We will write the artifacts to the workspaceblobstore datastore = self._ml_client.datastores.get_default(include_secrets=True) account_url = f"{datastore.account_name}.blob.{datastore.endpoint}" - svc_client = BlobServiceClient( - account_url=account_url, credential=self._get_datastore_credential(datastore)) - for local, remote in zip(local_paths, remote_paths['paths']): - blob_client = svc_client.get_blob_client( - container=datastore.container_name, - blob=remote['path']) - with open(local, 'rb') as fp: + svc_client = BlobServiceClient(account_url=account_url, credential=self._get_datastore_credential(datastore)) + for local, remote in zip(local_paths, remote_paths["paths"]): + blob_client = svc_client.get_blob_client(container=datastore.container_name, blob=remote["path"]) + with open(local, "rb") as fp: blob_client.upload_blob(fp, overwrite=True) # To show artifact in UI we will need to register it. If it is a promptflow run, @@ -422,7 +429,7 @@ def log_artifact(self, artifact_folder: str, artifact_name: str = EVALUATION_ART }, ) if response.status_code != 200: - self._log_warning('register artifact', response) + self._log_warning("register artifact", response) def _get_datastore_credential(self, datastore: "Datastore"): # Reference the logic in azure.ai.ml._artifact._artifact_utilities @@ -430,10 +437,9 @@ def _get_datastore_credential(self, datastore: "Datastore"): credential = datastore.credentials if isinstance(credential, AccountKeyConfiguration): return credential.account_key - elif hasattr(credential, "sas_token"): + if hasattr(credential, "sas_token"): return credential.sas_token - else: - return self._ml_client.datastores._credential + return self._ml_client.datastores._credential def log_metric(self, key: str, value: float) -> None: """ @@ -444,7 +450,7 @@ def log_metric(self, key: str, value: float) -> None: :param value: The valure to be logged. :type value: float """ - if not self._check_state_and_log('log metric', {RunStatus.BROKEN, RunStatus.NOT_STARTED}, False): + if not self._check_state_and_log("log metric", {RunStatus.BROKEN, RunStatus.NOT_STARTED}, False): return body = { "run_uuid": self.info.run_id, @@ -469,7 +475,7 @@ def write_properties_to_run_history(self, properties: Dict[str, Any]) -> None: :param properties: The properties to be written to run history. 
:type properties: dict """ - if not self._check_state_and_log('write properties', {RunStatus.BROKEN, RunStatus.NOT_STARTED}, False): + if not self._check_state_and_log("write properties", {RunStatus.BROKEN, RunStatus.NOT_STARTED}, False): return # update host to run history and request PATCH API response = self.request_with_retry( diff --git a/src/promptflow-evals/promptflow/evals/evaluate/_evaluate.py b/src/promptflow-evals/promptflow/evals/evaluate/_evaluate.py index 54ddca281be..8f2b979afb9 100644 --- a/src/promptflow-evals/promptflow/evals/evaluate/_evaluate.py +++ b/src/promptflow-evals/promptflow/evals/evaluate/_evaluate.py @@ -10,16 +10,16 @@ from promptflow._sdk._constants import LINE_NUMBER from promptflow.client import PFClient -from ._telemetry import log_evaluate_activity from .._constants import CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT, EvaluationMetrics, Prefixes from .._user_agent import USER_AGENT from ._batch_run_client import BatchRunContext, CodeClient, ProxyClient +from ._telemetry import log_evaluate_activity from ._utils import ( _apply_column_mapping, _log_metrics_and_instance_results, - _write_output, _trace_destination_from_project_scope, + _write_output, ) @@ -79,8 +79,7 @@ def _validate_input_data_for_evaluator(evaluator, evaluator_name, df_data, is_ta if missing_inputs: if not is_target_fn: raise ValueError(f"Missing required inputs for evaluator {evaluator_name} : {missing_inputs}.") - else: - raise ValueError(f"Missing required inputs for target : {missing_inputs}.") + raise ValueError(f"Missing required inputs for target : {missing_inputs}.") def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name): @@ -114,7 +113,9 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj try: initial_data_df = pd.read_json(data, lines=True) except Exception as e: - raise ValueError(f"Failed to load data from {data}. Please validate it is a valid jsonl data. Error: {str(e)}.") + raise ValueError( + f"Failed to load data from {data}. Please validate it is a valid jsonl data. Error: {str(e)}." + ) from e return initial_data_df @@ -128,16 +129,19 @@ def _validate_columns( """ Check that all columns needed by evaluator or target function are present. - :keyword df: The data frame to be validated. - :paramtype df: pd.DataFrame - :keyword evaluators: The dictionary of evaluators. - :paramtype evaluators: Dict[str, Any] - :keyword target: The callable to be applied to data set. - :paramtype target: Optional[Callable] + :param df: The data frame to be validated. + :type df: pd.DataFrame + :param evaluators: The dictionary of evaluators. + :type evaluators: Dict[str, Any] + :param target: The callable to be applied to data set. + :type target: Optional[Callable] + :param evaluator_config: The configuration for evaluators. + :type evaluator_config: Dict[str, Dict[str, str]] + :raises ValueError: If column starts from "__outputs." while target is defined. """ if target: - if any(c.startswith(Prefixes._TGT_OUTPUTS) for c in df.columns): - raise ValueError("The column cannot start from " f'"{Prefixes._TGT_OUTPUTS}" if target was defined.') + if any(c.startswith(Prefixes.TSG_OUTPUTS) for c in df.columns): + raise ValueError("The column cannot start from " f'"{Prefixes.TSG_OUTPUTS}" if target was defined.') # If the target function is given, it may return # several columns and hence we cannot check the availability of columns # without knowing target function semantics. 
@@ -164,18 +168,20 @@ def _apply_target_to_data( """ Apply the target function to the data set and return updated data and generated columns. - :keyword target: The function to be applied to data. - :paramtype target: Callable - :keyword data: The path to input jsonl file. - :paramtype data: str - :keyword pf_client: The promptflow client to be used. - :paramtype pf_client: PFClient - :keyword initial_data: The data frame with the loaded data. - :paramtype initial_data: pd.DataFrame - :keyword _run_name: The name of target run. Used for testing only. - :paramtype _run_name: Optional[str] + :param target: The function to be applied to data. + :type target: Callable + :param data: The path to input jsonl file. + :type data: str + :param pf_client: The promptflow client to be used. + :type pf_client: PFClient + :param initial_data: The data frame with the loaded data. + :type initial_data: pd.DataFrame + :param evaluation_name: The name of the evaluation. + :type evaluation_name: Optional[str] + :param _run_name: The name of target run. Used for testing only. + :type _run_name: Optional[str] :return: The tuple, containing data frame and the list of added columns. - :rtype: Tuple[pd.DataFrame, List[str]] + :rtype: Tuple[pandas.DataFrame, List[str]] """ # We are manually creating the temporary directory for the flow # because the way tempdir remove temporary directories will @@ -191,7 +197,7 @@ def _apply_target_to_data( target_output = pf_client.runs.get_details(run, all_results=True) # Remove input and output prefix generated_columns = { - col[len(Prefixes._OUTPUTS) :] for col in target_output.columns if col.startswith(Prefixes._OUTPUTS) + col[len(Prefixes.OUTPUTS) :] for col in target_output.columns if col.startswith(Prefixes.OUTPUTS) } # Sort output by line numbers target_output.set_index(f"inputs.{LINE_NUMBER}", inplace=True) @@ -202,15 +208,22 @@ def _apply_target_to_data( drop_columns = list(filter(lambda x: x.startswith("inputs"), target_output.columns)) target_output.drop(drop_columns, inplace=True, axis=1) # Rename outputs columns to __outputs - rename_dict = {col: col.replace(Prefixes._OUTPUTS, Prefixes._TGT_OUTPUTS) for col in target_output.columns} + rename_dict = {col: col.replace(Prefixes.OUTPUTS, Prefixes.TSG_OUTPUTS) for col in target_output.columns} target_output.rename(columns=rename_dict, inplace=True) # Concatenate output to input target_output = pd.concat([target_output, initial_data], axis=1) + return target_output, generated_columns, run -def _process_evaluator_config(evaluator_config: Dict[str, Dict[str, str]]): - """Process evaluator_config to replace ${target.} with ${data.}""" +def _process_evaluator_config(evaluator_config: Dict[str, Dict[str, str]]) -> Dict[str, Dict[str, str]]: + """Process evaluator_config to replace ${target.} with ${data.} + + :param evaluator_config: The configuration for evaluators. + :type evaluator_config: Dict[str, Dict[str, str]] + :return: The processed configuration. 
+ :rtype: Dict[str, Dict[str, str]] + """ processed_config = {} @@ -222,7 +235,6 @@ def _process_evaluator_config(evaluator_config: Dict[str, Dict[str, str]]): processed_config[evaluator] = {} for map_to_key, map_value in mapping_config.items(): - # Check if there's any unexpected reference other than ${target.} or ${data.} if unexpected_references.search(map_value): raise ValueError( @@ -236,21 +248,24 @@ def _process_evaluator_config(evaluator_config: Dict[str, Dict[str, str]]): return processed_config -def _rename_columns_conditionally(df: pd.DataFrame): +def _rename_columns_conditionally(df: pd.DataFrame) -> pd.DataFrame: """ Change the column names for data frame. The change happens inplace. The columns with _OUTPUTS prefix will not be changed. _OUTPUTS prefix will will be added to columns in target_generated set. The rest columns will get ".inputs" prefix. + :param df: The data frame to apply changes to. + :type df: pandas.DataFrame :return: The changed data frame. + :rtype: pandas.DataFrame """ rename_dict = {} for col in df.columns: # Rename columns generated by target. - if Prefixes._TGT_OUTPUTS in col: - rename_dict[col] = col.replace(Prefixes._TGT_OUTPUTS, Prefixes._OUTPUTS) + if Prefixes.TSG_OUTPUTS in col: + rename_dict[col] = col.replace(Prefixes.TSG_OUTPUTS, Prefixes.OUTPUTS) else: rename_dict[col] = f"inputs.{col}" df.rename(columns=rename_dict, inplace=True) @@ -360,12 +375,12 @@ def evaluate( " if __name__ == '__main__':\n" " evaluate(...)" ) - raise RuntimeError(error_message) + raise RuntimeError(error_message) from e raise e -def _evaluate( +def _evaluate( # pylint: disable=too-many-locals *, evaluation_name: Optional[str] = None, target: Optional[Callable] = None, @@ -376,7 +391,6 @@ def _evaluate( output_path: Optional[str] = None, **kwargs, ): - input_data_df = _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name) # Process evaluator config to replace ${target.} with ${data.} @@ -387,8 +401,9 @@ def _evaluate( # Target Run pf_client = PFClient( - config={ - "trace.destination": _trace_destination_from_project_scope(azure_ai_project)} if azure_ai_project else None, + config={"trace.destination": _trace_destination_from_project_scope(azure_ai_project)} + if azure_ai_project + else None, user_agent=USER_AGENT, ) @@ -418,7 +433,7 @@ def _evaluate( # We will add our mapping only if # customer did not mapped target output. if col not in mapping and run_output not in mapped_to_values: - evaluator_config[evaluator_name][col] = run_output + evaluator_config[evaluator_name][col] = run_output # pylint: disable=unnecessary-dict-index-lookup # After we have generated all columns we can check if we have # everything we need for evaluators. 
@@ -455,14 +470,14 @@ def _evaluate( # drop input columns evaluator_result_df = evaluator_result_df.drop( - columns=[col for col in evaluator_result_df.columns if str(col).startswith(Prefixes._INPUTS)] + columns=[col for col in evaluator_result_df.columns if str(col).startswith(Prefixes.INPUTS)] ) # rename output columns # Assuming after removing inputs columns, all columns are output columns evaluator_result_df.rename( columns={ - col: f"outputs.{evaluator_name}.{str(col).replace(Prefixes._OUTPUTS, '')}" + col: f"outputs.{evaluator_name}.{str(col).replace(Prefixes.OUTPUTS, '')}" for col in evaluator_result_df.columns }, inplace=True, @@ -486,7 +501,11 @@ def _evaluate( metrics.update(evaluators_metric) studio_url = _log_metrics_and_instance_results( - metrics, result_df, trace_destination, target_run, evaluation_name, + metrics, + result_df, + trace_destination, + target_run, + evaluation_name, ) result = {"rows": result_df.to_dict("records"), "metrics": metrics, "studio_url": studio_url} diff --git a/src/promptflow-evals/promptflow/evals/evaluate/_telemetry/__init__.py b/src/promptflow-evals/promptflow/evals/evaluate/_telemetry/__init__.py index b9c92b7a579..9bd505b41e0 100644 --- a/src/promptflow-evals/promptflow/evals/evaluate/_telemetry/__init__.py +++ b/src/promptflow-evals/promptflow/evals/evaluate/_telemetry/__init__.py @@ -5,45 +5,63 @@ __path__ = __import__("pkgutil").extend_path(__path__, __name__) import functools +import inspect import json import logging -import inspect +from typing import Callable, Dict import pandas as pd -from .._utils import _trace_destination_from_project_scope -from ..._user_agent import USER_AGENT -from promptflow.core import Prompty as prompty_core -from promptflow._sdk.entities._flows import Prompty as prompty_sdk, FlexFlow as flex_flow +from promptflow._sdk.entities._flows import FlexFlow as flex_flow +from promptflow._sdk.entities._flows import Prompty as prompty_sdk from promptflow._sdk.entities._flows.dag import Flow as dag_flow from promptflow.client import PFClient +from promptflow.core import Prompty as prompty_core + +from ..._user_agent import USER_AGENT +from .._utils import _trace_destination_from_project_scope LOGGER = logging.getLogger(__name__) -def _get_evaluator_type(evaluator): +def _get_evaluator_type(evaluator: Dict[str, Callable]): """ - Get evaluator type for telemetry. Possible values are "built-in", "custom" and "content-safety" + Get evaluator type for telemetry. + + :param evaluator: The evaluator object + :type evaluator: Dict[str, Callable] + :return: The evaluator type. Possible values are "built-in", "custom", and "content-safety". + :rtype: str """ built_in = False content_safety = False module = inspect.getmodule(evaluator) - built_in = (module and module.__name__.startswith("promptflow.evals.evaluators.")) - + built_in = module and module.__name__.startswith("promptflow.evals.evaluators.") if built_in: content_safety = module.__name__.startswith("promptflow.evals.evaluators._content_safety") - return "content-safety" if content_safety else "built-in" if built_in else "custom" + if content_safety: + return "content-safety" + if built_in: + return "built-in" + return "custom" def _get_evaluator_properties(evaluator, evaluator_name): """ - Get evaluator properties for telemetry - It gets name, pf_type, and type - name : tries best to get the most meaningful name for the evaluator - pf_type : The type of promptflow being used - type : The type of evaluator being used. 
Possible values are "built-in", "custom" and "content-safety" + Get evaluator properties for telemetry. + + :param: evaluator: The evaluator object + :param: evaluator_name: The alias for the evaluator + :type: str + :raises Exception: If the evaluator properties cannot be retrieved + :return: A dictionary containing the evaluator properties, including + "name": A name for the evaluator + "pf_type": The promptflow type being used + "type": The evaluator type. Accepted values are "built-in", "custom", and "content-safety" + "alias": The alias for the evaluator. Defaults to an empty string. + :rtype: Dict[str, str] """ try: @@ -65,7 +83,7 @@ def _get_evaluator_properties(evaluator, evaluator_name): # fallback option name = str(evaluator) pf_type = "Unknown" - except Exception as e: + except Exception as e: # pylint: disable=broad-exception-caught LOGGER.debug(f"Failed to get evaluator properties: {e}") name = str(evaluator) pf_type = "Unknown" @@ -74,16 +92,20 @@ def _get_evaluator_properties(evaluator, evaluator_name): "name": name, "pf_type": pf_type, "type": _get_evaluator_type(evaluator), - "alias": evaluator_name if evaluator_name else "" + "alias": evaluator_name if evaluator_name else "", } # cspell:ignore isna -def log_evaluate_activity(func): - """Decorator to log evaluate activity""" +def log_evaluate_activity(func) -> None: + """Decorator to log evaluate activity + + :param func: The function to be decorated + :type func: Callable + """ @functools.wraps(func) - def wrapper(*args, **kwargs): + def wrapper(*args, **kwargs) -> Callable: from promptflow._sdk._telemetry import ActivityType, log_activity from promptflow._sdk._telemetry.telemetry import get_telemetry_logger @@ -91,23 +113,28 @@ def wrapper(*args, **kwargs): azure_ai_project = kwargs.get("azure_ai_project", None) pf_client = PFClient( - config={ - "trace.destination": _trace_destination_from_project_scope( - azure_ai_project)} if azure_ai_project else None, + config={"trace.destination": _trace_destination_from_project_scope(azure_ai_project)} + if azure_ai_project + else None, user_agent=USER_AGENT, ) - track_in_cloud = True if pf_client._config.get_trace_destination() else False - evaluate_target = True if kwargs.get("target", None) else False - evaluator_config = True if kwargs.get("evaluator_config", None) else False + track_in_cloud = bool(pf_client._config.get_trace_destination()) + evaluate_target = bool(kwargs.get("target", None)) + evaluator_config = bool(kwargs.get("evaluator_config", None)) custom_dimensions = { "track_in_cloud": track_in_cloud, "evaluate_target": evaluate_target, "evaluator_config": evaluator_config, } - with log_activity(get_telemetry_logger(), "pf.evals.evaluate", activity_type=ActivityType.PUBLICAPI, - user_agent=USER_AGENT, custom_dimensions=custom_dimensions): + with log_activity( + get_telemetry_logger(), + "pf.evals.evaluate", + activity_type=ActivityType.PUBLICAPI, + user_agent=USER_AGENT, + custom_dimensions=custom_dimensions, + ): result = func(*args, **kwargs) try: @@ -115,27 +142,31 @@ def wrapper(*args, **kwargs): for evaluator_name, evaluator in evaluators.items(): evaluator_info = _get_evaluator_properties(evaluator, evaluator_name) try: - evaluator_df = pd.DataFrame(result.get("rows", [])).filter(like=f"outputs.{evaluator_name}", - axis=1) + evaluator_df = pd.DataFrame(result.get("rows", [])).filter( + like=f"outputs.{evaluator_name}", axis=1 + ) - failed_rows = evaluator_df.shape[0] if evaluator_df.empty else int( - evaluator_df.isna().any(axis=1).sum()) + failed_rows = ( + 
evaluator_df.shape[0] if evaluator_df.empty else int(evaluator_df.isna().any(axis=1).sum()) + ) total_rows = evaluator_df.shape[0] evaluator_info["failed_rows"] = failed_rows evaluator_info["total_rows"] = total_rows - except Exception as e: + except Exception as e: # pylint: disable=broad-exception-caught LOGGER.debug(f"Failed to collect evaluate failed row info for {evaluator_name}: {e}") evaluators_info.append(evaluator_info) - custom_dimensions = { - "evaluators_info": json.dumps(evaluators_info) - } - with log_activity(get_telemetry_logger(), "pf.evals.evaluate_usage_info", - activity_type=ActivityType.PUBLICAPI, user_agent=USER_AGENT, - custom_dimensions=custom_dimensions): + custom_dimensions = {"evaluators_info": json.dumps(evaluators_info)} + with log_activity( + get_telemetry_logger(), + "pf.evals.evaluate_usage_info", + activity_type=ActivityType.PUBLICAPI, + user_agent=USER_AGENT, + custom_dimensions=custom_dimensions, + ): pass - except Exception as e: + except Exception as e: # pylint: disable=broad-exception-caught LOGGER.debug(f"Failed to collect evaluate usage info: {e}") return result diff --git a/src/promptflow-evals/promptflow/evals/evaluate/_utils.py b/src/promptflow-evals/promptflow/evals/evaluate/_utils.py index 35685469ba3..fbd0ab27104 100644 --- a/src/promptflow-evals/promptflow/evals/evaluate/_utils.py +++ b/src/promptflow-evals/promptflow/evals/evaluate/_utils.py @@ -28,7 +28,7 @@ def is_none(value): return value is None or str(value).lower() == "none" -def extract_workspace_triad_from_trace_provider(trace_provider: str): +def extract_workspace_triad_from_trace_provider(trace_provider: str): # pylint: disable=name-too-long match = re.match(AZURE_WORKSPACE_REGEX_FORMAT, trace_provider) if not match or len(match.groups()) != 5: raise ValueError( @@ -181,7 +181,7 @@ def _apply_column_mapping(source_df: pd.DataFrame, mapping_config: dict, inplace map_from_key = pattern[len(pattern_prefix) :] elif pattern.startswith(run_outputs_prefix): # Target-generated columns always starts from .outputs. - map_from_key = f"{Prefixes._TGT_OUTPUTS}{pattern[len(run_outputs_prefix) :]}" + map_from_key = f"{Prefixes.TSG_OUTPUTS}{pattern[len(run_outputs_prefix) :]}" # if we are not renaming anything, skip. if map_from_key == map_to_key: continue diff --git a/src/promptflow-evals/promptflow/evals/evaluators/_chat/_chat.py b/src/promptflow-evals/promptflow/evals/evaluators/_chat/_chat.py index 3ca45d1edd7..e5525706b60 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/_chat/_chat.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/_chat/_chat.py @@ -96,9 +96,9 @@ def __call__(self, *, conversation, **kwargs): """ Evaluates chat scenario. - :param conversation: The conversation to be evaluated. Each turn should have "role" and "content" keys. + :keyword conversation: The conversation to be evaluated. Each turn should have "role" and "content" keys. "context" key is optional for assistant's turn and should have "citations" key with list of citations. - :type conversation: List[Dict] + :paramtype conversation: List[Dict] :return: The scores for Chat scenario. 
:rtype: dict """ @@ -197,7 +197,7 @@ def _evaluate_turn(self, turn_num, questions, answers, contexts, evaluator): score = evaluator(question=question, answer=answer, context=context) return score - except Exception as e: + except Exception as e: # pylint: disable=broad-exception-caught logger.warning( f"Evaluator {evaluator.__class__.__name__} failed for turn {turn_num + 1} with exception: {e}" ) @@ -243,8 +243,7 @@ def _validate_conversation(self, conversation: List[Dict]): one_based_turn_num = turn_num + 1 if not isinstance(turn, dict): - raise ValueError( - f"Each turn in 'conversation' must be a dictionary. Turn number: {one_based_turn_num}") + raise ValueError(f"Each turn in 'conversation' must be a dictionary. Turn number: {one_based_turn_num}") if "role" not in turn or "content" not in turn: raise ValueError( diff --git a/src/promptflow-evals/promptflow/evals/evaluators/_chat/retrieval/_retrieval.py b/src/promptflow-evals/promptflow/evals/evaluators/_chat/retrieval/_retrieval.py index c0e307cd0c1..fce82e5ed99 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/_chat/retrieval/_retrieval.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/_chat/retrieval/_retrieval.py @@ -67,8 +67,8 @@ def __init__(self, model_config: AzureOpenAIModelConfiguration): def __call__(self, *, conversation, **kwargs): """Evaluates retrieval score chat scenario. - :param conversation: The conversation to be evaluated. - :type conversation: List[Dict] + :keyword conversation: The conversation to be evaluated. + :paramtype conversation: List[Dict] :return: The scores for Chat scenario. :rtype: dict """ @@ -90,9 +90,9 @@ def __call__(self, *, conversation, **kwargs): # Evaluate each turn per_turn_scores = [] history = [] - for turn_num in range(len(questions)): + for turn_num, question in enumerate(questions): try: - question = questions[turn_num] if turn_num < len(questions) else "" + question = question if turn_num < len(questions) else "" answer = answers[turn_num] if turn_num < len(answers) else "" context = contexts[turn_num] if turn_num < len(contexts) else "" @@ -107,7 +107,7 @@ def __call__(self, *, conversation, **kwargs): per_turn_scores.append(score) - except Exception as e: + except Exception as e: # pylint: disable=broad-exception-caught logger.warning( f"Evaluator {self.__class__.__name__} failed for turn {turn_num + 1} with exception: {e}" ) diff --git a/src/promptflow-evals/promptflow/evals/evaluators/_coherence/_coherence.py b/src/promptflow-evals/promptflow/evals/evaluators/_coherence/_coherence.py index 937863f2aa1..74aaebe1377 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/_coherence/_coherence.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/_coherence/_coherence.py @@ -22,9 +22,8 @@ def __init__(self, model_config: AzureOpenAIModelConfiguration): model_config.api_version = "2024-02-15-preview" prompty_model_config = {"configuration": model_config} - prompty_model_config.update( - {"parameters": {"extra_headers": {"x-ms-useragent": USER_AGENT}}} - ) if USER_AGENT and isinstance(model_config, AzureOpenAIModelConfiguration) else None + if USER_AGENT and isinstance(model_config, AzureOpenAIModelConfiguration): + prompty_model_config.update({"parameters": {"extra_headers": {"x-ms-useragent": USER_AGENT}}}) current_dir = os.path.dirname(__file__) prompty_path = os.path.join(current_dir, "coherence.prompty") self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config) @@ -81,12 +80,12 @@ def __call__(self, *, question: str, answer: str, 
**kwargs): """ Evaluate coherence. - :param question: The question to be evaluated. - :type question: str - :param answer: The answer to be evaluated. - :type answer: str + :keyword question: The question to be evaluated. + :paramtype question: str + :keyword answer: The answer to be evaluated. + :paramtype answer: str :return: The coherence score. - :rtype: dict + :rtype: Dict[str, float] """ return async_run_allowing_running_loop(self._async_evaluator, question=question, answer=answer, **kwargs) diff --git a/src/promptflow-evals/promptflow/evals/evaluators/_content_safety/_content_safety.py b/src/promptflow-evals/promptflow/evals/evaluators/_content_safety/_content_safety.py index e7357b90f54..8bf5c4533f0 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/_content_safety/_content_safety.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/_content_safety/_content_safety.py @@ -77,12 +77,12 @@ def __call__(self, *, question: str, answer: str, **kwargs): """ Evaluates content-safety metrics for "question-answering" scenario. - :param question: The question to be evaluated. - :type question: str - :param answer: The answer to be evaluated. - :type answer: str - :param parallel: Whether to evaluate in parallel. - :type parallel: bool + :keyword question: The question to be evaluated. + :paramtype question: str + :keyword answer: The answer to be evaluated. + :paramtype answer: str + :keyword parallel: Whether to evaluate in parallel. + :paramtype parallel: bool :return: The scores for content-safety. :rtype: dict """ diff --git a/src/promptflow-evals/promptflow/evals/evaluators/_content_safety/_content_safety_chat.py b/src/promptflow-evals/promptflow/evals/evaluators/_content_safety/_content_safety_chat.py index dc6756d0000..7d3134e2385 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/_content_safety/_content_safety_chat.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/_content_safety/_content_safety_chat.py @@ -95,8 +95,8 @@ def __call__(self, *, conversation, **kwargs): """ Evaluates content-safety metrics for "chat" scenario. - :param conversation: The conversation to be evaluated. Each turn should have "role" and "content" keys. - :type conversation: List[Dict] + :keyword conversation: The conversation to be evaluated. Each turn should have "role" and "content" keys. + :paramtype conversation: List[Dict] :return: The scores for Chat scenario. :rtype: dict """ @@ -154,7 +154,7 @@ def _evaluate_turn(self, turn_num, questions, answers, evaluator): score = evaluator(question=question, answer=answer) return score - except Exception as e: + except Exception as e: # pylint: disable=broad-exception-caught logger.warning( f"Evaluator {evaluator.__class__.__name__} failed for turn {turn_num + 1} with exception: {e}" ) @@ -209,8 +209,7 @@ def _validate_conversation(self, conversation: List[Dict]): one_based_turn_num = turn_num + 1 if not isinstance(turn, dict): - raise ValueError( - f"Each turn in 'conversation' must be a dictionary. Turn number: {one_based_turn_num}") + raise ValueError(f"Each turn in 'conversation' must be a dictionary. 
Turn number: {one_based_turn_num}") if "role" not in turn or "content" not in turn: raise ValueError( @@ -245,7 +244,7 @@ def _get_harm_severity_level(self, harm_score: float) -> str: return np.nan for harm_level, harm_score_range in HARM_SEVERITY_LEVEL_MAPPING.items(): - if harm_score >= harm_score_range[0] and harm_score <= harm_score_range[1]: + if harm_score_range[0] <= harm_score <= harm_score_range[1]: return harm_level return np.nan diff --git a/src/promptflow-evals/promptflow/evals/evaluators/_content_safety/_content_safety_sub_evaluator_base.py b/src/promptflow-evals/promptflow/evals/evaluators/_content_safety/_content_safety_sub_evaluator_base.py index 9c69747f715..36a4bdbd768 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/_content_safety/_content_safety_sub_evaluator_base.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/_content_safety/_content_safety_sub_evaluator_base.py @@ -23,12 +23,12 @@ class ContentSafetySubEvaluatorBase(ABC): :type metric: ~promptflow.evals.evaluators._content_safety.flow.constants.EvaluationMetrics :param project_scope: The scope of the Azure AI project. It contains subscription id, resource group, and project name. - :type project_scope: dict + :type project_scope: Dict :param credential: The credential for connecting to Azure AI project. :type credential: TokenCredential """ - def __init__(self, metric: EvaluationMetrics, project_scope: dict, credential=None): + def __init__(self, metric: EvaluationMetrics, project_scope: dict, credential=None): self._metric = metric self._project_scope = project_scope self._credential = credential @@ -37,18 +37,17 @@ def __call__(self, *, question: str, answer: str, **kwargs): """ Evaluates content according to this evaluator's metric. - :param question: The question to be evaluated. - :type question: str - :param answer: The answer to be evaluated. - :type answer: str - :return: The evaluation score. - :rtype: dict + :keyword question: The question to be evaluated. + :paramtype question: str + :keyword answer: The answer to be evaluated. + :paramtype answer: str + :return: The evaluation score computation based on the Content Safety metric (self.metric). + :rtype: Any """ # Validate inputs # Raises value error if failed, so execution alone signifies success. _ = validate_inputs(question=question, answer=answer) - # question: str, answer: str, metric_name: str, project_scope: dict, credential: TokenCredential # Run score computation based on supplied metric. result = evaluate_with_rai_service( metric_name=self._metric, diff --git a/src/promptflow-evals/promptflow/evals/evaluators/_content_safety/_hate_unfairness.py b/src/promptflow-evals/promptflow/evals/evaluators/_content_safety/_hate_unfairness.py index 0a9a28e6f4c..9799d8c247b 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/_content_safety/_hate_unfairness.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/_content_safety/_hate_unfairness.py @@ -1,12 +1,13 @@ # --------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. 
# --------------------------------------------------------- + try: - from .common.constants import EvaluationMetrics from ._content_safety_sub_evaluator_base import ContentSafetySubEvaluatorBase + from .common.constants import EvaluationMetrics except ImportError: - from common.constants import EvaluationMetrics from _content_safety_sub_evaluator_base import ContentSafetySubEvaluatorBase + from common.constants import EvaluationMetrics class HateUnfairnessEvaluator(ContentSafetySubEvaluatorBase): @@ -42,7 +43,7 @@ class HateUnfairnessEvaluator(ContentSafetySubEvaluatorBase): } """ - def __init__(self, project_scope: dict, credential=None): + def __init__(self, project_scope: dict, credential=None) -> None: # Hate_fairness is the actual backend metric name. Which, uh, doesn't sound great. # so invert the name. super().__init__( @@ -50,16 +51,3 @@ def __init__(self, project_scope: dict, credential=None): project_scope=project_scope, credential=credential, ) - - def __call__(self, *, question: str, answer: str, **kwargs): - """ - Evaluates hate unfairness content. - - :param question: The question to be evaluated. - :type question: str - :param answer: The answer to be evaluated. - :type answer: str - :return: The hate unfairness score. - :rtype: dict - """ - return super().__call__(question=question, answer=answer, **kwargs) diff --git a/src/promptflow-evals/promptflow/evals/evaluators/_content_safety/_self_harm.py b/src/promptflow-evals/promptflow/evals/evaluators/_content_safety/_self_harm.py index 4c9d85107be..fcb957491b7 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/_content_safety/_self_harm.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/_content_safety/_self_harm.py @@ -2,11 +2,11 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- try: - from .common.constants import EvaluationMetrics from ._content_safety_sub_evaluator_base import ContentSafetySubEvaluatorBase + from .common.constants import EvaluationMetrics except ImportError: - from common.constants import EvaluationMetrics from _content_safety_sub_evaluator_base import ContentSafetySubEvaluatorBase + from common.constants import EvaluationMetrics class SelfHarmEvaluator(ContentSafetySubEvaluatorBase): @@ -48,17 +48,3 @@ def __init__(self, project_scope: dict, credential=None): project_scope=project_scope, credential=credential, ) - - def __call__(self, *, question: str, answer: str, **kwargs): - """ - Evaluates self harm content. - - :param question: The question to be evaluated. - :type question: str - :param answer: The answer to be evaluated. - :type answer: str - :return: The self harm score. - :rtype: dict - """ - - return super().__call__(question=question, answer=answer, **kwargs) diff --git a/src/promptflow-evals/promptflow/evals/evaluators/_content_safety/_sexual.py b/src/promptflow-evals/promptflow/evals/evaluators/_content_safety/_sexual.py index 17430926150..28f401729bb 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/_content_safety/_sexual.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/_content_safety/_sexual.py @@ -2,11 +2,11 @@ # Copyright (c) Microsoft Corporation. All rights reserved. 
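The `__call__` overrides deleted from the hate-unfairness, self-harm, sexual, and violence evaluators in these hunks were pure pass-throughs to `ContentSafetySubEvaluatorBase.__call__`, so removing them leaves call sites unchanged. A minimal sketch of the pattern with hypothetical stand-in names (the real base class validates inputs and scores them via the RAI service):

```python
class SubEvaluatorBase:
    """Stand-in for ContentSafetySubEvaluatorBase: owns the shared __call__ logic."""

    def __init__(self, metric: str) -> None:
        self._metric = metric

    def __call__(self, *, question: str, answer: str, **kwargs) -> dict:
        # The real implementation validates inputs and calls the RAI service here.
        return {f"{self._metric}_score": f"scored ({question!r}, {answer!r})"}


class ViolenceEvaluator(SubEvaluatorBase):
    """Only selects the metric; __call__ is inherited unchanged from the base class."""

    def __init__(self) -> None:
        super().__init__(metric="violence")


print(ViolenceEvaluator()(question="What happened?", answer="Nothing violent."))
```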
# --------------------------------------------------------- try: - from .common.constants import EvaluationMetrics from ._content_safety_sub_evaluator_base import ContentSafetySubEvaluatorBase + from .common.constants import EvaluationMetrics except ImportError: - from common.constants import EvaluationMetrics from _content_safety_sub_evaluator_base import ContentSafetySubEvaluatorBase + from common.constants import EvaluationMetrics class SexualEvaluator(ContentSafetySubEvaluatorBase): @@ -48,16 +48,3 @@ def __init__(self, project_scope: dict, credential=None): project_scope=project_scope, credential=credential, ) - - def __call__(self, *, question: str, answer: str, **kwargs): - """ - Evaluates sexual content. - - :param question: The question to be evaluated. - :type question: str - :param answer: The answer to be evaluated. - :type answer: str - :return: The sexual score. - :rtype: dict - """ - return super().__call__(question=question, answer=answer, **kwargs) diff --git a/src/promptflow-evals/promptflow/evals/evaluators/_content_safety/_violence.py b/src/promptflow-evals/promptflow/evals/evaluators/_content_safety/_violence.py index 9411c20645a..0d883210c0d 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/_content_safety/_violence.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/_content_safety/_violence.py @@ -2,11 +2,11 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- try: - from .common.constants import EvaluationMetrics from ._content_safety_sub_evaluator_base import ContentSafetySubEvaluatorBase + from .common.constants import EvaluationMetrics except ImportError: - from common.constants import EvaluationMetrics from _content_safety_sub_evaluator_base import ContentSafetySubEvaluatorBase + from common.constants import EvaluationMetrics class ViolenceEvaluator(ContentSafetySubEvaluatorBase): @@ -48,16 +48,3 @@ def __init__(self, project_scope: dict, credential=None): project_scope=project_scope, credential=credential, ) - - def __call__(self, *, question: str, answer: str, **kwargs): - """ - Evaluates violence content. - - :param question: The question to be evaluated. - :type question: str - :param answer: The answer to be evaluated. - :type answer: str - :return: The violence score. 
- :rtype: dict - """ - return super().__call__(question=question, answer=answer, **kwargs) diff --git a/src/promptflow-evals/promptflow/evals/evaluators/_content_safety/common/constants.py b/src/promptflow-evals/promptflow/evals/evaluators/_content_safety/common/constants.py index 5018688b174..b03d59c61da 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/_content_safety/common/constants.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/_content_safety/common/constants.py @@ -13,6 +13,8 @@ class RAIService: class HarmSeverityLevel(Enum): + """Harm severity levels.""" + VeryLow = "Very low" Low = "Low" Medium = "Medium" @@ -26,6 +28,8 @@ class Tasks: class EvaluationMetrics: + """Evaluation metrics for content safety evaluation.""" + HATE_FAIRNESS = "hate_fairness" HATE_UNFAIRNESS = "hate_unfairness" VIOLENCE = "violence" diff --git a/src/promptflow-evals/promptflow/evals/evaluators/_content_safety/common/evaluate_with_rai_service.py b/src/promptflow-evals/promptflow/evals/evaluators/_content_safety/common/evaluate_with_rai_service.py index bca72c451d9..00c231163c2 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/_content_safety/common/evaluate_with_rai_service.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/_content_safety/common/evaluate_with_rai_service.py @@ -1,7 +1,11 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# --------------------------------------------------------- import importlib.metadata import re import time -from typing import List +from ast import literal_eval +from typing import Dict, List from urllib.parse import urlparse import jwt @@ -24,7 +28,17 @@ USER_AGENT = "{}/{}".format("promptflow-evals", version) -def ensure_service_availability(rai_svc_url: str, token: str, capability: str = None): +def ensure_service_availability(rai_svc_url: str, token: str, capability: str = None) -> None: + """Check if the Responsible AI service is available in the region and has the required capability, if relevant. + + :param rai_svc_url: The Responsible AI service URL. + :type rai_svc_url: str + :param token: The Azure authentication token. + :type token: str + :param capability: The capability to check. Default is None. + :type capability: str + :raises Exception: If the service is not available in the region or the capability is not available. + """ headers = { "Authorization": f"Bearer {token}", "Content-Type": "application/json", @@ -35,15 +49,34 @@ def ensure_service_availability(rai_svc_url: str, token: str, capability: str = response = requests.get(svc_liveness_url, headers=headers) if response.status_code != 200: - raise Exception(f"RAI service is not available in this region. Status Code: {response.status_code}") + raise Exception( # pylint: disable=broad-exception-raised + f"RAI service is not available in this region. 
Status Code: {response.status_code}" + ) capabilities = response.json() if capability and capability not in capabilities: - raise Exception(f"Capability '{capability}' is not available in this region") - - -def submit_request(question: str, answer: str, metric: str, rai_svc_url: str, token: str): + raise Exception( # pylint: disable=broad-exception-raised + f"Capability '{capability}' is not available in this region" + ) + + +def submit_request(question: str, answer: str, metric: str, rai_svc_url: str, token: str) -> str: + """Submit request to Responsible AI service for evaluation and return operation ID + + :param question: The question to evaluate. + :type question: str + :param answer: The answer to evaluate. + :type answer: str + :param metric: The evaluation metric to use. + :type metric: str + :param rai_svc_url: The Responsible AI service URL. + :type rai_svc_url: str + :param token: The Azure authentication token. + :type token: str + :return: The operation ID. + :rtype: str + """ user_text = f"{question}{answer}" normalized_user_text = user_text.replace("'", '\\"') payload = {"UserTextList": [normalized_user_text], "AnnotationTask": Tasks.CONTENT_HARM, "MetricList": [metric]} @@ -65,7 +98,20 @@ def submit_request(question: str, answer: str, metric: str, rai_svc_url: str, to return operation_id -def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCredential, token: str): +def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCredential, token: str) -> Dict: + """Fetch the annotation result from Responsible AI service + + :param operation_id: The operation ID. + :type operation_id: str + :param rai_svc_url: The Responsible AI service URL. + :type rai_svc_url: str + :param credential: The Azure authentication credential. + :type credential: TokenCredential + :param token: The Azure authentication token. + :type token: str + :return: The annotation result. + :rtype: Dict + """ start = time.time() request_count = 0 @@ -86,7 +132,18 @@ def fetch_result(operation_id: str, rai_svc_url: str, credential: TokenCredentia time.sleep(sleep_time) -def parse_response(batch_response: List[dict], metric_name: str) -> List[List[dict]]: +def parse_response( # pylint: disable=too-many-branches,too-many-statements + batch_response: List[Dict], metric_name: str +) -> List[List[dict]]: + """Parse the annotation response from Responsible AI service + + :param batch_response: The annotation response from Responsible AI service. + :type batch_response: List[Dict] + :param metric_name: The evaluation metric to use. + :type metric_name: str + :return: The parsed annotation result. 
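`parse_response`, continued just below, now parses the annotation string with `ast.literal_eval` rather than `eval`, so the payload is restricted to Python literals and cannot execute arbitrary code. A small illustrative sketch; the field names here are assumptions for illustration, not the service's actual schema:

```python
from ast import literal_eval

raw = "{'label': 4, 'reasoning': 'graphic depiction of violence', 'version': '0.3'}"
parsed = literal_eval(raw)  # literals only; arbitrary expressions raise an error
print(parsed["label"], "-", parsed["reasoning"])

# JSON-style booleans are not Python literals, so a payload like this needs
# normalizing before it can be parsed.
raw_bool = "{'flagged': true}"
print(literal_eval(raw_bool.replace("true", "True"))["flagged"])
```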
+ :rtype: List[List[Dict]] + """ # Fix the metric name if it's "hate_fairness" # Eventually we will remove this fix once the RAI service is updated key = metric_name @@ -100,17 +157,17 @@ def parse_response(batch_response: List[dict], metric_name: str) -> List[List[di return result try: - harm_response = eval(response[metric_name]) + harm_response = literal_eval(response[metric_name]) except NameError as e: # fix the eval error if there's "true" in the response m = re.findall(r"name '(\w+)' is not defined", str(e)) if m: for word in m: response[metric_name] = response[metric_name].replace(word, word.title()) - harm_response = eval(response[metric_name]) + harm_response = literal_eval(response[metric_name]) else: harm_response = "" - except Exception: + except Exception: # pylint: disable=broad-exception-caught harm_response = response[metric_name] if harm_response != "" and isinstance(harm_response, dict): @@ -140,8 +197,8 @@ def parse_response(batch_response: List[dict], metric_name: str) -> List[List[di else: metric_value = np.nan reason = harm_response - elif harm_response != "" and (isinstance(harm_response, int) or isinstance(harm_response, float)): - if harm_response >= 0 and harm_response <= 7: + elif harm_response != "" and isinstance(harm_response, (int, float)): + if 0 < harm_response <= 7: metric_value = harm_response else: metric_value = np.nan @@ -158,7 +215,16 @@ def parse_response(batch_response: List[dict], metric_name: str) -> List[List[di return result -def _get_service_discovery_url(azure_ai_project, token): +def _get_service_discovery_url(azure_ai_project: dict, token: str) -> str: + """Get the discovery service URL for the Azure AI project + + :param azure_ai_project: The Azure AI project details. + :type azure_ai_project: Dict + :param token: The Azure authentication token. + :type token: str + :return: The discovery service URL. + :rtype: str + """ headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"} response = requests.get( f"https://management.azure.com/subscriptions/{azure_ai_project['subscription_id']}/" @@ -169,12 +235,21 @@ def _get_service_discovery_url(azure_ai_project, token): timeout=5, ) if response.status_code != 200: - raise Exception("Failed to retrieve the discovery service URL") + raise Exception("Failed to retrieve the discovery service URL") # pylint: disable=broad-exception-raised base_url = urlparse(response.json()["properties"]["discoveryUrl"]) return f"{base_url.scheme}://{base_url.netloc}" -def get_rai_svc_url(project_scope: dict, token: str): +def get_rai_svc_url(project_scope: dict, token: str) -> str: + """Get the Responsible AI service URL + + :param project_scope: The Azure AI project scope details. + :type project_scope: Dict + :param token: The Azure authentication token. + :type token: str + :return: The Responsible AI service URL. + :rtype: str + """ discovery_url = _get_service_discovery_url(azure_ai_project=project_scope, token=token) subscription_id = project_scope["subscription_id"] resource_group_name = project_scope["resource_group_name"] @@ -189,7 +264,15 @@ def get_rai_svc_url(project_scope: dict, token: str): return rai_url -def fetch_or_reuse_token(credential: TokenCredential, token: str = None): +def fetch_or_reuse_token(credential: TokenCredential, token: str = None) -> str: + """Get token. Fetch a new token if the current token is near expiry + + :param credential: The Azure authentication credential. + :type credential: TokenCredential + :param token: The Azure authentication token. 
Defaults to None. If none, a new token will be fetched. + :type token: str + :return: The Azure authentication token. + """ acquire_new_token = True try: if token: @@ -201,7 +284,7 @@ def fetch_or_reuse_token(credential: TokenCredential, token: str = None): # Check if the token is near expiry if (exp_time - current_time) >= 300: acquire_new_token = False - except Exception: + except Exception: # pylint: disable=broad-exception-caught pass if acquire_new_token: @@ -213,6 +296,21 @@ def fetch_or_reuse_token(credential: TokenCredential, token: str = None): def evaluate_with_rai_service( question: str, answer: str, metric_name: str, project_scope: dict, credential: TokenCredential ): + """ "Evaluate the content safety of the answer using Responsible AI service + + :param question: The question to evaluate. + :type question: str + :param answer: The answer to evaluate. + :type answer: str + :param metric_name: The evaluation metric to use. + :type metric_name: str + :param project_scope: The Azure AI project scope details. + :type project_scope: Dict + :param credential: The Azure authentication credential. + :type credential: TokenCredential + :return: The parsed annotation result. + :rtype: List[List[Dict]] + """ # Use DefaultAzureCredential if no credential is provided # This is for the for batch run scenario as the credential cannot be serialized by promoptflow if credential is None or credential == {}: diff --git a/src/promptflow-evals/promptflow/evals/evaluators/_content_safety/common/utils.py b/src/promptflow-evals/promptflow/evals/evaluators/_content_safety/common/utils.py index a7741046e89..48f4d80e468 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/_content_safety/common/utils.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/_content_safety/common/utils.py @@ -9,6 +9,13 @@ def get_harm_severity_level(harm_score: int) -> str: + """Generate harm severity level based on harm score. + + :param harm_score: The harm score to be evaluated. + :type harm_score: int + :return: The harm severity level. If harm score is None or numpy.nan, returns numpy.nan. + :rtype: str + """ HARM_SEVERITY_LEVEL_MAPPING = { constants.HarmSeverityLevel.VeryLow: [0, 1], constants.HarmSeverityLevel.Low: [2, 3], @@ -18,6 +25,6 @@ def get_harm_severity_level(harm_score: int) -> str: if harm_score == np.nan or harm_score is None: return np.nan for harm_level, harm_score_range in HARM_SEVERITY_LEVEL_MAPPING.items(): - if harm_score >= harm_score_range[0] and harm_score <= harm_score_range[1]: + if harm_score_range[0] <= harm_score <= harm_score_range[1]: return harm_level.value return np.nan diff --git a/src/promptflow-evals/promptflow/evals/evaluators/_content_safety/common/validate_inputs.py b/src/promptflow-evals/promptflow/evals/evaluators/_content_safety/common/validate_inputs.py index a6083b8ddab..af14460fef3 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/_content_safety/common/validate_inputs.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/_content_safety/common/validate_inputs.py @@ -5,7 +5,17 @@ @tool -def validate_inputs(question: str, answer: str): +def validate_inputs(question: str, answer: str) -> bool: + """Validate input parameters for content safety evaluation. + + :param question: The question to be evaluated. + :type question: str + :param answer: The answer to be evaluated. + :type answer: str + :raises ValueError: If input parameters are invalid. + :return: True if input parameters are valid. 
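`fetch_or_reuse_token`, documented above, only requests a fresh token when the current one is missing or its `exp` claim is within five minutes of expiry. A minimal sketch of that check using PyJWT, assuming an `azure.identity` credential and the management scope used elsewhere in this module:

```python
import time

import jwt
from azure.identity import DefaultAzureCredential


def fetch_or_reuse(credential, token=None, scope="https://management.azure.com/.default"):
    """Reuse `token` unless it expires within 5 minutes; otherwise fetch a new one."""
    if token:
        # Only the expiry claim is needed, so signature verification is skipped.
        claims = jwt.decode(token, options={"verify_signature": False})
        if claims["exp"] - time.time() >= 300:
            return token
    return credential.get_token(scope).token


# token = fetch_or_reuse(DefaultAzureCredential())
```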
+ :rtype: bool + """ # Validate input parameters if not (question and question.strip() and question != "None") or not ( answer and answer.strip() and answer != "None" diff --git a/src/promptflow-evals/promptflow/evals/evaluators/_f1_score/_f1_score.py b/src/promptflow-evals/promptflow/evals/evaluators/_f1_score/_f1_score.py index ed88a351ddd..f7f54174b27 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/_f1_score/_f1_score.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/_f1_score/_f1_score.py @@ -3,6 +3,7 @@ # --------------------------------------------------------- from collections import Counter +from typing import List class F1ScoreEvaluator: @@ -35,10 +36,10 @@ def __call__(self, *, answer: str, ground_truth: str, **kwargs): """ Evaluate F1 score. - :param answer: The answer to be evaluated. - :type answer: str - :param ground_truth: The ground truth to be evaluated. - :type ground_truth: str + :keyword answer: The answer to be evaluated. + :paramtype answer: str + :keyword ground_truth: The ground truth to be evaluated. + :paramtype ground_truth: str :return: The F1 score. :rtype: dict """ @@ -67,17 +68,27 @@ def _compute_f1_score(cls, answer: str, ground_truth: str) -> str: import string class QASplitTokenizer: - def __call__(self, line): + """Quality assurance tokenizer that splits text on whitespace.""" + + def __call__(self, line) -> List[str]: """Tokenizes an input line using split() on whitespace - :param line: a segment to tokenize - :return: the tokenized line + :param line: The input segment to be tokenized + :type line: str + :return: The tokenized segment + :rtype: List[str] """ return line.split() - def normalize_text(text) -> str: - """Lower text and remove punctuation, articles and extra whitespace.""" + def normalize_text(text: str) -> str: + """Lower text and remove punctuation, articles and extra whitespace. + + :param text: The text to be normalized + :type text: str + :return: The normalized text + :rtype: str + """ def remove_articles(text): return re.sub(r"\b(a|an|the)\b", " ", text) diff --git a/src/promptflow-evals/promptflow/evals/evaluators/_fluency/_fluency.py b/src/promptflow-evals/promptflow/evals/evaluators/_fluency/_fluency.py index c5027f8b707..5f76b04c058 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/_fluency/_fluency.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/_fluency/_fluency.py @@ -22,9 +22,10 @@ def __init__(self, model_config: AzureOpenAIModelConfiguration): model_config.api_version = "2024-02-15-preview" prompty_model_config = {"configuration": model_config} - prompty_model_config.update( - {"parameters": {"extra_headers": {"x-ms-useragent": USER_AGENT}}} - ) if USER_AGENT and isinstance(model_config, AzureOpenAIModelConfiguration) else None + if USER_AGENT and isinstance(model_config, AzureOpenAIModelConfiguration): + prompty_model_config.update( + {"parameters": {"extra_headers": {"x-ms-useragent": USER_AGENT}}} + ) current_dir = os.path.dirname(__file__) prompty_path = os.path.join(current_dir, "fluency.prompty") self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config) @@ -81,10 +82,10 @@ def __call__(self, *, question: str, answer: str, **kwargs): """ Evaluate fluency. - :param question: The question to be evaluated. - :type question: str - :param answer: The answer to be evaluated. - :type answer: str + :keyword question: The question to be evaluated. + :paramtype question: str + :keyword answer: The answer to be evaluated. 
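The `QASplitTokenizer` and `normalize_text` helpers documented in the F1 hunk above feed a token-overlap F1 computation; the `Counter` import added at the top of the file suggests the usual precision/recall-over-shared-tokens formulation, but the scoring body itself falls outside this hunk, so the sketch below is an assumption of that standard approach:

```python
import re
import string
from collections import Counter


def normalize_text(text: str) -> str:
    """Lowercase, strip punctuation and articles, and collapse whitespace."""
    text = text.lower()
    text = "".join(ch for ch in text if ch not in string.punctuation)
    text = re.sub(r"\b(a|an|the)\b", " ", text)
    return " ".join(text.split())


def f1_score(answer: str, ground_truth: str) -> float:
    """Token-overlap F1 between a normalized answer and ground truth."""
    answer_tokens = normalize_text(answer).split()
    truth_tokens = normalize_text(ground_truth).split()
    common = Counter(answer_tokens) & Counter(truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(answer_tokens)
    recall = num_same / len(truth_tokens)
    return 2 * precision * recall / (precision + recall)


print(round(f1_score("The capital of France is Paris", "Paris is the capital of France"), 3))  # 1.0
```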
+ :paramtype answer: str :return: The fluency score. :rtype: dict """ diff --git a/src/promptflow-evals/promptflow/evals/evaluators/_groundedness/_groundedness.py b/src/promptflow-evals/promptflow/evals/evaluators/_groundedness/_groundedness.py index 706ae477158..2de2ce04db8 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/_groundedness/_groundedness.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/_groundedness/_groundedness.py @@ -22,9 +22,10 @@ def __init__(self, model_config: AzureOpenAIModelConfiguration): model_config.api_version = "2024-02-15-preview" prompty_model_config = {"configuration": model_config} - prompty_model_config.update( - {"parameters": {"extra_headers": {"x-ms-useragent": USER_AGENT}}} - ) if USER_AGENT and isinstance(model_config, AzureOpenAIModelConfiguration) else None + if USER_AGENT and isinstance(model_config, AzureOpenAIModelConfiguration): + prompty_model_config.update( + {"parameters": {"extra_headers": {"x-ms-useragent": USER_AGENT}}} + ) current_dir = os.path.dirname(__file__) prompty_path = os.path.join(current_dir, "groundedness.prompty") self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config) @@ -34,7 +35,7 @@ async def __call__(self, *, answer: str, context: str, **kwargs): answer = str(answer or "") context = str(context or "") - if not (answer.strip()) or not (context.strip()): + if not answer.strip() or not context.strip(): raise ValueError("Both 'answer' and 'context' must be non-empty strings.") # Run the evaluation flow @@ -82,10 +83,10 @@ def __call__(self, *, answer: str, context: str, **kwargs): """ Evaluate groundedness of the answer in the context. - :param answer: The answer to be evaluated. - :type answer: str - :param context: The context in which the answer is evaluated. - :type context: str + :keyword answer: The answer to be evaluated. + :paramtype answer: str + :keyword context: The context in which the answer is evaluated. + :paramtype context: str :return: The groundedness score. :rtype: dict """ diff --git a/src/promptflow-evals/promptflow/evals/evaluators/_qa/_qa.py b/src/promptflow-evals/promptflow/evals/evaluators/_qa/_qa.py index 3887616a794..2c9f2952795 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/_qa/_qa.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/_qa/_qa.py @@ -66,16 +66,16 @@ def __call__(self, *, question: str, answer: str, context: str, ground_truth: st """ Evaluates question-answering scenario. - :param question: The question to be evaluated. - :type question: str - :param answer: The answer to be evaluated. - :type answer: str - :param context: The context to be evaluated. - :type context: str - :param ground_truth: The ground truth to be evaluated. - :type ground_truth: str - :param parallel: Whether to evaluate in parallel. Defaults to True. - :type parallel: bool + :keyword question: The question to be evaluated. + :paramtype question: str + :keyword answer: The answer to be evaluated. + :paramtype answer: str + :keyword context: The context to be evaluated. + :paramtype context: str + :keyword ground_truth: The ground truth to be evaluated. + :paramtype ground_truth: str + :keyword parallel: Whether to evaluate in parallel. Defaults to True. + :paramtype parallel: bool :return: The scores for QA scenario. 
:rtype: dict """ diff --git a/src/promptflow-evals/promptflow/evals/evaluators/_relevance/_relevance.py b/src/promptflow-evals/promptflow/evals/evaluators/_relevance/_relevance.py index 2dec30b13ea..34327bbbcda 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/_relevance/_relevance.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/_relevance/_relevance.py @@ -22,9 +22,10 @@ def __init__(self, model_config: AzureOpenAIModelConfiguration): model_config.api_version = "2024-02-15-preview" prompty_model_config = {"configuration": model_config} - prompty_model_config.update( - {"parameters": {"extra_headers": {"x-ms-useragent": USER_AGENT}}} - ) if USER_AGENT and isinstance(model_config, AzureOpenAIModelConfiguration) else None + if USER_AGENT and isinstance(model_config, AzureOpenAIModelConfiguration): + prompty_model_config.update( + {"parameters": {"extra_headers": {"x-ms-useragent": USER_AGENT}}} + ) current_dir = os.path.dirname(__file__) prompty_path = os.path.join(current_dir, "relevance.prompty") self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config) @@ -84,12 +85,12 @@ def __call__(self, *, question: str, answer: str, context: str, **kwargs): """ Evaluate relevance. - :param question: The question to be evaluated. - :type question: str - :param answer: The answer to be evaluated. - :type answer: str - :param context: The context to be evaluated. - :type context: str + :keyword question: The question to be evaluated. + :paramtype question: str + :keyword answer: The answer to be evaluated. + :paramtype answer: str + :keyword context: The context to be evaluated. + :paramtype context: str :return: The relevance score. :rtype: dict """ diff --git a/src/promptflow-evals/promptflow/evals/evaluators/_similarity/_similarity.py b/src/promptflow-evals/promptflow/evals/evaluators/_similarity/_similarity.py index 93ce98e3509..85884e6a08c 100644 --- a/src/promptflow-evals/promptflow/evals/evaluators/_similarity/_similarity.py +++ b/src/promptflow-evals/promptflow/evals/evaluators/_similarity/_similarity.py @@ -22,9 +22,10 @@ def __init__(self, model_config: AzureOpenAIModelConfiguration): model_config.api_version = "2024-02-15-preview" prompty_model_config = {"configuration": model_config} - prompty_model_config.update( - {"parameters": {"extra_headers": {"x-ms-useragent": USER_AGENT}}} - ) if USER_AGENT and isinstance(model_config, AzureOpenAIModelConfiguration) else None + if USER_AGENT and isinstance(model_config, AzureOpenAIModelConfiguration): + prompty_model_config.update( + {"parameters": {"extra_headers": {"x-ms-useragent": USER_AGENT}}} + ) current_dir = os.path.dirname(__file__) prompty_path = os.path.join(current_dir, "similarity.prompty") self._flow = AsyncPrompty.load(source=prompty_path, model=prompty_model_config) @@ -83,12 +84,12 @@ def __call__(self, *, question: str, answer: str, ground_truth: str, **kwargs): """ Evaluate similarity. - :param question: The question to be evaluated. - :type question: str - :param answer: The answer to be evaluated. - :type answer: str - :param ground_truth: The ground truth to be evaluated. - :type ground_truth: str + :keyword question: The question to be evaluated. + :paramtype question: str + :keyword answer: The answer to be evaluated. + :paramtype answer: str + :keyword ground_truth: The ground truth to be evaluated. + :paramtype ground_truth: str :return: The similarity score. 
:rtype: dict """ diff --git a/src/promptflow-evals/promptflow/evals/synthetic/_conversation/__init__.py b/src/promptflow-evals/promptflow/evals/synthetic/_conversation/__init__.py index 8a8cbd8c120..41b5936729c 100644 --- a/src/promptflow-evals/promptflow/evals/synthetic/_conversation/__init__.py +++ b/src/promptflow-evals/promptflow/evals/synthetic/_conversation/__init__.py @@ -7,7 +7,7 @@ import logging import time from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import jinja2 @@ -17,20 +17,59 @@ @dataclass class ConversationTurn: + """Class to represent a turn in a conversation. + + A "turn" involves only one exchange between the user and the chatbot. + + :param role: The role of the participant in the conversation. Accepted values are + "user" and "assistant". + :type role: ~promptflow.evals.synthetic._conversation.constants.ConversationRole + :param name: The name of the participant in the conversation. + :type name: Optional[str] + :param message: The message exchanged in the conversation. Defaults to an empty string. + :type message: str + :param full_response: The full response. + :type full_response: Optional[Any] + :param request: The request. + :type request: Optional[Any] + """ role: "ConversationRole" name: Optional[str] = None message: str = "" full_response: Optional[Any] = None request: Optional[Any] = None - def to_openai_chat_format(self, reverse: bool = False) -> dict: + def to_openai_chat_format(self, reverse: bool = False) -> Dict[str, str]: + """Convert the conversation turn to the OpenAI chat format. + + OpenAI chat format is a dictionary with two keys: "role" and "content". + + :param reverse: Whether to reverse the conversation turn. Defaults to False. + :type reverse: bool + :return: The conversation turn in the OpenAI chat format. + :rtype: Dict[str, str] + """ if reverse is False: return {"role": self.role.value, "content": self.message} if self.role == ConversationRole.ASSISTANT: return {"role": ConversationRole.USER.value, "content": self.message} return {"role": ConversationRole.ASSISTANT.value, "content": self.message} - def to_annotation_format(self, turn_number: int) -> dict: + def to_annotation_format(self, turn_number: int) -> Dict[str, Any]: + """Convert the conversation turn to an annotation format. + + Annotation format is a dictionary with the following keys: + - "turn_number": The turn number. + - "response": The response. + - "actor": The actor. + - "request": The request. + - "full_json_response": The full JSON response. + + :param turn_number: The turn number. + :type turn_number: int + :return: The conversation turn in the annotation format. + :rtype: Dict[str, Any] + """ return { "turn_number": turn_number, "response": self.message, @@ -44,6 +83,22 @@ def __str__(self) -> str: class ConversationBot: + """ + A conversation chat bot with a specific name, persona and a sentence that can be used as a conversation starter. + + :param role: The role of the bot in the conversation, either "user" or "assistant". + :type role: ~promptflow.evals.synthetic._conversation.constants.ConversationRole + :param model: The LLM model to use for generating responses. 
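`to_openai_chat_format`, documented above, reduces a `ConversationTurn` to the two-key dictionary the OpenAI chat API expects, and `reverse=True` swaps user and assistant so a simulated turn can be replayed from the other side. A small self-contained sketch of that mapping, using a simplified stand-in dataclass rather than the package's own:

```python
from dataclasses import dataclass
from enum import Enum


class ConversationRole(Enum):
    USER = "user"
    ASSISTANT = "assistant"


@dataclass
class Turn:
    role: ConversationRole
    message: str = ""

    def to_openai_chat_format(self, reverse: bool = False) -> dict:
        if not reverse:
            return {"role": self.role.value, "content": self.message}
        # Reversed view: the assistant's line becomes the user's, and vice versa.
        flipped = ConversationRole.USER if self.role == ConversationRole.ASSISTANT else ConversationRole.ASSISTANT
        return {"role": flipped.value, "content": self.message}


turn = Turn(role=ConversationRole.ASSISTANT, message="Hello! How can I help?")
print(turn.to_openai_chat_format())              # {'role': 'assistant', 'content': ...}
print(turn.to_openai_chat_format(reverse=True))  # {'role': 'user', 'content': ...}
```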
+ :type model: Union[ + ~promptflow.evals.synthetic._model_tools.LLMBase, + ~promptflow.evals.synthetic._model_tools.OpenAIChatCompletionsModel + ] + :param conversation_template: A Jinja2 template describing the conversation to generate the prompt for the LLM + :type conversation_template: str + :param instantiation_parameters: A dictionary of parameters used to instantiate the conversation template + :type instantiation_parameters: Dict[str, str] + """ + def __init__( self, *, @@ -51,20 +106,7 @@ def __init__( model: Union[LLMBase, OpenAIChatCompletionsModel], conversation_template: str, instantiation_parameters: Dict[str, str], - ): - """ - Create a ConversationBot with specific name, persona and a sentence that can be used as a conversation starter. - - :param role: The role of the bot in the conversation, either USER or ASSISTANT. - :type role: ConversationRole - :param model: The LLM model to use for generating responses. - :type model: OpenAIChatCompletionsModel - :param conversation_template: A Jinja2 template describing the conversation to generate the prompt for the LLM - :type conversation_template: str - :param instantiation_parameters: A dictionary of parameters used to instantiate the conversation template - :type instantiation_parameters: dict - """ - + ) -> None: self.role = role self.conversation_template_orig = conversation_template self.conversation_template: jinja2.Template = jinja2.Template( @@ -89,7 +131,7 @@ def __init__( self.conversation_starter = jinja2.Template( conversation_starter_content, undefined=jinja2.StrictUndefined ) - except jinja2.exceptions.TemplateSyntaxError as e: # noqa: F841 + except jinja2.exceptions.TemplateSyntaxError: # noqa: F841 self.conversation_starter = conversation_starter_content else: self.logger.info( @@ -174,7 +216,27 @@ def __repr__(self): class CallbackConversationBot(ConversationBot): - def __init__(self, callback, user_template, user_template_parameters, *args, **kwargs): + """Conversation bot that uses a user provided callback to generate responses. + + :param callback: The callback function to use to generate responses. + :type callback: Callable + :param user_template: The template to use for the request. + :type user_template: str + :param user_template_parameters: The template parameters to use for the request. + :type user_template_parameters: Dict + :param args: Optional arguments to pass to the parent class. + :type args: Any + :param kwargs: Optional keyword arguments to pass to the parent class. + :type kwargs: Any + """ + def __init__( + self, + callback: Callable, + user_template: str, + user_template_parameters: Dict, + *args, + **kwargs, + ) -> None: self.callback = callback self.user_template = user_template self.user_template_parameters = user_template_parameters @@ -219,7 +281,8 @@ async def generate_response( return response, {}, time_taken, result - def _to_chat_protocol(self, template, conversation_history, template_parameters): + # Bug 3354264: template is unused in the method - is this intentional? 
+ def _to_chat_protocol(self, template, conversation_history, template_parameters): # pylint: disable=unused-argument messages = [] for _, m in enumerate(conversation_history): diff --git a/src/promptflow-evals/promptflow/evals/synthetic/_conversation/_conversation.py b/src/promptflow-evals/promptflow/evals/synthetic/_conversation/_conversation.py index 17d8e685c81..2e62c45b2b6 100644 --- a/src/promptflow-evals/promptflow/evals/synthetic/_conversation/_conversation.py +++ b/src/promptflow-evals/promptflow/evals/synthetic/_conversation/_conversation.py @@ -4,15 +4,24 @@ import asyncio import logging -from typing import Any, Callable, List, Tuple +from typing import Callable, Dict, List, Tuple, Union from .._model_tools import RetryClient from . import ConversationBot, ConversationTurn -def is_closing_message(response: Any, recursion_depth: int = 0): +def is_closing_message(response: Union[Dict, str], recursion_depth: int = 0) -> bool: + """Determine if a response indicates an end to the conversation. + + :param response: The response to check. + :type response: Union[Dict, str] + :param recursion_depth: The current recursion depth. Defaults to 0. + :type recursion_depth: int + :return: True if the response indicates an end to the conversation, False otherwise. + :rtype: bool + """ if recursion_depth > 10: - raise Exception("Exceeded max call depth in is_closing_message") + raise Exception("Exceeded max call depth in is_closing_message") # pylint: disable=broad-exception-raised # recursively go through each inner dictionary in the JSON dict # and check if any value entry contains a closing message @@ -26,7 +35,14 @@ def is_closing_message(response: Any, recursion_depth: int = 0): return False -def is_closing_message_helper(response: str): +def is_closing_message_helper(response: str) -> bool: + """Determine if a response indicates an end to the conversation. + + :param response: The response to check. + :type response: str + :return: True if the response indicates an end to the conversation, False otherwise. + :rtype: bool + """ message = response.lower() if "?" in message.lower(): return False @@ -109,7 +125,7 @@ async def simulate_conversation( current_bot = bots[current_character_idx] # invoke Bot to generate response given the input request # pass only the last generated turn without passing the bot name. - response, request, time_taken, full_response = await current_bot.generate_response( + response, request, _, full_response = await current_bot.generate_response( session=session, conversation_history=conversation_history, max_history=history_limit, diff --git a/src/promptflow-evals/promptflow/evals/synthetic/_conversation/constants.py b/src/promptflow-evals/promptflow/evals/synthetic/_conversation/constants.py index 40407106c3e..f7b364f8316 100644 --- a/src/promptflow-evals/promptflow/evals/synthetic/_conversation/constants.py +++ b/src/promptflow-evals/promptflow/evals/synthetic/_conversation/constants.py @@ -24,5 +24,6 @@ class ConversationRole(Enum): + """Role in a chatbot conversation""" USER = "user" ASSISTANT = "assistant" diff --git a/src/promptflow-evals/promptflow/evals/synthetic/_model_tools/__init__.py b/src/promptflow-evals/promptflow/evals/synthetic/_model_tools/__init__.py index 8d7e8122e8c..039478e7bfb 100644 --- a/src/promptflow-evals/promptflow/evals/synthetic/_model_tools/__init__.py +++ b/src/promptflow-evals/promptflow/evals/synthetic/_model_tools/__init__.py @@ -2,6 +2,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. 
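`is_closing_message`, documented in the hunk above, walks an arbitrarily nested response dictionary, bailing out after ten levels of recursion, and delegates the string check to a helper (only the helper's question-mark guard is visible here). A minimal sketch of the same recursive traversal pattern, with an illustrative closing-phrase list rather than the library's own:

```python
from typing import Dict, Union

CLOSING_PHRASES = ("goodbye", "bye", "have a nice day")  # illustrative only


def looks_like_closing(response: Union[Dict, str], depth: int = 0) -> bool:
    """Recursively search nested dict values for a conversation-ending message."""
    if depth > 10:
        raise RuntimeError("Exceeded max call depth in looks_like_closing")
    if isinstance(response, dict):
        return any(looks_like_closing(value, depth + 1) for value in response.values())
    message = str(response).lower()
    if "?" in message:  # a question keeps the conversation going
        return False
    return any(phrase in message for phrase in CLOSING_PHRASES)


print(looks_like_closing({"messages": {"content": "Goodbye, have a nice day!"}}))        # True
print(looks_like_closing({"messages": {"content": "Can I help with anything else?"}}))   # False
```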
# --------------------------------------------------------- +"""Tooling for model evaluation""" from ._identity_manager import ManagedIdentityAPITokenManager, PlainTokenManager, TokenScope from ._proxy_completion_model import ProxyChatCompletionsModel diff --git a/src/promptflow-evals/promptflow/evals/synthetic/_model_tools/_async_http_client.py b/src/promptflow-evals/promptflow/evals/synthetic/_model_tools/_async_http_client.py index d0e93e64082..ae3c4ef445f 100644 --- a/src/promptflow-evals/promptflow/evals/synthetic/_model_tools/_async_http_client.py +++ b/src/promptflow-evals/promptflow/evals/synthetic/_model_tools/_async_http_client.py @@ -2,12 +2,25 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- +import logging + from aiohttp import TraceConfig from aiohttp_retry import RandomRetry, RetryClient class AsyncHTTPClientWithRetry: - def __init__(self, n_retry, retry_timeout, logger, retry_options=None): + """Async HTTP client with retry configuration and request logging. + + :param n_retry: Number of retries to attempt + :type n_retry: int + :param retry_timeout: Timeout between retries, in seconds + :type retry_timeout: int + :param logger: The logger object to use for request logging + :type logger: logging.Logger + :param retry_options: The retry options. Defaults to None. + :type retry_options: Optional[aiohttp_retry.retry_options.BaseRandomRetry] + """ + def __init__(self, n_retry: int, retry_timeout: int, logger: logging.Logger, retry_options=None): self.attempts = n_retry self.logger = logger @@ -25,18 +38,52 @@ def __init__(self, n_retry, retry_timeout, logger, retry_options=None): self.client = RetryClient(trace_configs=[trace_config], retry_options=retry_options) - async def on_request_start(self, session, trace_config_ctx, params): + async def on_request_start(self, session, trace_config_ctx, params): # pylint: disable=unused-argument + """Build a new trace context from the config and log the request. + + :param session: The aiohttp client session. This parameter is not used in this method; + however, it must be included to match the method signature of the parent class. + :type session: aiohttp.ClientSession + :param trace_config_ctx: The trace config context + :type trace_config_ctx: Any + :param params: The request parameters + :type params: Any + """ current_attempt = trace_config_ctx.trace_request_ctx["current_attempt"] self.logger.info("[ATTEMPT %s] Sending %s request to %s" % (current_attempt, params.method, params.url)) - async def delete_auth_header(self, session, trace_config_ctx, params): + async def delete_auth_header(self, session, trace_config_ctx, params): # pylint: disable=unused-argument + """Delete authorization header from request headers + + If set, the "Authorization" and "api-key" headers is removed from the request headers. + + :param session: The aiohttp client session. This parameter is not used in this method; + however, it must be included to match the method signature of the parent class. + :type session: aiohttp.ClientSession + :param trace_config_ctx: The trace config context. This parameter is not used in this method; + however, it must be included to match the method signature of the parent class. 
+ :type trace_config_ctx: Any + :param params: The request parameters + :type params: Any + """ request_headers = dict(params.response.request_info.headers) if "Authorization" in request_headers: del request_headers["Authorization"] if "api-key" in request_headers: del request_headers["api-key"] - async def on_request_end(self, session, trace_config_ctx, params): + async def on_request_end(self, session, trace_config_ctx, params): # pylint: disable=unused-argument + """ + Retrieve current request trace and log the response. + + :param session: The aiohttp client session. This parameter is not used in this method; + however, it must be included to match the method signature of the parent class. + :type session: aiohttp.ClientSession + :param trace_config_ctx: The trace config context + :type trace_config_ctx: Any + :param params: The request parameters + :type params: Any + """ current_attempt = trace_config_ctx.trace_request_ctx["current_attempt"] request_headers = dict(params.response.request_info.headers) if "Authorization" in request_headers: diff --git a/src/promptflow-evals/promptflow/evals/synthetic/_model_tools/_identity_manager.py b/src/promptflow-evals/promptflow/evals/synthetic/_model_tools/_identity_manager.py index 8ccda33fc27..cff70f37839 100644 --- a/src/promptflow-evals/promptflow/evals/synthetic/_model_tools/_identity_manager.py +++ b/src/promptflow-evals/promptflow/evals/synthetic/_model_tools/_identity_manager.py @@ -3,10 +3,12 @@ # --------------------------------------------------------- import asyncio +import logging import os import time from abc import ABC, abstractmethod from enum import Enum +from typing import Dict, Optional, Union from azure.identity import DefaultAzureCredential, ManagedIdentityCredential @@ -14,11 +16,25 @@ class TokenScope(Enum): + """Token scopes for Azure endpoints""" DEFAULT_AZURE_MANAGEMENT = "https://management.azure.com/.default" class APITokenManager(ABC): - def __init__(self, logger, auth_header="Bearer", credential=None): + """Base class for managing API tokens. Subclasses should implement the get_token method. + + :param logger: Logger object + :type logger: logging.Logger + :param auth_header: Authorization header prefix. Defaults to "Bearer" + :type auth_header: str + :param credential: Azure credential object + :type credential: Optional[Union[azure.identity.DefaultAzureCredential, azure.identity.ManagedIdentityCredential] + """ + def __init__( + self, logger: logging.Logger, + auth_header: str = "Bearer", + credential: Optional[Union[DefaultAzureCredential, ManagedIdentityCredential]] = None + ) -> None: self.logger = logger self.auth_header = auth_header self._lock = None @@ -30,12 +46,27 @@ def __init__(self, logger, auth_header="Bearer", credential=None): self.last_refresh_time = None @property - def lock(self): + def lock(self) -> asyncio.Lock: + """Return object for managing concurrent access to the token. + + If the lock object does not exist, it will be created first. + + :return: Lock object + :rtype: asyncio.Lock + """ if self._lock is None: self._lock = asyncio.Lock() return self._lock - def get_aad_credential(self): + def get_aad_credential(self) -> Union[DefaultAzureCredential, ManagedIdentityCredential]: + """Return the AAD credential object. + + If the environment variable DEFAULT_IDENTITY_CLIENT_ID is set, ManagedIdentityCredential will be used with + the specified client ID. Otherwise, DefaultAzureCredential will be used. 
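`get_aad_credential`, documented just below, picks the credential type from the environment: a `ManagedIdentityCredential` bound to `DEFAULT_IDENTITY_CLIENT_ID` when that variable is set, and `DefaultAzureCredential` otherwise. A compact sketch of the same selection:

```python
import os
from typing import Union

from azure.identity import DefaultAzureCredential, ManagedIdentityCredential


def pick_aad_credential() -> Union[DefaultAzureCredential, ManagedIdentityCredential]:
    """Prefer a user-assigned managed identity when its client ID is configured."""
    client_id = os.environ.get("DEFAULT_IDENTITY_CLIENT_ID")
    if client_id is not None:
        return ManagedIdentityCredential(client_id=client_id)
    return DefaultAzureCredential()


credential = pick_aad_credential()
# token = credential.get_token("https://management.azure.com/.default")
```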
+ + :return: The AAD credential object + :rtype: Union[DefaultAzureCredential, ManagedIdentityCredential] + """ identity_client_id = os.environ.get("DEFAULT_IDENTITY_CLIENT_ID", None) if identity_client_id is not None: self.logger.info(f"Using DEFAULT_IDENTITY_CLIENT_ID: {identity_client_id}") @@ -46,17 +77,36 @@ def get_aad_credential(self): return credential @abstractmethod - async def get_token(self): - pass + async def get_token(self) -> str: + """Async method to get the API token. Subclasses should implement this method. + + :return: API token + :rtype: str + """ + pass # pylint: disable=unnecessary-pass class ManagedIdentityAPITokenManager(APITokenManager): - def __init__(self, token_scope, logger, **kwargs): + """API Token Manager for Azure Managed Identity + + :param token_scope: Token scope for Azure endpoint + :type token_scope: ~promptflow.evals.synthetic._model_tools.TokenScope + :param logger: Logger object + :type logger: logging.Logger + :keyword kwargs: Additional keyword arguments + :paramtype kwargs: Dict + """ + def __init__(self, token_scope: TokenScope, logger: logging.Logger, **kwargs: Dict): super().__init__(logger, **kwargs) self.token_scope = token_scope - def get_token(self): + # Bug 3353724: This get_token is sync method, but it is defined as async method in the base class + def get_token(self) -> str: # pylint: disable=invalid-overridden-method + """Get the API token. If the token is not available or has expired, refresh the token. + :return: API token + :rtype: str + """ if ( self.token is None or self.last_refresh_time is None @@ -70,9 +120,23 @@ def get_token(self): class PlainTokenManager(APITokenManager): - def __init__(self, openapi_key, logger, **kwargs): + """Plain API Token Manager + + :param openapi_key: OpenAPI key + :type openapi_key: str + :param logger: Logger object + :type logger: logging.Logger + :keyword kwargs: Optional keyword arguments + :paramtype kwargs: Dict + """ + def __init__(self, openapi_key: str, logger: logging.Logger, **kwargs: Dict): super().__init__(logger, **kwargs) self.token = openapi_key - async def get_token(self): + async def get_token(self) -> str: + """Get the API token + + :return: API token + :rtype: str + """ return self.token diff --git a/src/promptflow-evals/promptflow/evals/synthetic/_model_tools/_proxy_completion_model.py b/src/promptflow-evals/promptflow/evals/synthetic/_model_tools/_proxy_completion_model.py index 510f5f2b8ca..1c7318e3975 100644 --- a/src/promptflow-evals/promptflow/evals/synthetic/_model_tools/_proxy_completion_model.py +++ b/src/promptflow-evals/promptflow/evals/synthetic/_model_tools/_proxy_completion_model.py @@ -7,10 +7,10 @@ import logging import time import uuid -from typing import List +from typing import Dict, List -from aiohttp.web import HTTPException -from aiohttp_retry import JitterRetry, RetryClient +from aiohttp.web import HTTPException # pylint: disable=networking-import-outside-azure-core-transport +from aiohttp_retry import JitterRetry, RetryClient # pylint: disable=networking-import-outside-azure-core-transport from promptflow.evals._user_agent import USER_AGENT @@ -18,6 +18,22 @@ class SimulationRequestDTO: + """Simulation Request Data Transfer Object + + :param url: The URL to send the request to. + :type url: str + :param headers: The headers to send with the request. + :type headers: Dict[str, str] + :param payload: The payload to send with the request. + :type payload: Dict[str, Any] + :param params: The parameters to send with the request. 
+ :type params: Dict[str, str] + :param template_key: The template key to use for the request. + :type template_key: str + :param template_parameters: The template parameters to use for the request. + :type template_parameters: Dict + """ + def __init__(self, url, headers, payload, params, templatekey, template_parameters): self.url = url self.headers = headers @@ -26,48 +42,82 @@ def __init__(self, url, headers, payload, params, templatekey, template_paramete self.templatekey = templatekey self.templateParameters = template_parameters - def to_dict(self): + def to_dict(self) -> Dict: + """Convert the DTO to a dictionary. + + :return: The DTO as a dictionary. + :rtype: Dict + """ if self.templateParameters is not None: self.templateParameters = {str(k): str(v) for k, v in self.templateParameters.items()} return self.__dict__ def to_json(self): + """Convert the DTO to a JSON string. + + :return: The DTO as a JSON string. + :rtype: str + """ return json.dumps(self.__dict__) class ProxyChatCompletionsModel(OpenAIChatCompletionsModel): - def __init__(self, name, template_key, template_parameters, *args, **kwargs): + """A chat completion model that uses a proxy to query the model with a body of data. + + :param name: The name of the model. + :type name: str + :param template_key: The template key to use for the request. + :type template_key: str + :param template_parameters: The template parameters to use for the request. + :type template_parameters: Dict + :keyword args: Additional arguments to pass to the parent class. + :keyword kwargs: Additional keyword arguments to pass to the parent class. + """ + + def __init__(self, name: str, template_key: str, template_parameters, *args, **kwargs) -> None: self.tkey = template_key self.tparam = template_parameters self.result_url = None super().__init__(name=name, *args, **kwargs) - def format_request_data(self, messages: List[dict], **request_params): # type: ignore[override] + def format_request_data(self, messages: List[Dict], **request_params) -> Dict: # type: ignore[override] + """Format the request data to query the model with. + + :param messages: List of messages to query the model with. + Expected format: [{"role": "user", "content": "Hello!"}, ...] + :type messages: List[Dict] + :keyword request_params: Additional parameters to pass to the model. + :paramtype request_params: Dict + :return: The formatted request data. + :rtype: Dict + """ request_data = {"messages": messages, **self.get_model_params()} request_data.update(request_params) return request_data async def get_conversation_completion( self, - messages: List[dict], + messages: List[Dict], session: RetryClient, - role: str = "assistant", + role: str = "assistant", # pylint: disable=unused-argument **request_params, ) -> dict: """ Query the model a single time with a message. :param messages: List of messages to query the model with. - Expected format: [{"role": "user", "content": "Hello!"}, ...] - :type messages: List[dict] + Expected format: [{"role": "user", "content": "Hello!"}, ...] + :type messages: List[Dict] :param session: aiohttp RetryClient object to query the model with. - :type session: RetryClient - :param role: Not used for this model, since it is a chat model. + :type session: ~promptflow.evals.synthetic._model_tools.RetryClient + :param role: The role of the user sending the message. This parameter is not used in this method; + however, it must be included to match the method signature of the parent class. Defaults to "assistant". 
:type role: str - :keyword **request_params: Additional parameters to pass to the model. + :keyword request_params: Additional parameters to pass to the model. + :paramtype request_params: Dict :return: A dictionary representing the completion of the conversation query. - :rtype: dict + :rtype: Dict """ request_data = self.format_request_data( messages=messages, diff --git a/src/promptflow-evals/promptflow/evals/synthetic/_model_tools/_rai_client.py b/src/promptflow-evals/promptflow/evals/synthetic/_model_tools/_rai_client.py index 35369bb29ea..2e101db04c2 100644 --- a/src/promptflow-evals/promptflow/evals/synthetic/_model_tools/_rai_client.py +++ b/src/promptflow-evals/promptflow/evals/synthetic/_model_tools/_rai_client.py @@ -3,14 +3,15 @@ # --------------------------------------------------------- import logging import os -from typing import Any +from typing import Any, Dict from urllib.parse import urljoin, urlparse -import requests +import requests # pylint: disable=networking-import-outside-azure-core-transport from promptflow.evals._user_agent import USER_AGENT from ._async_http_client import AsyncHTTPClientWithRetry +from ._identity_manager import APITokenManager api_url = None if "RAI_SVC_URL" in os.environ: @@ -20,7 +21,14 @@ class RAIClient: - def __init__(self, azure_ai_project: dict, token_manager: Any) -> None: + """Client for the Responsible AI Service + + :param azure_ai_project: The Azure AI project + :type azure_ai_project: Dict + :param token_manager: The token manager + :type token_manage: ~promptflow.evals.synthetic._model_tools._identity_manager.APITokenManager + """ + def __init__(self, azure_ai_project: Dict, token_manager: APITokenManager) -> None: self.azure_ai_project = azure_ai_project self.token_manager = token_manager @@ -60,26 +68,43 @@ def _get_service_discovery_url(self): timeout=5, ) if response.status_code != 200: - raise Exception("Failed to retrieve the discovery service URL") + raise Exception("Failed to retrieve the discovery service URL") # pylint: disable=broad-exception-raised base_url = urlparse(response.json()["properties"]["discoveryUrl"]) return f"{base_url.scheme}://{base_url.netloc}" - def _create_async_client(self): + def _create_async_client(self) -> AsyncHTTPClientWithRetry: + """Create an async http client with retry mechanism + + Number of retries is set to 6, and the timeout is set to 5 seconds. 
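`_create_async_client`, shown just below, wires the RAI client to the `AsyncHTTPClientWithRetry` wrapper documented earlier, using six retries and a five-second retry timeout. A rough usage sketch of that wrapper; the import path mirrors the module layout in this diff and should be treated as illustrative:

```python
import asyncio
import logging

from promptflow.evals.synthetic._model_tools._async_http_client import AsyncHTTPClientWithRetry


async def probe(url: str) -> int:
    http_client = AsyncHTTPClientWithRetry(n_retry=6, retry_timeout=5, logger=logging.getLogger(__name__))
    # `client` is an aiohttp_retry RetryClient, so it is used like an aiohttp session.
    async with http_client.client as session:
        async with session.get(url) as response:
            return response.status


# print(asyncio.run(probe("https://learn.microsoft.com")))
```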
+ + :return: The async http client + :rtype: ~promptflow.evals.synthetic._model_tools._async_http_client.AsyncHTTPClientWithRetry + """ return AsyncHTTPClientWithRetry(n_retry=6, retry_timeout=5, logger=logging.getLogger()) async def get_contentharm_parameters(self) -> Any: + """Get the content harm parameters, if they exist""" if self.contentharm_parameters is None: self.contentharm_parameters = await self.get(self.parameter_json_endpoint) return self.contentharm_parameters async def get_jailbreaks_dataset(self) -> Any: + "Get the jailbreaks dataset, if exists" if self.jailbreaks_dataset is None: self.jailbreaks_dataset = await self.get(self.jailbreaks_json_endpoint) return self.jailbreaks_dataset async def get(self, url: str) -> Any: + """Make a GET request to the given url + + :param url: The url + :type url: str + :raises ValueError: If the Azure safety evaluation service is not available in the current region + :return: The response + :rtype: Any + """ token = self.token_manager.get_token() headers = { "Authorization": f"Bearer {token}", diff --git a/src/promptflow-evals/promptflow/evals/synthetic/_model_tools/_template_handler.py b/src/promptflow-evals/promptflow/evals/synthetic/_model_tools/_template_handler.py index fa8002573ce..9179a3007cb 100644 --- a/src/promptflow-evals/promptflow/evals/synthetic/_model_tools/_template_handler.py +++ b/src/promptflow-evals/promptflow/evals/synthetic/_model_tools/_template_handler.py @@ -2,6 +2,10 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- +from typing import Any, Dict, Optional + +from ._rai_client import RAIClient + CONTENT_HARM_TEMPLATES_COLLECTION_KEY = set( [ "adv_qa", @@ -16,12 +20,21 @@ class ContentHarmTemplatesUtils: + """Content harm templates utility functions.""" @staticmethod - def get_template_category(key): + def get_template_category(key: str) -> str: + """Parse category from template key + + :param key: The template key + :type key: str + :return: The category + :rtype: str + """ return key.split("/")[0] + # Bug 3353405: Need to add docstring @staticmethod - def get_template_key(key): + def get_template_key(key: str) -> str: # pylint: disable=missing-function-docstring filepath = key.rsplit(".json")[0] parts = str(filepath).split("/") filename = ContentHarmTemplatesUtils.json_name_to_md_name(parts[-1]) @@ -31,14 +44,30 @@ def get_template_key(key): return "/".join(prefix) @staticmethod - def json_name_to_md_name(name): + def json_name_to_md_name(name) -> str: + """Convert JSON filename to Markdown filename + + :param name: The JSON filename + :type name: str + :return: The Markdown filename + :rtype: str + """ result = name.replace("_aml", "") return result + ".md" class AdversarialTemplate: - def __init__(self, template_name, text, context_key, template_parameters=None): + """Template for adversarial scenarios. + + :param template_name: The name of the template. + :type template_name: str + :param text: The template text. + :type text: str + :param context_key: The context key. + :param template_parameters: The template parameters. + """ + def __init__(self, template_name, text, context_key, template_parameters=None) -> None: self.text = text self.context_key = context_key self.template_name = template_name @@ -49,7 +78,15 @@ def __str__(self): class AdversarialTemplateHandler: - def __init__(self, azure_ai_project, rai_client): + """ + Adversarial template handler constructor. + + :param azure_ai_project: The Azure AI project. 
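`get_template_key` and `json_name_to_md_name`, documented in the template-handler hunk below, turn a service-side JSON blob key into the Markdown-style template key used locally: the `.json` suffix and `_aml` marker are dropped and the filename gains a `.md` extension. The lines that reassemble the path prefix are elided in this hunk, so the sketch below assumes they rejoin the directory parts unchanged, and the example keys are made up:

```python
def json_name_to_md_name(name: str) -> str:
    """Convert a JSON template filename into its Markdown counterpart."""
    return name.replace("_aml", "") + ".md"


def get_template_key(key: str) -> str:
    """Derive the local template key from a blob-style JSON key."""
    filepath = key.rsplit(".json")[0]
    parts = str(filepath).split("/")
    prefix = parts[:-1]  # assumed reassembly of the elided lines
    prefix.append(json_name_to_md_name(parts[-1]))
    return "/".join(prefix)


print(get_template_key("en/adv_qa_aml.json"))          # en/adv_qa.md (hypothetical key)
print(get_template_key("adv_summarization_aml.json"))  # adv_summarization.md
```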
+ :type azure_ai_project: Dict[str, Any] + :param rai_client: The RAI client. + :type rai_client: ~promptflow.evals.synthetic._model_tools.RAIClient + """ + def __init__(self, azure_ai_project: Dict[str, Any], rai_client: RAIClient) -> None: self.cached_templates_source = {} # self.template_env = JinjaEnvironment(loader=JinjaFileSystemLoader(searchpath=template_dir)) self.azure_ai_project = azure_ai_project @@ -88,6 +125,14 @@ async def _get_content_harm_template_collections(self, collection_key): ch_templates.append(template) return ch_templates - def get_template(self, template_name): + def get_template(self, template_name: str) -> Optional[AdversarialTemplate]: + """Generate content harm template. + + :param template_name: The name of the template. + :type template_name: str + :return: The generated content harm template. + :rtype: Optional[~promptflow.evals.synthetic._model_tools.AdversarialTemplate] + """ if template_name in CONTENT_HARM_TEMPLATES_COLLECTION_KEY: return AdversarialTemplate(template_name=template_name, text=None, context_key=[], template_parameters=None) + return None diff --git a/src/promptflow-evals/promptflow/evals/synthetic/adversarial_scenario.py b/src/promptflow-evals/promptflow/evals/synthetic/adversarial_scenario.py index c078ee52efd..a5ce1129bae 100644 --- a/src/promptflow-evals/promptflow/evals/synthetic/adversarial_scenario.py +++ b/src/promptflow-evals/promptflow/evals/synthetic/adversarial_scenario.py @@ -1,7 +1,12 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# --------------------------------------------------------- + from enum import Enum class AdversarialScenario(Enum): + """Adversarial scenario types""" ADVERSARIAL_QA = "adv_qa" ADVERSARIAL_CONVERSATION = "adv_conversation" ADVERSARIAL_SUMMARIZATION = "adv_summarization" diff --git a/src/promptflow-evals/promptflow/evals/synthetic/adversarial_simulator.py b/src/promptflow-evals/promptflow/evals/synthetic/adversarial_simulator.py index 9e2dcb0bce4..d09cfa0d0b2 100644 --- a/src/promptflow-evals/promptflow/evals/synthetic/adversarial_simulator.py +++ b/src/promptflow-evals/promptflow/evals/synthetic/adversarial_simulator.py @@ -29,7 +29,14 @@ logger = logging.getLogger(__name__) -def monitor_adversarial_scenario(func): +def monitor_adversarial_scenario(func) -> Callable: + """Monitor an adversarial scenario with logging + + :param func: The function to be monitored + :type func: Callable + :return: The decorated function + :rtype: Callable + """ @functools.wraps(func) def wrapper(*args, **kwargs): scenario = str(kwargs.get("scenario", None)) @@ -109,36 +116,36 @@ async def __call__( """ Executes the adversarial simulation against a specified target function asynchronously. - :param scenario: Enum value specifying the adversarial scenario used for generating inputs. + :keyword scenario: Enum value specifying the adversarial scenario used for generating inputs. example: - :py:const:`promptflow.evals.synthetic.adversarial_scenario.AdversarialScenario.ADVERSARIAL_QA` - :py:const:`promptflow.evals.synthetic.adversarial_scenario.AdversarialScenario.ADVERSARIAL_CONVERSATION` - :type scenario: promptflow.evals.synthetic.adversarial_scenario.AdversarialScenario - :param target: The target function to simulate adversarial inputs against. + :paramtype scenario: promptflow.evals.synthetic.adversarial_scenario.AdversarialScenario + :keyword target: The target function to simulate adversarial inputs against. 
This function should be asynchronous and accept a dictionary representing the adversarial input. - :type target: Callable - :param max_conversation_turns: The maximum number of conversation turns to simulate. + :paramtype target: Callable + :keyword max_conversation_turns: The maximum number of conversation turns to simulate. Defaults to 1. - :type max_conversation_turns: int - :param max_simulation_results: The maximum number of simulation results to return. + :paramtype max_conversation_turns: int + :keyword max_simulation_results: The maximum number of simulation results to return. Defaults to 3. - :type max_simulation_results: int - :param api_call_retry_limit: The maximum number of retries for each API call within the simulation. + :paramtype max_simulation_results: int + :keyword api_call_retry_limit: The maximum number of retries for each API call within the simulation. Defaults to 3. - :type api_call_retry_limit: int - :param api_call_retry_sleep_sec: The sleep duration (in seconds) between retries for API calls. + :paramtype api_call_retry_limit: int + :keyword api_call_retry_sleep_sec: The sleep duration (in seconds) between retries for API calls. Defaults to 1 second. - :type api_call_retry_sleep_sec: int - :param api_call_delay_sec: The delay (in seconds) before making an API call. + :paramtype api_call_retry_sleep_sec: int + :keyword api_call_delay_sec: The delay (in seconds) before making an API call. This can be used to avoid hitting rate limits. Defaults to 0 seconds. - :type api_call_delay_sec: int - :param concurrent_async_task: The number of asynchronous tasks to run concurrently during the simulation. + :paramtype api_call_delay_sec: int + :keyword concurrent_async_task: The number of asynchronous tasks to run concurrently during the simulation. Defaults to 3. - :type concurrent_async_task: int - :param jailbreak: If set to True, allows breaking out of the conversation flow defined by the scenario. + :paramtype concurrent_async_task: int + :keyword jailbreak: If set to True, allows breaking out of the conversation flow defined by the scenario. Defaults to False. - :type jailbreak: bool + :paramtype jailbreak: bool :return: A list of dictionaries, each representing a simulated conversation. Each dictionary contains: - 'template_parameters': A dictionary with parameters used in the conversation template, @@ -233,9 +240,11 @@ async def __call__( return JsonLineList(sim_results) - def _to_chat_protocol(self, *, conversation_history, template_parameters: Dict = {}): + def _to_chat_protocol(self, *, conversation_history, template_parameters: Dict = None): + if template_parameters is None: + template_parameters = {} messages = [] - for i, m in enumerate(conversation_history): + for _, m in enumerate(conversation_history): message = {"content": m.message, "role": m.role.value} if "context" in m.full_response: message["context"] = m.full_response["context"] @@ -352,7 +361,26 @@ def call_sync( api_call_retry_sleep_sec: int, api_call_delay_sec: int, concurrent_async_task: int, - ): + ) -> List[Dict[str, Any]]: + """Call the adversarial simulator synchronously. + + :keyword max_conversation_turns: The maximum number of conversation turns to simulate. + :paramtype max_conversation_turns: int + :keyword max_simulation_results: The maximum number of simulation results to return. + :paramtype max_simulation_results: int + :keyword target: The target function to simulate adversarial inputs against. 
+ :paramtype target: Callable + :keyword api_call_retry_limit: The maximum number of retries for each API call within the simulation. + :paramtype api_call_retry_limit: int + :keyword api_call_retry_sleep_sec: The sleep duration (in seconds) between retries for API calls. + :paramtype api_call_retry_sleep_sec: int + :keyword api_call_delay_sec: The delay (in seconds) before making an API call. + :paramtype api_call_delay_sec: int + :keyword concurrent_async_task: The number of asynchronous tasks to run concurrently during the simulation. + :paramtype concurrent_async_task: int + :return: A list of dictionaries, each representing a simulated conversation. + :rtype: List[Dict[str, Any]] + """ # Running the async method in a synchronous context loop = asyncio.get_event_loop() if loop.is_running(): @@ -370,16 +398,16 @@ def call_sync( ) ) return loop.run_until_complete(future) - else: - # If no event loop is running, use asyncio.run (Python 3.7+) - return asyncio.run( - self( - max_conversation_turns=max_conversation_turns, - max_simulation_results=max_simulation_results, - target=target, - api_call_retry_limit=api_call_retry_limit, - api_call_retry_sleep_sec=api_call_retry_sleep_sec, - api_call_delay_sec=api_call_delay_sec, - concurrent_async_task=concurrent_async_task, - ) + + # If no event loop is running, use asyncio.run (Python 3.7+) + return asyncio.run( + self( + max_conversation_turns=max_conversation_turns, + max_simulation_results=max_simulation_results, + target=target, + api_call_retry_limit=api_call_retry_limit, + api_call_retry_sleep_sec=api_call_retry_sleep_sec, + api_call_delay_sec=api_call_delay_sec, + concurrent_async_task=concurrent_async_task, ) + ) diff --git a/src/promptflow-evals/promptflow/evals/synthetic/jailbreak_adversarial_simulator.py b/src/promptflow-evals/promptflow/evals/synthetic/jailbreak_adversarial_simulator.py index 9f40fcc2aef..065ad927b0b 100644 --- a/src/promptflow-evals/promptflow/evals/synthetic/jailbreak_adversarial_simulator.py +++ b/src/promptflow-evals/promptflow/evals/synthetic/jailbreak_adversarial_simulator.py @@ -17,7 +17,15 @@ logger = logging.getLogger(__name__) -def monitor_adversarial_scenario(func): +def monitor_adversarial_scenario(func) -> Callable: + """Decorator to monitor adversarial scenario. + + :param func: The function to be decorated. + :type func: Callable + :return: The decorated function. + :rtype: Callable + """ + @functools.wraps(func) def wrapper(*args, **kwargs): scenario = str(kwargs.get("scenario", None)) @@ -99,33 +107,33 @@ async def __call__( Executes the adversarial simulation and jailbreak adversarial simulation against a specified target function asynchronously. - :param scenario: Enum value specifying the adversarial scenario used for generating inputs. + :keyword scenario: Enum value specifying the adversarial scenario used for generating inputs. example: - :py:const:`promptflow.evals.synthetic.adversarial_scenario.AdversarialScenario.ADVERSARIAL_QA` - :py:const:`promptflow.evals.synthetic.adversarial_scenario.AdversarialScenario.ADVERSARIAL_CONVERSATION` - :type scenario: promptflow.evals.synthetic.adversarial_scenario.AdversarialScenario - :param target: The target function to simulate adversarial inputs against. + :paramtype scenario: promptflow.evals.synthetic.adversarial_scenario.AdversarialScenario + :keyword target: The target function to simulate adversarial inputs against. This function should be asynchronous and accept a dictionary representing the adversarial input. 
- :type target: Callable - :param max_conversation_turns: The maximum number of conversation turns to simulate. + :paramtype target: Callable + :keyword max_conversation_turns: The maximum number of conversation turns to simulate. Defaults to 1. - :type max_conversation_turns: int - :param max_simulation_results: The maximum number of simulation results to return. + :paramtype max_conversation_turns: int + :keyword max_simulation_results: The maximum number of simulation results to return. Defaults to 3. - :type max_simulation_results: int - :param api_call_retry_limit: The maximum number of retries for each API call within the simulation. + :paramtype max_simulation_results: int + :keyword api_call_retry_limit: The maximum number of retries for each API call within the simulation. Defaults to 3. - :type api_call_retry_limit: int - :param api_call_retry_sleep_sec: The sleep duration (in seconds) between retries for API calls. + :paramtype api_call_retry_limit: int + :keyword api_call_retry_sleep_sec: The sleep duration (in seconds) between retries for API calls. Defaults to 1 second. - :type api_call_retry_sleep_sec: int - :param api_call_delay_sec: The delay (in seconds) before making an API call. + :paramtype api_call_retry_sleep_sec: int + :keyword api_call_delay_sec: The delay (in seconds) before making an API call. This can be used to avoid hitting rate limits. Defaults to 0 seconds. - :type api_call_delay_sec: int - :param concurrent_async_task: The number of asynchronous tasks to run concurrently during the simulation. + :paramtype api_call_delay_sec: int + :keyword concurrent_async_task: The number of asynchronous tasks to run concurrently during the simulation. Defaults to 3. - :type concurrent_async_task: int + :paramtype concurrent_async_task: int :return: A list of dictionaries, each representing a simulated conversation. 
Each dictionary contains: - 'template_parameters': A dictionary with parameters used in the conversation template, diff --git a/src/promptflow-evals/pyproject.toml b/src/promptflow-evals/pyproject.toml index 1e1da86ffee..ec3623ff17f 100644 --- a/src/promptflow-evals/pyproject.toml +++ b/src/promptflow-evals/pyproject.toml @@ -47,6 +47,7 @@ jsonpath_ng = ">=1.5.0" urllib3 = ">1.26.17" numpy = ">=1.22" promptflow-azure = { version = "<2.0.0,>=1.13.0", optional = true} # Needed for remote tracking +isort = "^5.13.2" pyjwt = ">=2.8.0" azure-identity = ">=1.17.1" azure-core = ">=1.30.2" @@ -60,6 +61,8 @@ azure = [ [tool.poetry.group.dev.dependencies] pre-commit = "*" import-linter = "*" +pylint = "3.0.3" +azure-pylint-guidelines-checker = {version = "0.3.1", source = "azure-sdk"} [tool.poetry.group.test.dependencies] pytest = "*" @@ -69,6 +72,12 @@ pytest-mock = "*" pytest-xdist = "*" # test: pytest and coverage + +[[tool.poetry.source]] +name = "azure-sdk" +url = "https://pkgs.dev.azure.com/azure-sdk/public/_packaging/azure-sdk-for-python/pypi/simple/" +priority = "supplemental" + [tool.pytest.ini_options] markers = [ "unittest", @@ -111,3 +120,23 @@ name = "Contract forbidden modules" type = "forbidden" source_modules = ["promptflow.evals"] forbidden_modules = [] + +[tool.isort] +# we use check for make fmt* +profile = "black" +# no need to fmt ignored +skip_gitignore = true +# needs to be the same as in black +line_length = 120 +use_parentheses = true +include_trailing_comma = true +honor_noqa = true +ensure_newline_before_comments = true +skip_glob = [ + "docs/**", + "pipelines/**", + "pytest/**", + "samples/**", +] +known_third_party = ["azure", "mock", "numpy", "pandas", "pydash", "pytest", "pytest_mock", "requests", "setuptools", "six", "sklearn", "tqdm", "urllib3", "utilities", "utils", "yaml", "jsonschema", "strictyaml", "jwt", "pathspec", "isodate", "docker"] +known_first_party = ["promptflow"] diff --git a/src/promptflow-evals/samples/LoadSaveEvals/apology-prompty/apology.prompty b/src/promptflow-evals/samples/LoadSaveEvals/apology-prompty/apology.prompty index a6c43f7a807..bf20f567ff1 100644 --- a/src/promptflow-evals/samples/LoadSaveEvals/apology-prompty/apology.prompty +++ b/src/promptflow-evals/samples/LoadSaveEvals/apology-prompty/apology.prompty @@ -1,5 +1,5 @@ --- -name: basic evaluate +name: basic evaluate description: basic evaluator for QA scenario model: api: chat @@ -15,7 +15,7 @@ model: response_format: type: json_object -inputs: +inputs: answer: type: string outputs: @@ -23,7 +23,7 @@ outputs: type: string --- system: -You are an AI assistant. +You are an AI assistant. You task is to answer if answer contains an apology. If Answer contains apology, return 1, otherwise return 0. The output should be valid JSON. 
@@ -32,6 +32,6 @@ answer: "Sorry, I can only truth questions related to outdoor/camping gear and e output: {"score": "1"} -user: +user: answer: {{answer}} -output: \ No newline at end of file +output: diff --git a/src/promptflow-evals/samples/LoadSaveEvals/apology-prompty/sample.json b/src/promptflow-evals/samples/LoadSaveEvals/apology-prompty/sample.json index 9fe47f3087b..4ddda615d8f 100644 --- a/src/promptflow-evals/samples/LoadSaveEvals/apology-prompty/sample.json +++ b/src/promptflow-evals/samples/LoadSaveEvals/apology-prompty/sample.json @@ -1,4 +1,4 @@ { "question": "what's the capital of China?", "answer": "Shanghai" -} \ No newline at end of file +} diff --git a/src/promptflow-evals/samples/LoadSaveEvals/apology_dag/apology.py b/src/promptflow-evals/samples/LoadSaveEvals/apology_dag/apology.py index 9e9de34f2dd..d1b86bd4828 100644 --- a/src/promptflow-evals/samples/LoadSaveEvals/apology_dag/apology.py +++ b/src/promptflow-evals/samples/LoadSaveEvals/apology_dag/apology.py @@ -1,7 +1,8 @@ import re + from promptflow.core import tool @tool def apology(answer): - return len(re.findall('(sorry)|(apology)|(apologies)', answer.lower())) + return len(re.findall("(sorry)|(apology)|(apologies)", answer.lower())) diff --git a/src/promptflow-evals/samples/LoadSaveEvals/apology_dag/flow.dag.yaml b/src/promptflow-evals/samples/LoadSaveEvals/apology_dag/flow.dag.yaml index f4422866c03..c82f6eed99d 100644 --- a/src/promptflow-evals/samples/LoadSaveEvals/apology_dag/flow.dag.yaml +++ b/src/promptflow-evals/samples/LoadSaveEvals/apology_dag/flow.dag.yaml @@ -14,4 +14,3 @@ nodes: path: apology.py inputs: answer: ${inputs.answer} - diff --git a/src/promptflow-evals/samples/LoadSaveEvals/evaluation_dataset_context.jsonl b/src/promptflow-evals/samples/LoadSaveEvals/evaluation_dataset_context.jsonl index 76345b983c5..c5035dc18af 100644 --- a/src/promptflow-evals/samples/LoadSaveEvals/evaluation_dataset_context.jsonl +++ b/src/promptflow-evals/samples/LoadSaveEvals/evaluation_dataset_context.jsonl @@ -1,4 +1,4 @@ {"question": "Which tent is the most waterproof?", "answer": "The TrailMaster X4 tent has the highest rainfly waterproof rating of the available tents, at 2000m", "context": "#TrailMaster X4 Tent, price $250,## BrandOutdoorLiving## CategoryTents## Features- Polyester material for durability- Spacious interior to accommodate multiple people- Easy setup with included instructions- Water-resistant construction to withstand light rain- Mesh panels for ventilation and insect protection- Rainfly included for added weather protection- Multiple doors for convenient entry and exit- Interior pockets for organizing small ite- Reflective guy lines for improved visibility at night- Freestanding design for easy setup and relocation- Carry bag included for convenient storage and transportatio## Technical Specs**Best Use**: Camping **Capacity**: 4-person **Season Rating**: 3-season **Setup**: Freestanding **Material**: Polyester **Waterproof**: Yes **Rainfly**: Included **Rainfly Waterproof Rating**: 2000mm", "ground_truth": "#TrailMaster X4 Tent"} {"question": "Which camping table is the lightest?", "answer": "The BaseCamp Folding Table is the lightest of all of the other camping tables mentioned", "context": "#BaseCamp Folding Table, price $60,## BrandCampBuddy## CategoryCamping Tables## FeaturesLightweight and durable aluminum constructionFoldable design with a compact size for easy storage and transport## Technical Specifications- **Weight**: 15 lbs- **Maximum Weight Capacity**: Up to a certain weight 
limit (specific weight limit not provided)", "ground_truth": "I cannot say based on the information provided."} {"question": "How much does TrailWalker Hiking Shoes cost? ", "answer": "$110", "context": "#TrailWalker Hiking Shoes, price $110## BrandTrekReady## CategoryHiking Footwear", "ground_truth": "$110"} -{"question": "Is France in Europe?", "answer": "Sorry, I can only truth questions related to outdoor/camping gear and equipment", "context": "#TrailWalker Hiking Shoes, price $110## BrandTrekReady## CategoryHiking Footwear", "ground_truth": "Yes"} \ No newline at end of file +{"question": "Is France in Europe?", "answer": "Sorry, I can only truth questions related to outdoor/camping gear and equipment", "context": "#TrailWalker Hiking Shoes, price $110## BrandTrekReady## CategoryHiking Footwear", "ground_truth": "Yes"} diff --git a/src/promptflow-evals/samples/evaluate-target/askwiki/askwiki.py b/src/promptflow-evals/samples/evaluate-target/askwiki/askwiki.py index fa8800a82de..ae88daa3872 100644 --- a/src/promptflow-evals/samples/evaluate-target/askwiki/askwiki.py +++ b/src/promptflow-evals/samples/evaluate-target/askwiki/askwiki.py @@ -1,17 +1,16 @@ import os import pathlib import random +import re import time +from concurrent.futures import ThreadPoolExecutor from functools import partial +import bs4 import jinja2 import requests -import bs4 -import re -from concurrent.futures import ThreadPoolExecutor from openai import AzureOpenAI - session = requests.Session() templateLoader = jinja2.FileSystemLoader(pathlib.Path(__file__).parent.resolve()) @@ -24,9 +23,9 @@ def decode_str(string): def remove_nested_parentheses(string): - pattern = r'\([^()]+\)' + pattern = r"\([^()]+\)" while re.search(pattern, string): - string = re.sub(pattern, '', string) + string = re.sub(pattern, "", string) return string @@ -38,10 +37,10 @@ def get_page_sentence(page, count: int = 10): # find all sentence sentences = [] for p in paragraphs: - sentences += p.split('. ') - sentences = [s.strip() + '.' for s in sentences if s.strip()] + sentences += p.split(". ") + sentences = [s.strip() + "." 
for s in sentences if s.strip()] # get first `count` number of sentences - return ' '.join(sentences[:count]) + return " ".join(sentences[:count]) def fetch_text_content_from_url(url: str, count: int = 10): @@ -49,14 +48,14 @@ def fetch_text_content_from_url(url: str, count: int = 10): try: headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) " - "Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.35" + "Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.35" } delay = random.uniform(0, 0.5) time.sleep(delay) response = session.get(url, headers=headers) if response.status_code == 200: # Parse the HTML content using BeautifulSoup - soup = bs4.BeautifulSoup(response.text, 'html.parser') + soup = bs4.BeautifulSoup(response.text, "html.parser") page_content = [p_ul.get_text().strip() for p_ul in soup.find_all("p") + soup.find_all("ul")] page = "" for content in page_content: @@ -67,8 +66,10 @@ def fetch_text_content_from_url(url: str, count: int = 10): text = get_page_sentence(page, count=count) return (url, text) else: - msg = f"Get url failed with status code {response.status_code}.\nURL: {url}\nResponse: " \ - f"{response.text[:100]}" + msg = ( + f"Get url failed with status code {response.status_code}.\nURL: {url}\nResponse: " + f"{response.text[:100]}" + ) print(msg) return (url, "No available content") @@ -94,18 +95,20 @@ def get_wiki_url(entity: str, count=2): try: headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) " - "Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.35"} + "Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.35" + } response = requests.get(url, headers=headers) if response.status_code == 200: # Parse the HTML content using BeautifulSoup - soup = bs4.BeautifulSoup(response.text, 'html.parser') + soup = bs4.BeautifulSoup(response.text, "html.parser") mw_divs = soup.find_all("div", {"class": "mw-search-result-heading"}) if mw_divs: # mismatch result_titles = [decode_str(div.get_text().strip()) for div in mw_divs] result_titles = [remove_nested_parentheses(result_title) for result_title in result_titles] # print(f"Could not find {entity}. 
Similar entity: {result_titles[:count]}.") - url_list.extend([f"https://en.wikipedia.org/w/index.php?search={result_title}" for result_title in - result_titles]) + url_list.extend( + [f"https://en.wikipedia.org/w/index.php?search={result_title}" for result_title in result_titles] + ) else: page_content = [p_ul.get_text().strip() for p_ul in soup.find_all("p") + soup.find_all("ul")] if any("may refer to:" in p for p in page_content): @@ -113,8 +116,10 @@ def get_wiki_url(entity: str, count=2): else: url_list.append(url) else: - msg = f"Get url failed with status code {response.status_code}.\nURL: {url}\nResponse: " \ - f"{response.text[:100]}" + msg = ( + f"Get url failed with status code {response.status_code}.\nURL: {url}\nResponse: " + f"{response.text[:100]}" + ) print(msg) return url_list[:count] except Exception as e: @@ -129,10 +134,12 @@ def format(doc: dict): try: context = [] for url, content in search_result: - context.append({ - "Content": content, - # "Source": url - }) + context.append( + { + "Content": content, + # "Source": url + } + ) context_str = "\n\n".join([format(c) for c in context]) return context_str except Exception as e: @@ -143,20 +150,15 @@ def format(doc: dict): def augmented_qa(question, context): system_message = system_message_template.render(contexts=context) - messages = [ - {"role": "system", "content": system_message}, - {"role": "user", "content": question} - ] + messages = [{"role": "system", "content": system_message}, {"role": "user", "content": question}] with AzureOpenAI( azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"], api_key=os.environ["AZURE_OPENAI_API_KEY"], - api_version=os.environ["AZURE_OPENAI_API_VERSION"] + api_version=os.environ["AZURE_OPENAI_API_VERSION"], ) as client: response = client.chat.completions.create( - model=os.environ.get("AZURE_OPENAI_DEPLOYMENT"), - messages=messages, temperature=0.7, - max_tokens=800 + model=os.environ.get("AZURE_OPENAI_DEPLOYMENT"), messages=messages, temperature=0.7, max_tokens=800 ) return response.choices[0].message.content @@ -168,10 +170,7 @@ def ask_wiki(question): context = process_search_result(search_result) answer = augmented_qa(question, context) - return { - "answer": answer, - "context": str(context) - } + return {"answer": answer, "context": str(context)} if __name__ == "__main__": diff --git a/src/promptflow-evals/samples/evaluate-target/evaluators/blocklist/blocklist.py b/src/promptflow-evals/samples/evaluate-target/evaluators/blocklist/blocklist.py index 13e14e2f5eb..0a14cc7e4bc 100644 --- a/src/promptflow-evals/samples/evaluate-target/evaluators/blocklist/blocklist.py +++ b/src/promptflow-evals/samples/evaluate-target/evaluators/blocklist/blocklist.py @@ -1,4 +1,3 @@ - class BlocklistEvaluator: def __init__(self, blocklist): self._blocklist = blocklist diff --git a/src/promptflow-evals/samples/evaluate_chat.py b/src/promptflow-evals/samples/evaluate_chat.py index 9009ceb068b..2e60aaded60 100644 --- a/src/promptflow-evals/samples/evaluate_chat.py +++ b/src/promptflow-evals/samples/evaluate_chat.py @@ -1,9 +1,9 @@ import os from pprint import pprint -from promptflow.evals.evaluators import ChatEvaluator -from promptflow.evals.evaluate import evaluate from promptflow.core import AzureOpenAIModelConfiguration +from promptflow.evals.evaluate import evaluate +from promptflow.evals.evaluators import ChatEvaluator if __name__ == "__main__": # Initialize Chat Evaluator diff --git a/src/promptflow-evals/tests/evals/e2etests/custom_evaluators/answer_length_with_aggregation.py 
b/src/promptflow-evals/tests/evals/e2etests/custom_evaluators/answer_length_with_aggregation.py index 7f536bbcb7c..c49d8de9681 100644 --- a/src/promptflow-evals/tests/evals/e2etests/custom_evaluators/answer_length_with_aggregation.py +++ b/src/promptflow-evals/tests/evals/e2etests/custom_evaluators/answer_length_with_aggregation.py @@ -4,7 +4,6 @@ class AnswerLength: - def __init__(self, *, return_json: bool = False, aggregate_return_json: bool = False): self.return_json = return_json self.aggregate_return_json = aggregate_return_json diff --git a/src/promptflow-evals/tests/evals/e2etests/target_fn.py b/src/promptflow-evals/tests/evals/e2etests/target_fn.py index eb64a7e5d45..2f4eb1ceb7a 100644 --- a/src/promptflow-evals/tests/evals/e2etests/target_fn.py +++ b/src/promptflow-evals/tests/evals/e2etests/target_fn.py @@ -15,5 +15,5 @@ def target_fn2(question: str) -> str: def target_fn3(question: str) -> str: response = target_fn(question) - response['question'] = f'The question is as follows: {question}' + response["question"] = f"The question is as follows: {question}" return response diff --git a/src/promptflow-evals/tests/evals/e2etests/test_metrics_upload.py b/src/promptflow-evals/tests/evals/e2etests/test_metrics_upload.py index c027ac98a33..e3a8e279cb5 100644 --- a/src/promptflow-evals/tests/evals/e2etests/test_metrics_upload.py +++ b/src/promptflow-evals/tests/evals/e2etests/test_metrics_upload.py @@ -11,6 +11,7 @@ from promptflow.evals.evaluate._evaluate import evaluate from promptflow.evals.evaluators._f1_score._f1_score import F1ScoreEvaluator from promptflow.tracing import _start_trace + try: from promptflow.recording.record_mode import is_live except ModuleNotFoundError: @@ -162,7 +163,7 @@ def test_e2e_run_target_fn(self, caplog, project_scope, questions_answers_file, # Switch off tracing as it is running in the second thread, wile # thread pool executor is not compatible with VCR.py. if not is_live(): - monkeypatch.setattr(_start_trace, '_is_devkit_installed', lambda: False) + monkeypatch.setattr(_start_trace, "_is_devkit_installed", lambda: False) # All loggers, having promptflow. prefix will have "promptflow" logger # as a parent. This logger does not propagate the logs and cannot be # captured by caplog. Here we will skip this logger to capture logs. @@ -197,7 +198,11 @@ def test_e2e_run(self, caplog, project_scope, questions_answers_file, monkeypatc # Switch off tracing as it is running in the second thread, wile # thread pool executor is not compatible with VCR.py. 
if not is_live(): - monkeypatch.setattr(_start_trace, '_is_devkit_installed', lambda: False) + monkeypatch.setattr(_start_trace, "_is_devkit_installed", lambda: False) f1_score_eval = F1ScoreEvaluator() - evaluate(data=questions_answers_file, evaluators={"f1": f1_score_eval}, azure_ai_project=project_scope,) + evaluate( + data=questions_answers_file, + evaluators={"f1": f1_score_eval}, + azure_ai_project=project_scope, + ) self._assert_no_errors_for_module(caplog.records, (ev_utils.__name__, EvalRun.__module__)) diff --git a/src/promptflow-evals/tests/evals/unittests/data/validate_defect_rate_test_data.jsonl b/src/promptflow-evals/tests/evals/unittests/data/validate_defect_rate_test_data.jsonl index 0c435df956c..168da36b05c 100644 --- a/src/promptflow-evals/tests/evals/unittests/data/validate_defect_rate_test_data.jsonl +++ b/src/promptflow-evals/tests/evals/unittests/data/validate_defect_rate_test_data.jsonl @@ -7,4 +7,4 @@ {"outputs.content_safety.hate_unfairness":null,"outputs.content_safety.hate_unfairness_score":null,"outputs.content_safety.hate_unfairness_reason":null,"outputs.content_safety.violence":null,"outputs.content_safety.violence_score":null,"outputs.content_safety.violence_reason":null,"outputs.content_safety.sexual":null,"outputs.content_safety.sexual_score":null,"outputs.content_safety.sexual_reason":null,"outputs.content_safety.self_harm":null,"outputs.content_safety.self_harm_score":null,"outputs.content_safety.self_harm_reason":null} {"outputs.content_safety.hate_unfairness":null,"outputs.content_safety.hate_unfairness_score":null,"outputs.content_safety.hate_unfairness_reason":null,"outputs.content_safety.violence":null,"outputs.content_safety.violence_score":null,"outputs.content_safety.violence_reason":null,"outputs.content_safety.sexual":null,"outputs.content_safety.sexual_score":null,"outputs.content_safety.sexual_reason":null,"outputs.content_safety.self_harm":null,"outputs.content_safety.self_harm_score":null,"outputs.content_safety.self_harm_reason":null} {"outputs.content_safety.hate_unfairness":null,"outputs.content_safety.hate_unfairness_score":null,"outputs.content_safety.hate_unfairness_reason":null,"outputs.content_safety.violence":null,"outputs.content_safety.violence_score":null,"outputs.content_safety.violence_reason":null,"outputs.content_safety.sexual":null,"outputs.content_safety.sexual_score":null,"outputs.content_safety.sexual_reason":null,"outputs.content_safety.self_harm":null,"outputs.content_safety.self_harm_score":null,"outputs.content_safety.self_harm_reason":null} -{"outputs.content_safety.hate_unfairness":null,"outputs.content_safety.hate_unfairness_score":null,"outputs.content_safety.hate_unfairness_reason":null,"outputs.content_safety.violence":null,"outputs.content_safety.violence_score":null,"outputs.content_safety.violence_reason":null,"outputs.content_safety.sexual":null,"outputs.content_safety.sexual_score":null,"outputs.content_safety.sexual_reason":null,"outputs.content_safety.self_harm":null,"outputs.content_safety.self_harm_score":null,"outputs.content_safety.self_harm_reason":null} \ No newline at end of file 
+{"outputs.content_safety.hate_unfairness":null,"outputs.content_safety.hate_unfairness_score":null,"outputs.content_safety.hate_unfairness_reason":null,"outputs.content_safety.violence":null,"outputs.content_safety.violence_score":null,"outputs.content_safety.violence_reason":null,"outputs.content_safety.sexual":null,"outputs.content_safety.sexual_score":null,"outputs.content_safety.sexual_reason":null,"outputs.content_safety.self_harm":null,"outputs.content_safety.self_harm_score":null,"outputs.content_safety.self_harm_reason":null} diff --git a/src/promptflow-evals/tests/evals/unittests/data/yeti_questions.jsonl b/src/promptflow-evals/tests/evals/unittests/data/yeti_questions.jsonl index 9f18714977d..5610a1f1b36 100644 --- a/src/promptflow-evals/tests/evals/unittests/data/yeti_questions.jsonl +++ b/src/promptflow-evals/tests/evals/unittests/data/yeti_questions.jsonl @@ -3,4 +3,4 @@ {"question": "What yeti eats?", "answer": "Meat."} {"question": "Does yeti lives in Seattle area?", "answer": "Nobody knows."} {"question": "Who are aliens?", "answer": "Aliens is life which does not originate from Earth."} -{"question": "Are aliens procaryotes?", "answer": "Only the ones from the deep space."} \ No newline at end of file +{"question": "Are aliens procaryotes?", "answer": "Only the ones from the deep space."} diff --git a/src/promptflow-evals/tests/evals/unittests/test_content_safety_defect_rate.py b/src/promptflow-evals/tests/evals/unittests/test_content_safety_defect_rate.py index 52311862c1f..6579be8edde 100644 --- a/src/promptflow-evals/tests/evals/unittests/test_content_safety_defect_rate.py +++ b/src/promptflow-evals/tests/evals/unittests/test_content_safety_defect_rate.py @@ -1,8 +1,9 @@ import os import pathlib -import pytest import pandas as pd +import pytest + from promptflow.evals.evaluate._evaluate import _aggregate_metrics from promptflow.evals.evaluators import ContentSafetyEvaluator @@ -18,7 +19,7 @@ class TestContentSafetyDefectRate: def test_content_safety_defect_rate(self): file_path = _get_file("validate_defect_rate_test_data.jsonl") df = pd.read_json(file_path, lines=True, orient="records") - df = df.fillna('n/a') + df = df.fillna("n/a") metrics = _aggregate_metrics(df, {"content_safety": ContentSafetyEvaluator}) assert metrics["content_safety.hate_unfairness_defect_rate"] == 0.33 diff --git a/src/promptflow-evals/tests/evals/unittests/test_eval_run.py b/src/promptflow-evals/tests/evals/unittests/test_eval_run.py index 22964bbf12e..1ba8e2983de 100644 --- a/src/promptflow-evals/tests/evals/unittests/test_eval_run.py +++ b/src/promptflow-evals/tests/evals/unittests/test_eval_run.py @@ -250,11 +250,7 @@ def test_get_urls(self, mock_session_cls, token_mock): ), "Wrong Metrics URL" @pytest.mark.parametrize( - 'log_function,expected_str', - [ - ('log_artifact', 'register artifact'), - ('log_metric', 'save metrics') - ] + "log_function,expected_str", [("log_artifact", "register artifact"), ("log_metric", "save metrics")] ) def test_log_artifacts_logs_error(self, token_mock, tmp_path, caplog, log_function, expected_str): """Test that the error is logged.""" @@ -295,11 +291,12 @@ def test_log_artifacts_logs_error(self, token_mock, tmp_path, caplog, log_functi assert expected_str in caplog.records[0].message @pytest.mark.parametrize( - 'dir_exists,dir_empty,expected_error', [ + "dir_exists,dir_empty,expected_error", + [ (True, True, "The path to the artifact is empty."), (False, True, "The path to the artifact is either not a directory or does not exist."), - (True, False, "The run 
results file was not found, skipping artifacts upload.") - ] + (True, False, "The run results file was not found, skipping artifacts upload."), + ], ) def test_wrong_artifact_path( self, @@ -409,10 +406,10 @@ def test_local_lifecycle(self, token_mock): run = EvalRun( run_name=None, tracking_uri=None, - subscription_id='mock', - group_name='mock', - workspace_name='mock', - ml_client=MagicMock() + subscription_id="mock", + group_name="mock", + workspace_name="mock", + ml_client=MagicMock(), ) assert run.status == RunStatus.NOT_STARTED, f'Get {run.status}, expected {RunStatus.NOT_STARTED}' run._start_run() diff --git a/src/promptflow-evals/tests/evals/unittests/test_evaluate.py b/src/promptflow-evals/tests/evals/unittests/test_evaluate.py index 9fccde1b01b..d7f02431581 100644 --- a/src/promptflow-evals/tests/evals/unittests/test_evaluate.py +++ b/src/promptflow-evals/tests/evals/unittests/test_evaluate.py @@ -12,9 +12,8 @@ from promptflow.evals._constants import DEFAULT_EVALUATION_RESULTS_FILE_NAME from promptflow.evals.evaluate import evaluate from promptflow.evals.evaluate._evaluate import _apply_target_to_data, _rename_columns_conditionally -from promptflow.evals.evaluate._utils import _apply_column_mapping +from promptflow.evals.evaluate._utils import _apply_column_mapping, _trace_destination_from_project_scope from promptflow.evals.evaluators import F1ScoreEvaluator, GroundednessEvaluator -from promptflow.evals.evaluate._utils import _trace_destination_from_project_scope def _get_file(name): @@ -407,7 +406,8 @@ def test_get_trace_destination(self, mock_validate_trace_destination, mock_proje pf_client = PFClient( config={ "trace.destination": _trace_destination_from_project_scope(mock_project_scope) - if mock_project_scope else None + if mock_project_scope + else None } ) diff --git a/src/promptflow-evals/tests/evals/unittests/test_evaluate_telemetry.py b/src/promptflow-evals/tests/evals/unittests/test_evaluate_telemetry.py index 70d0a6ef6cc..7bfed72ba9b 100644 --- a/src/promptflow-evals/tests/evals/unittests/test_evaluate_telemetry.py +++ b/src/promptflow-evals/tests/evals/unittests/test_evaluate_telemetry.py @@ -1,16 +1,16 @@ import json import os import pathlib -from typing import Optional, Callable, Dict -from unittest.mock import patch, MagicMock +from typing import Callable, Dict, Optional +from unittest.mock import MagicMock, patch import numpy as np import pandas as pd import pytest +from promptflow.client import load_flow from promptflow.evals.evaluate._telemetry import log_evaluate_activity from promptflow.evals.evaluators import F1ScoreEvaluator, HateUnfairnessEvaluator -from promptflow.client import load_flow def _add_nans(df, n, column_name): @@ -58,14 +58,14 @@ def dummy_evaluate_function( nan_count = kwargs.get("number_of_nans", 1) for evaluation_name, evaluator in evaluators.items(): - df[f'outputs.{evaluation_name}.score'] = np.random.randint(0, 100, df.shape[0]) - _add_nans(df, nan_count, f'outputs.{evaluation_name}.score') + df[f"outputs.{evaluation_name}.score"] = np.random.randint(0, 100, df.shape[0]) + _add_nans(df, nan_count, f"outputs.{evaluation_name}.score") # Add a new column with random strings - df[f'outputs.{evaluation_name}.reason'] = np.random.choice(['a', 'b', 'c', 'd', 'e'], df.shape[0]) + df[f"outputs.{evaluation_name}.reason"] = np.random.choice(["a", "b", "c", "d", "e"], df.shape[0]) return { - "rows": df.to_dict(orient="records"), + "rows": df.to_dict(orient="records"), } @@ -73,29 +73,32 @@ class TestEvaluateTelemetry: def 
test_evaluators_telemetry(self, mock_app_insight_logger): f1_score = F1ScoreEvaluator() apology_dag = load_flow(os.path.join(pathlib.Path(__file__).parent.resolve(), "test_evaluators", "apology_dag")) - apology_prompty = load_flow(os.path.join(pathlib.Path(__file__).parent.resolve(), - "test_evaluators", "apology_prompty", "apology.prompty")) + apology_prompty = load_flow( + os.path.join( + pathlib.Path(__file__).parent.resolve(), "test_evaluators", "apology_prompty", "apology.prompty" + ) + ) data = _get_file("evaluate_test_data.jsonl") evaluators = { "f1_score": f1_score, "apology_dag": apology_dag, "apology_prompty": apology_prompty, - "answer_length": answer_length + "answer_length": answer_length, } - dummy_evaluate_function( - evaluators=evaluators, - data=data, - number_of_nans=1 - ) + dummy_evaluate_function(evaluators=evaluators, data=data, number_of_nans=1) - evaluate_start_call = [call for call in mock_app_insight_logger.info.call_args_list if - "pf.evals.evaluate.start" in call.args[0]] + evaluate_start_call = [ + call for call in mock_app_insight_logger.info.call_args_list if "pf.evals.evaluate.start" in call.args[0] + ] evaluate_start_call_cd = evaluate_start_call[0].kwargs["extra"]["custom_dimensions"] - evaluate_usage_info_call = [call for call in mock_app_insight_logger.info.call_args_list if - "pf.evals.evaluate_usage_info.start" in call.args[0]] + evaluate_usage_info_call = [ + call + for call in mock_app_insight_logger.info.call_args_list + if "pf.evals.evaluate_usage_info.start" in call.args[0] + ] evaluate_usage_info_call_cd = evaluate_usage_info_call[0].kwargs["extra"]["custom_dimensions"] assert mock_app_insight_logger.info.call_count == 4 @@ -130,8 +133,13 @@ def test_evaluators_telemetry(self, mock_app_insight_logger): assert entry["failed_rows"] == 1 - def test_evaluator_start_telemetry(self, mock_app_insight_logger, mock_project_scope, - mock_trace_destination_to_cloud, mock_validate_trace_destination): + def test_evaluator_start_telemetry( + self, + mock_app_insight_logger, + mock_project_scope, + mock_trace_destination_to_cloud, + mock_validate_trace_destination, + ): hate_unfairness = HateUnfairnessEvaluator(project_scope=None) data = _get_file("evaluate_test_data.jsonl") @@ -145,12 +153,12 @@ def test_evaluator_start_telemetry(self, mock_app_insight_logger, mock_project_s data=data, number_of_nans=2, azure_ai_project=mock_project_scope, - evaluator_config={"hate_unfairness": {"model_config": "test_config"}} - + evaluator_config={"hate_unfairness": {"model_config": "test_config"}}, ) - evaluate_start_call = [call for call in mock_app_insight_logger.info.call_args_list if - "pf.evals.evaluate.start" in call.args[0]] + evaluate_start_call = [ + call for call in mock_app_insight_logger.info.call_args_list if "pf.evals.evaluate.start" in call.args[0] + ] evaluate_start_call_cd = evaluate_start_call[0].kwargs["extra"]["custom_dimensions"] # asserts for evaluate start activity diff --git a/src/promptflow-evals/tests/evals/unittests/test_evaluators/apology_dag/apology.py b/src/promptflow-evals/tests/evals/unittests/test_evaluators/apology_dag/apology.py index 9e9de34f2dd..d1b86bd4828 100644 --- a/src/promptflow-evals/tests/evals/unittests/test_evaluators/apology_dag/apology.py +++ b/src/promptflow-evals/tests/evals/unittests/test_evaluators/apology_dag/apology.py @@ -1,7 +1,8 @@ import re + from promptflow.core import tool @tool def apology(answer): - return len(re.findall('(sorry)|(apology)|(apologies)', answer.lower())) + return 
len(re.findall("(sorry)|(apology)|(apologies)", answer.lower())) diff --git a/src/promptflow-evals/tests/evals/unittests/test_evaluators/apology_dag/flow.dag.yaml b/src/promptflow-evals/tests/evals/unittests/test_evaluators/apology_dag/flow.dag.yaml index 44f68b520be..c6a7bcfb703 100644 --- a/src/promptflow-evals/tests/evals/unittests/test_evaluators/apology_dag/flow.dag.yaml +++ b/src/promptflow-evals/tests/evals/unittests/test_evaluators/apology_dag/flow.dag.yaml @@ -15,4 +15,3 @@ nodes: path: apology.py inputs: answer: ${inputs.answer} - diff --git a/src/promptflow-evals/tests/evals/unittests/test_evaluators/apology_prompty/apology.prompty b/src/promptflow-evals/tests/evals/unittests/test_evaluators/apology_prompty/apology.prompty index a6c43f7a807..bf20f567ff1 100644 --- a/src/promptflow-evals/tests/evals/unittests/test_evaluators/apology_prompty/apology.prompty +++ b/src/promptflow-evals/tests/evals/unittests/test_evaluators/apology_prompty/apology.prompty @@ -1,5 +1,5 @@ --- -name: basic evaluate +name: basic evaluate description: basic evaluator for QA scenario model: api: chat @@ -15,7 +15,7 @@ model: response_format: type: json_object -inputs: +inputs: answer: type: string outputs: @@ -23,7 +23,7 @@ outputs: type: string --- system: -You are an AI assistant. +You are an AI assistant. You task is to answer if answer contains an apology. If Answer contains apology, return 1, otherwise return 0. The output should be valid JSON. @@ -32,6 +32,6 @@ answer: "Sorry, I can only truth questions related to outdoor/camping gear and e output: {"score": "1"} -user: +user: answer: {{answer}} -output: \ No newline at end of file +output: diff --git a/src/promptflow-evals/tests/evals/unittests/test_evaluators/apology_prompty/sample.json b/src/promptflow-evals/tests/evals/unittests/test_evaluators/apology_prompty/sample.json index 9fe47f3087b..4ddda615d8f 100644 --- a/src/promptflow-evals/tests/evals/unittests/test_evaluators/apology_prompty/sample.json +++ b/src/promptflow-evals/tests/evals/unittests/test_evaluators/apology_prompty/sample.json @@ -1,4 +1,4 @@ { "question": "what's the capital of China?", "answer": "Shanghai" -} \ No newline at end of file +}
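The keyword arguments documented on the adversarial simulators above can be exercised roughly as follows. This is a minimal sketch, assuming the simulator is exported as AdversarialSimulator from promptflow.evals.synthetic, that it is constructed from the same azure_ai_project dictionary used by the RAI client, and that the placeholder project values and the echo-style target callback stand in for a real application under test:

import asyncio

from promptflow.evals.synthetic import AdversarialSimulator  # assumed export path
from promptflow.evals.synthetic.adversarial_scenario import AdversarialScenario


async def target(adversarial_input: dict) -> dict:
    # A real target would call the application under test; this stub only returns a canned reply.
    # The exact shape of `adversarial_input` is defined by the simulator (chat-protocol style
    # messages are assumed here).
    return {"messages": [{"content": "placeholder response", "role": "assistant"}]}


azure_ai_project = {
    # Placeholder values; a real run needs an actual Azure AI project and credentials.
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

simulator = AdversarialSimulator(azure_ai_project=azure_ai_project)  # constructor shape assumed
outputs = asyncio.run(
    simulator(
        scenario=AdversarialScenario.ADVERSARIAL_QA,
        target=target,
        max_conversation_turns=1,
        max_simulation_results=3,
        api_call_retry_limit=3,
        api_call_retry_sleep_sec=1,
        api_call_delay_sec=0,
        concurrent_async_task=3,
    )
)
print(outputs)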