diff --git a/langtest/transform/accuracy.py b/langtest/transform/accuracy.py
index 06e65797c..f8288a784 100644
--- a/langtest/transform/accuracy.py
+++ b/langtest/transform/accuracy.py
@@ -15,7 +15,7 @@
 )
 from langtest.utils.custom_types.helpers import default_user_prompt
 from langtest.errors import Errors
-from langtest.utils.custom_types.sample import DegradationSample
+from langtest.utils.custom_types.sample import DegradationSample, QASample
 from langtest.utils.util_metrics import (
     calculate_f1_score,
     calculate_f1_score_multi_label,
@@ -1157,7 +1157,7 @@ class DegradationAnalysis(BaseAccuracy):
 
     alias_name = ["degradation_analysis"]
 
-    supported_tasks = ["ner", "text-classification"]
+    supported_tasks = ["ner", "text-classification", "question-answering"]
 
     result_data = defaultdict(dict)
 
@@ -1183,11 +1183,27 @@ async def run(
         test_cases: Dict[str, Dict[str, List[Sample]]] = kwargs.get("test_cases", [])
         X_test = kwargs.get("X_test", [])
 
-        if isinstance(X_test, pd.Series) or isinstance(X_test, list):
+        if len(X_test) and isinstance(X_test[0], QASample):
             X_test = pd.DataFrame(
                 {
-                    "index": [x.original for x in X_test],
-                    "expected_results": [x.expected_results for x in X_test],
+                    "original_content": [
+                        sample.original_context if sample.original_context else ""
+                        for sample in X_test
+                    ],
+                    "original_question": [sample.original_question for sample in X_test],
+                    "expected_results": [sample.expected_results for sample in X_test],
+                }
+            )
+            X_test["index"] = (
+                X_test["original_content"] + "\n" + X_test["original_question"]
+            )
+            X_test.set_index("index", inplace=True)
+
+        elif isinstance(X_test, pd.Series) or isinstance(X_test, list):
+            X_test = pd.DataFrame(
+                {
+                    "index": [sample.original for sample in X_test],
+                    "expected_results": [sample.expected_results for sample in X_test],
                 }
             )
 
@@ -1208,19 +1224,25 @@ async def run(
             if category not in ["robustness", "bias"]:
                 continue
             for test_type, samples in data.items():
-                ground_truth = X_test[X_test.index.isin([i.original for i in samples])][
-                    "expected_results"
-                ].to_list()
+                if len(samples) and isinstance(samples[0], QASample):
+                    accuracy_score1, accuracy_score2 = DegradationAnalysis.qa_evaluation(
+                        samples, X_test
+                    )
 
-                expected_results = [x.expected_results for x in samples]
-                actual_results = [x.actual_results for x in samples]
+                else:
+                    ground_truth = X_test[
+                        X_test.index.isin([i.original for i in samples])
+                    ]["expected_results"].to_list()
 
-                accuracy_score1 = calculate_f1_score(
-                    *DegradationAnalysis.preprocess(ground_truth, expected_results)
-                )
-                accuracy_score2 = calculate_f1_score(
-                    *DegradationAnalysis.preprocess(ground_truth, actual_results)
-                )
+                    expected_results = [x.expected_results for x in samples]
+                    actual_results = [x.actual_results for x in samples]
+
+                    accuracy_score1 = calculate_f1_score(
+                        *DegradationAnalysis.preprocess(ground_truth, expected_results)
+                    )
+                    accuracy_score2 = calculate_f1_score(
+                        *DegradationAnalysis.preprocess(ground_truth, actual_results)
+                    )
 
                 degradation = accuracy_score2 - accuracy_score1
 
@@ -1286,6 +1308,77 @@ def preprocess(y_true: Union[list, pd.Series], y_pred: Union[list, pd.Series]):
 
         return y_true, y_pred
 
+    @staticmethod
+    def qa_evaluation(samples: List[QASample], X_test: pd.DataFrame):
+        """
+        Evaluates the model performance on question-answering tasks.
+
+        Args:
+
+            samples (List[QASample]): The list of QASample instances.
+            X_test (pd.DataFrame): The test data.
+
+        Returns:
+
+            Tuple[float, float]: The accuracy scores for the original and perturbed samples.
+
+        """
+
+        results = {
+            "original": [],
+            "perturbed": [],
+            "total": len(samples),
+        }
+        for sample in samples:
+            if sample.original_context is None:
+                context = ""
+            else:
+                context = sample.original_context
+            index = context + "\n" + sample.original_question
+
+            g_values = X_test[X_test.index == index]["expected_results"].values
+            ground_truth = g_values[0] if len(g_values) else None
+
+            # if ground_truth is None, skip the sample and continue to the next sample
+            if ground_truth is None:
+                results["total"] -= 1
+                continue
+
+            expected_results = sample.expected_results
+            actual_results = sample.actual_results
+
+            original = sample.original_context + "\n" + sample.original_question
+            perturbed = sample.perturbed_context + "\n" + sample.perturbed_question
+
+            from ..utils.custom_types.helpers import is_pass_llm_eval
+            from ..langtest import EVAL_MODEL
+
+            o_result = is_pass_llm_eval(
+                eval_model=EVAL_MODEL,
+                dataset_name=sample.dataset_name,
+                original_question=original,
+                answer="\n".join(map(str, ground_truth)),
+                perturbed_question=perturbed,
+                prediction=expected_results,
+            )
+
+            p_result = is_pass_llm_eval(
+                eval_model=EVAL_MODEL,
+                dataset_name=sample.dataset_name,
+                original_question=original,
+                answer="\n".join(map(str, ground_truth)),
+                perturbed_question=perturbed,
+                prediction=actual_results,
+            )
+
+            results["original"].append(int(o_result) if o_result else 0)
+            results["perturbed"].append(int(p_result) if p_result else 0)
+
+        accuracy_score1 = sum(results["original"]) / max(results["total"], 1)
+        accuracy_score2 = sum(results["perturbed"]) / max(results["total"], 1)
+
+        return accuracy_score1, accuracy_score2
+
     @staticmethod
     def show_results():
         import pandas as pd
@@ -1304,14 +1397,7 @@ def show_results():
         y_pos = range(len(y_labels))
 
         for i, label in enumerate(y_labels):
-            # Before robustness bar
-            ax.broken_barh(
-                [(0, df["after"][i])],
-                (i - 0.2, 0.4),
-                color="#1f77b4",
-                label="After" if i == 0 else "",
-            )
-            # After robustness bar with adjusted width send this bar to back
+            # Before robustness bar (back layer)
             ax.broken_barh(
                 [(0, df["before"][i])],
                 (i - 0.4, 0.8),
@@ -1319,28 +1405,37 @@ def show_results():
                 zorder=0,
                 label="Before" if i == 0 else "",
             )
+            # After robustness bar (front layer)
+            ax.broken_barh(
+                [(0, df["after"][i])],
+                (i - 0.2, 0.4),
+                color="#1f77b4",
+                label="After" if i == 0 else "",
+                zorder=1,
+            )
 
-            # Adjust label positions if too close
-            if abs(df["before"][i] - df["after"][i]) < 0.05:
-                offset = 0.03
-            else:
-                offset = 0.01
+            # Adjust label position dynamically
+            offset = 0.03 if abs(df["before"][i] - df["after"][i]) < 0.05 else 0.01
 
+            # Text for "after" bar
             ax.text(
                 df["after"][i] + 0.01,
-                i,
+                i - 0.1,
                 f"{df['after'][i]:.2f}",
                 va="center",
                 ha="left",
                 color="#1f77b4",
+                zorder=2,
             )
+            # Text for "before" bar
             ax.text(
                 df["before"][i] + offset,
-                i,
+                i + 0.1,
                 f"{df['before'][i]:.2f}",
                 va="center",
                 ha="left",
                 color="black",
+                zorder=2,
            )
 
         ax.set_xlim(0, 1)