From 652d688cefe77fb3df6ab94987a43327fab01691 Mon Sep 17 00:00:00 2001 From: Kalyan Chakravarthy Date: Mon, 16 Dec 2024 22:00:07 +0530 Subject: [PATCH 1/7] feat: enhance DegradationAnalysis to support question-answering tasks and add evaluation method --- langtest/transform/accuracy.py | 144 ++++++++++++++++++++++++++------- 1 file changed, 115 insertions(+), 29 deletions(-) diff --git a/langtest/transform/accuracy.py b/langtest/transform/accuracy.py index 06e65797c..5f03be8b4 100644 --- a/langtest/transform/accuracy.py +++ b/langtest/transform/accuracy.py @@ -15,7 +15,7 @@ ) from langtest.utils.custom_types.helpers import default_user_prompt from langtest.errors import Errors -from langtest.utils.custom_types.sample import DegradationSample +from langtest.utils.custom_types.sample import DegradationSample, QASample from langtest.utils.util_metrics import ( calculate_f1_score, calculate_f1_score_multi_label, @@ -1157,7 +1157,7 @@ class DegradationAnalysis(BaseAccuracy): alias_name = ["degradation_analysis"] - supported_tasks = ["ner", "text-classification"] + supported_tasks = ["ner", "text-classification", "question-answering"] result_data = defaultdict(dict) @@ -1183,7 +1183,22 @@ async def run( test_cases: Dict[str, Dict[str, List[Sample]]] = kwargs.get("test_cases", []) X_test = kwargs.get("X_test", []) - if isinstance(X_test, pd.Series) or isinstance(X_test, list): + if len(X_test) and isinstance(X_test[0], QASample): + X_test = pd.DataFrame( + { + "original_content": [ + x.original_context if x.original_context else "" for x in X_test + ], + "original_question": [x.original_question for x in X_test], + "expected_results": [x.expected_results for x in X_test], + } + ) + X_test["index"] = ( + X_test["original_content"] + "\n" + X_test["original_question"] + ) + X_test.set_index("index", inplace=True) + + elif isinstance(X_test, pd.Series) or isinstance(X_test, list): X_test = pd.DataFrame( { "index": [x.original for x in X_test], @@ -1208,19 +1223,25 @@ async def run( if category not in ["robustness", "bias"]: continue for test_type, samples in data.items(): - ground_truth = X_test[X_test.index.isin([i.original for i in samples])][ - "expected_results" - ].to_list() + if len(samples) and isinstance(samples[0], QASample): + accuracy_score1, accuracy_score2 = DegradationAnalysis.qa_evaluation( + samples, X_test + ) + + else: + ground_truth = X_test[ + X_test.index.isin([i.original for i in samples]) + ]["expected_results"].to_list() - expected_results = [x.expected_results for x in samples] - actual_results = [x.actual_results for x in samples] + expected_results = [x.expected_results for x in samples] + actual_results = [x.actual_results for x in samples] - accuracy_score1 = calculate_f1_score( - *DegradationAnalysis.preprocess(ground_truth, expected_results) - ) - accuracy_score2 = calculate_f1_score( - *DegradationAnalysis.preprocess(ground_truth, actual_results) - ) + accuracy_score1 = calculate_f1_score( + *DegradationAnalysis.preprocess(ground_truth, expected_results) + ) + accuracy_score2 = calculate_f1_score( + *DegradationAnalysis.preprocess(ground_truth, actual_results) + ) degradation = accuracy_score2 - accuracy_score1 @@ -1286,6 +1307,69 @@ def preprocess(y_true: Union[list, pd.Series], y_pred: Union[list, pd.Series]): return y_true, y_pred + def qa_evaluation(self, samples: List[QASample], X_test: pd.DataFrame): + """ + Evaluates the model performance on question-answering tasks. + + Args: + + samples (List[QASample]): The list of QASample instances. 
+ X_test (pd.DataFrame): The test data. + + Returns: + + Tuple[float, float]: The accuracy scores for the original and perturbed samples. + + """ + + results = { + "original": [], + "perturbed": [], + "total": len(samples), + } + for sample in samples: + if sample.original_context is None: + context = "" + else: + context = sample.original_context + index = context + "\n" + sample.original_question + ground_truth = X_test[X_test.index == index]["expected_results"].values[0] + + expected_results = sample.expected_results + actual_results = sample.actual_results + + original = sample.original_context + "\n" + sample.original_question + perturbed = sample.perturbed_context + "\n" + sample.perturbed_question + + from ..utils.custom_types.helpers import is_pass_llm_eval + from ..langtest import EVAL_MODEL + + o_result = is_pass_llm_eval( + eval_model=EVAL_MODEL, + dataset_name=sample.dataset_name, + original_question=original, + answer="\n".join(map(str, ground_truth)), + perturbed_question=perturbed, + prediction=expected_results, + ) + + p_result = is_pass_llm_eval( + eval_model=EVAL_MODEL, + dataset_name=sample.dataset_name, + original_question=original, + answer="\n".join(map(str, ground_truth)), + perturbed_question=perturbed, + prediction=actual_results, + ) + + results["original"].append(int(o_result) if o_result else 0) + results["perturbed"].append(int(p_result) if p_result else 0) + + accuracy_score1 = sum(results["original"]) / max(results["total"], 1) + accuracy_score2 = sum(results["perturbed"]) / max(results["total"], 1) + + return accuracy_score1, accuracy_score2 + @staticmethod def show_results(): import pandas as pd @@ -1304,14 +1388,7 @@ def show_results(): y_pos = range(len(y_labels)) for i, label in enumerate(y_labels): - # Before robustness bar - ax.broken_barh( - [(0, df["after"][i])], - (i - 0.2, 0.4), - color="#1f77b4", - label="After" if i == 0 else "", - ) - # After robustness bar with adjusted width send this bar to back + # Before robustness bar (back layer) ax.broken_barh( [(0, df["before"][i])], (i - 0.4, 0.8), @@ -1319,28 +1396,37 @@ def show_results(): zorder=0, label="Before" if i == 0 else "", ) + # After robustness bar (front layer) + ax.broken_barh( + [(0, df["after"][i])], + (i - 0.2, 0.4), + color="#1f77b4", + label="After" if i == 0 else "", + zorder=1, + ) - # Adjust label positions if too close - if abs(df["before"][i] - df["after"][i]) < 0.05: - offset = 0.03 - else: - offset = 0.01 + # Adjust label position dynamically + offset = 0.03 if abs(df["before"][i] - df["after"][i]) < 0.05 else 0.01 + # Text for "after" bar ax.text( df["after"][i] + 0.01, - i, + i - 0.1, f"{df['after'][i]:.2f}", va="center", ha="left", color="#1f77b4", + zorder=2, ) + # Text for "before" bar ax.text( df["before"][i] + offset, - i, + i + 0.1, f"{df['before'][i]:.2f}", va="center", ha="left", color="black", + zorder=2, ) ax.set_xlim(0, 1) From 5921cf13c1ee7edc6baa1ac38796541f794b0f86 Mon Sep 17 00:00:00 2001 From: Kalyan Chakravarthy Date: Mon, 16 Dec 2024 22:13:05 +0530 Subject: [PATCH 2/7] feat: skip samples with None ground truth in DegradationAnalysis accuracy calculation --- langtest/transform/accuracy.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/langtest/transform/accuracy.py b/langtest/transform/accuracy.py index 5f03be8b4..eef617138 100644 --- a/langtest/transform/accuracy.py +++ b/langtest/transform/accuracy.py @@ -1333,8 +1333,13 @@ def qa_evaluation(self, samples: List[QASample], X_test: pd.DataFrame): else: context = sample.original_context index = 
context + "\n" + sample.original_question + ground_truth = X_test[X_test.index == index]["expected_results"].values[0] + # if ground_truth is having None then skip the sample and continue to the next sample + if ground_truth is None: + continue + expected_results = sample.expected_results actual_results = sample.actual_results From cc46917ff32085a5d493ee702df84bb420098d45 Mon Sep 17 00:00:00 2001 From: Kalyan Chakravarthy Date: Mon, 16 Dec 2024 22:14:04 +0530 Subject: [PATCH 3/7] fix: correctly decrement total count when skipping samples with None ground truth in DegradationAnalysis --- langtest/transform/accuracy.py | 1 + 1 file changed, 1 insertion(+) diff --git a/langtest/transform/accuracy.py b/langtest/transform/accuracy.py index eef617138..f6bc6f2a9 100644 --- a/langtest/transform/accuracy.py +++ b/langtest/transform/accuracy.py @@ -1338,6 +1338,7 @@ def qa_evaluation(self, samples: List[QASample], X_test: pd.DataFrame): # if ground_truth is having None then skip the sample and continue to the next sample if ground_truth is None: + results["total"] -= 1 continue expected_results = sample.expected_results From c30a310384d08c5bb387f87f52d9f2c9a4c6dc71 Mon Sep 17 00:00:00 2001 From: Kalyan Chakravarthy Date: Mon, 16 Dec 2024 22:18:13 +0530 Subject: [PATCH 4/7] fix: handle cases where ground truth is missing in DegradationAnalysis accuracy calculation --- langtest/transform/accuracy.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/langtest/transform/accuracy.py b/langtest/transform/accuracy.py index f6bc6f2a9..c4d7dfede 100644 --- a/langtest/transform/accuracy.py +++ b/langtest/transform/accuracy.py @@ -1334,7 +1334,8 @@ def qa_evaluation(self, samples: List[QASample], X_test: pd.DataFrame): context = sample.original_context index = context + "\n" + sample.original_question - ground_truth = X_test[X_test.index == index]["expected_results"].values[0] + g_values = X_test[X_test.index == index]["expected_results"].values + ground_truth = g_values[0] if len(g_values) else None # if ground_truth is having None then skip the sample and continue to the next sample if ground_truth is None: From d1c18ae3176959edd46f8e357463aa1417731cf2 Mon Sep 17 00:00:00 2001 From: Kalyan Chakravarthy Date: Mon, 16 Dec 2024 23:05:12 +0530 Subject: [PATCH 5/7] feat: make qa_evaluation a static method in DegradationAnalysis for question-answering task evaluation --- langtest/transform/accuracy.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/langtest/transform/accuracy.py b/langtest/transform/accuracy.py index c4d7dfede..9e87a7187 100644 --- a/langtest/transform/accuracy.py +++ b/langtest/transform/accuracy.py @@ -1307,7 +1307,8 @@ def preprocess(y_true: Union[list, pd.Series], y_pred: Union[list, pd.Series]): return y_true, y_pred - def qa_evaluation(self, samples: List[QASample], X_test: pd.DataFrame): + @staticmethod + def qa_evaluation(samples: List[QASample], X_test: pd.DataFrame): """ Evaluates the model performance on question-answering tasks. 
From c672e5bdf1753cef38f0895536a341fa6257f578 Mon Sep 17 00:00:00 2001 From: Kalyan Chakravarthy Date: Tue, 17 Dec 2024 12:28:47 +0530 Subject: [PATCH 6/7] refactor: update variable names for clarity in DegradationAnalysis accuracy calculations --- langtest/transform/accuracy.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/langtest/transform/accuracy.py b/langtest/transform/accuracy.py index 9e87a7187..21fd69ecb 100644 --- a/langtest/transform/accuracy.py +++ b/langtest/transform/accuracy.py @@ -1187,10 +1187,11 @@ async def run( X_test = pd.DataFrame( { "original_content": [ - x.original_context if x.original_context else "" for x in X_test + sample.original_context if sample.original_context else "" + for sample in X_test ], - "original_question": [x.original_question for x in X_test], - "expected_results": [x.expected_results for x in X_test], + "original_question": [sample.original_question for sample in X_test], + "expected_results": [sample.expected_results for sample in X_test], } ) X_test["index"] = ( @@ -1201,8 +1202,8 @@ async def run( elif isinstance(X_test, pd.Series) or isinstance(X_test, list): X_test = pd.DataFrame( { - "index": [x.original for x in X_test], - "expected_results": [x.expected_results for x in X_test], + "index": [sample.original for sample in X_test], + "expected_results": [sample.expected_results for sample in X_test], } ) From b4951088be5344b4d7f999298f63a7ef5b75c4c4 Mon Sep 17 00:00:00 2001 From: Kalyan Chakravarthy Thadaka Date: Tue, 17 Dec 2024 13:36:12 +0530 Subject: [PATCH 7/7] Update langtest/transform/accuracy.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- langtest/transform/accuracy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/langtest/transform/accuracy.py b/langtest/transform/accuracy.py index 21fd69ecb..f8288a784 100644 --- a/langtest/transform/accuracy.py +++ b/langtest/transform/accuracy.py @@ -1339,7 +1339,7 @@ def qa_evaluation(samples: List[QASample], X_test: pd.DataFrame): g_values = X_test[X_test.index == index]["expected_results"].values ground_truth = g_values[0] if len(g_values) else None - # if ground_truth is having None then skip the sample and continue to the next sample + # if ground_truth is None, skip the sample and continue to the next sample if ground_truth is None: results["total"] -= 1 continue
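
For orientation, here is a minimal sketch (not part of the patch series) of the evaluation flow these patches add to DegradationAnalysis for question-answering. The sample data, the "context\nquestion" keys, and the pre-computed pass/fail booleans are invented stand-ins for the LLM-based is_pass_llm_eval(eval_model=EVAL_MODEL, ...) calls made in the real code; only the ground-truth lookup keyed by original context plus question, the skip-and-decrement handling of missing ground truth (patches 2-4), and the before/after accuracy arithmetic mirror the diff.

import pandas as pd

# Hypothetical ground-truth table, indexed by "context\nquestion" as the run() changes build it.
X_test = pd.DataFrame(
    {"expected_results": [["Paris"], ["42"]]},
    index=[
        "France is in Europe.\nWhat is the capital of France?",
        "The answer is 42.\nWhat is the answer?",
    ],
)

# Hypothetical (index_key, passed_on_original, passed_on_perturbed) triples; in the
# patch these booleans come from is_pass_llm_eval over expected/actual results.
samples = [
    ("France is in Europe.\nWhat is the capital of France?", True, True),
    ("The answer is 42.\nWhat is the answer?", True, False),
    ("missing context\nunseen question", True, True),  # no ground truth -> skipped
]

results = {"original": [], "perturbed": [], "total": len(samples)}
for key, passed_original, passed_perturbed in samples:
    g_values = X_test[X_test.index == key]["expected_results"].values
    ground_truth = g_values[0] if len(g_values) else None
    if ground_truth is None:  # skip the sample and shrink the denominator (patches 2-4)
        results["total"] -= 1
        continue
    results["original"].append(int(passed_original))
    results["perturbed"].append(int(passed_perturbed))

accuracy_score1 = sum(results["original"]) / max(results["total"], 1)   # before perturbation
accuracy_score2 = sum(results["perturbed"]) / max(results["total"], 1)  # after perturbation
degradation = accuracy_score2 - accuracy_score1
print(accuracy_score1, accuracy_score2, degradation)  # 1.0 0.5 -0.5

Decrementing results["total"] when a sample has no ground truth keeps rows that cannot be judged from deflating both pass rates, so the reported degradation reflects only samples the evaluator could actually score.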