feat: enhance DegradationAnalysis to support question-answering task #1153

langtest/transform/accuracy.py: 144 changes (115 additions, 29 deletions)
@@ -15,7 +15,7 @@
)
from langtest.utils.custom_types.helpers import default_user_prompt
from langtest.errors import Errors
from langtest.utils.custom_types.sample import DegradationSample
from langtest.utils.custom_types.sample import DegradationSample, QASample
from langtest.utils.util_metrics import (
calculate_f1_score,
calculate_f1_score_multi_label,
@@ -1157,7 +1157,7 @@ class DegradationAnalysis(BaseAccuracy):

alias_name = ["degradation_analysis"]

supported_tasks = ["ner", "text-classification"]
supported_tasks = ["ner", "text-classification", "question-answering"]

result_data = defaultdict(dict)

@@ -1183,7 +1183,22 @@ async def run(
test_cases: Dict[str, Dict[str, List[Sample]]] = kwargs.get("test_cases", [])
X_test = kwargs.get("X_test", [])

if isinstance(X_test, pd.Series) or isinstance(X_test, list):
if len(X_test) and isinstance(X_test[0], QASample):
X_test = pd.DataFrame(
{
"original_content": [
x.original_context if x.original_context else "" for x in X_test
],
"original_question": [x.original_question for x in X_test],
"expected_results": [x.expected_results for x in X_test],
}
)
X_test["index"] = (
X_test["original_content"] + "\n" + X_test["original_question"]
)
X_test.set_index("index", inplace=True)

elif isinstance(X_test, pd.Series) or isinstance(X_test, list):
X_test = pd.DataFrame(
{
"index": [x.original for x in X_test],
@@ -1208,19 +1223,25 @@
if category not in ["robustness", "bias"]:
continue
for test_type, samples in data.items():
ground_truth = X_test[X_test.index.isin([i.original for i in samples])][
"expected_results"
].to_list()
if len(samples) and isinstance(samples[0], QASample):
accuracy_score1, accuracy_score2 = DegradationAnalysis.qa_evaluation(
samples, X_test
)

else:
ground_truth = X_test[
X_test.index.isin([i.original for i in samples])
]["expected_results"].to_list()

expected_results = [x.expected_results for x in samples]
actual_results = [x.actual_results for x in samples]
expected_results = [x.expected_results for x in samples]
actual_results = [x.actual_results for x in samples]

accuracy_score1 = calculate_f1_score(
*DegradationAnalysis.preprocess(ground_truth, expected_results)
)
accuracy_score2 = calculate_f1_score(
*DegradationAnalysis.preprocess(ground_truth, actual_results)
)
accuracy_score1 = calculate_f1_score(
*DegradationAnalysis.preprocess(ground_truth, expected_results)
)
accuracy_score2 = calculate_f1_score(
*DegradationAnalysis.preprocess(ground_truth, actual_results)
)

degradation = accuracy_score2 - accuracy_score1
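To see what the degradation score measures on the ner/text-classification path: both F1 scores are computed against the same ground truth, once for predictions on the original text and once for predictions on the perturbed text. A toy sketch, using sklearn's f1_score as a stand-in for langtest's internal calculate_f1_score and preprocess helpers (an assumption for illustration, not the library's actual code path):

```python
from sklearn.metrics import f1_score

ground_truth = ["POS", "NEG", "POS", "NEG"]
expected_results = ["POS", "NEG", "POS", "POS"]  # model output on original text
actual_results = ["POS", "POS", "NEG", "POS"]    # model output on perturbed text

accuracy_score1 = f1_score(ground_truth, expected_results, average="macro")
accuracy_score2 = f1_score(ground_truth, actual_results, average="macro")

# A negative degradation means the perturbation hurt the model.
degradation = accuracy_score2 - accuracy_score1
print(f"before={accuracy_score1:.2f} after={accuracy_score2:.2f} degradation={degradation:+.2f}")
```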

@@ -1286,6 +1307,69 @@ def preprocess(y_true: Union[list, pd.Series], y_pred: Union[list, pd.Series]):

return y_true, y_pred

@staticmethod
def qa_evaluation(samples: List[QASample], X_test: pd.DataFrame):
"""
Evaluates the model performance on question-answering tasks.

Args:

samples (List[QASample]): The list of QASample instances.
X_test (pd.DataFrame): The test data.

Returns:

Tuple[float, float]: The accuracy scores for the original and perturbed samples.

"""

results = {
"original": [],
"perturbed": [],
"total": len(samples),
}
# Lazy imports: is_pass_llm_eval is the LLM-based pass/fail judge and
# EVAL_MODEL is the shared evaluation model it runs against.
from ..utils.custom_types.helpers import is_pass_llm_eval
from ..langtest import EVAL_MODEL

for sample in samples:
# A None context (closed-book QA) maps to "", matching the index built in run().
context = sample.original_context or ""
index = context + "\n" + sample.original_question
ground_truth = X_test[X_test.index == index]["expected_results"].values[0]

expected_results = sample.expected_results
actual_results = sample.actual_results

original = index
perturbed = (sample.perturbed_context or "") + "\n" + sample.perturbed_question

o_result = is_pass_llm_eval(
eval_model=EVAL_MODEL,
dataset_name=sample.dataset_name,
original_question=original,
answer="\n".join(map(str, ground_truth)),
perturbed_question=perturbed,
prediction=expected_results,
)

p_result = is_pass_llm_eval(
eval_model=EVAL_MODEL,
dataset_name=sample.dataset_name,
original_question=original,
answer="\n".join(map(str, ground_truth)),
perturbed_question=perturbed,
prediction=actual_results,
)

results["original"].append(int(o_result) if o_result else 0)
results["perturbed"].append(int(p_result) if p_result else 0)

accuracy_score1 = sum(results["original"]) / max(results["total"], 1)
accuracy_score2 = sum(results["perturbed"]) / max(results["total"], 1)

return accuracy_score1, accuracy_score2
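The two scores returned here are simply pass rates: is_pass_llm_eval acts as a judge that returns a truthy value when a prediction matches the ground truth, and max(results["total"], 1) guards against division by zero on an empty sample list. The arithmetic in isolation (the judge verdicts below are made up):

```python
# Hypothetical judge verdicts for four samples: one list for predictions
# on the original inputs, one for predictions on the perturbed inputs.
original_passes = [True, True, False, True]
perturbed_passes = [True, False, False, True]
total = len(original_passes)

accuracy_score1 = sum(int(p) for p in original_passes) / max(total, 1)
accuracy_score2 = sum(int(p) for p in perturbed_passes) / max(total, 1)

print(accuracy_score1, accuracy_score2)  # 0.75 0.5
```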

@staticmethod
def show_results():
import pandas as pd
@@ -1304,43 +1388,45 @@ def show_results():
y_pos = range(len(y_labels))

for i, label in enumerate(y_labels):
# Before robustness bar
ax.broken_barh(
[(0, df["after"][i])],
(i - 0.2, 0.4),
color="#1f77b4",
label="After" if i == 0 else "",
)
# After robustness bar with adjusted width send this bar to back
# Before robustness bar (back layer)
ax.broken_barh(
[(0, df["before"][i])],
(i - 0.4, 0.8),
color="#d3d3d3",
zorder=0,
label="Before" if i == 0 else "",
)
# After robustness bar (front layer)
ax.broken_barh(
[(0, df["after"][i])],
(i - 0.2, 0.4),
color="#1f77b4",
label="After" if i == 0 else "",
zorder=1,
)

# Adjust label positions if too close
if abs(df["before"][i] - df["after"][i]) < 0.05:
offset = 0.03
else:
offset = 0.01
# Adjust label position dynamically
offset = 0.03 if abs(df["before"][i] - df["after"][i]) < 0.05 else 0.01

# Text for "after" bar
ax.text(
df["after"][i] + 0.01,
i,
i - 0.1,
f"{df['after'][i]:.2f}",
va="center",
ha="left",
color="#1f77b4",
zorder=2,
)
# Text for "before" bar
ax.text(
df["before"][i] + offset,
i,
i + 0.1,
f"{df['before'][i]:.2f}",
va="center",
ha="left",
color="black",
zorder=2,
)

ax.set_xlim(0, 1)
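The reordering in this hunk matters because matplotlib paints artists in call order unless zorder overrides it: the wide grey "before" bar must sit behind the narrow blue "after" bar, or it hides the comparison. A standalone sketch of that layering (the test names and scores are invented):

```python
import matplotlib.pyplot as plt

before = [0.85, 0.70]  # accuracy before perturbation
after = [0.60, 0.65]   # accuracy after perturbation

fig, ax = plt.subplots()
for i in range(len(before)):
    # Wide grey bar in the back (zorder=0), narrow blue bar in front (zorder=1).
    ax.broken_barh([(0, before[i])], (i - 0.4, 0.8), color="#d3d3d3", zorder=0)
    ax.broken_barh([(0, after[i])], (i - 0.2, 0.4), color="#1f77b4", zorder=1)

ax.set_xlim(0, 1)
ax.set_yticks(range(len(before)))
ax.set_yticklabels(["add_typo", "uppercase"])
plt.show()
```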