Hi,

I am trying to compute the Thai FineTasks score of Llama 3.2 1B. As shown on the leaderboard page, the score is 0.14, but when I normalize the scores and compute the final result I get 0.296401, which is quite different.

Steps

According to #509, here is my code:
import glob
import json
from collections import defaultdict

import pandas as pd
from tqdm import tqdm


def read_json(path: str) -> dict:
    """Load a lighteval results JSON and flatten it into {"task/metric": value} pairs."""
    with open(path, "r") as f:
        r = json.load(f)
    result = r["results"]
    result_tr = {}
    for task, item in result.items():
        if "lighteval|" in task:
            for k, v in item.items():
                new_key = f"{task}/{k}"
                result_tr[new_key] = v
    return result_tr
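For illustration, this is the kind of flattening read_json performs; the task and metric names below are made up, not real lighteval output:

```python
# Hypothetical payload mirroring the structure of a lighteval results JSON;
# the task/metric names are illustrative only.
example = {
    "results": {
        "lighteval|xnli_tha_mcf|0": {"acc_norm": 0.45, "acc_norm_stderr": 0.02},
        "all": {"acc_norm": 0.45},  # non-"lighteval|" entries are skipped
    }
}
flat = {}
for task, item in example["results"].items():
    if "lighteval|" in task:
        for k, v in item.items():
            flat[f"{task}/{k}"] = v
print(flat)
# {'lighteval|xnli_tha_mcf|0/acc_norm': 0.45, 'lighteval|xnli_tha_mcf|0/acc_norm_stderr': 0.02}
```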
def get_dataframe_result(
    paths: str | list[str], separate: bool = False
) -> pd.DataFrame:
    if isinstance(paths, list):
        if separate:
            return pd.concat([pd.DataFrame([read_json(p)]) for p in paths], axis=1)
        else:
            return pd.concat([pd.DataFrame([read_json(p)]) for p in paths], axis=0)
    if isinstance(paths, str):
        return pd.DataFrame([read_json(paths)])
llama3_2 = glob.glob("../results/results/meta-llama/Llama-3.2-3B/*.json")
llama3_2_df = get_dataframe_result(paths=llama3_2, separate=True).reset_index(drop=True)
llama3_2_df["model"] = "llama3_2"
random_baseline = pd.read_csv("../lighteval_baseline.csv", index_col=None)

# Keep only the baseline columns whose task also appears in the results dataframe
filter_baseline_col = []
for col in random_baseline.columns:
    task = col.split("/")[0]
    for c in df.columns:
        if task in c:
            filter_baseline_col.append(col)

filter_baseline_col = sorted(
    set(
        c
        for c in filter_baseline_col
        if "lighteval|" in c
        and ("/acc_" in c or "/f1" in c)
        and ("stderr" not in c)
    )
)
random_baseline = random_baseline.loc[:, filter_baseline_col]
# Align the baseline column names with the result columns
random_baseline.columns = [c.replace("norm", "") for c in random_baseline.columns]
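As a sanity check, the filter above keeps only accuracy/F1 columns and drops stderr columns; a quick illustration with fake column names:

```python
# Fake column names, for illustration of the filtering criteria only.
candidates = [
    "lighteval|xnli_tha_mcf|0/acc_norm",
    "lighteval|xnli_tha_mcf|0/acc_norm_stderr",
    "lighteval|thaiqa|0/f1",
]
kept = [
    c
    for c in candidates
    if "lighteval|" in c and ("/acc_" in c or "/f1" in c) and ("stderr" not in c)
]
print(kept)  # ['lighteval|xnli_tha_mcf|0/acc_norm', 'lighteval|thaiqa|0/f1']
```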
# Metric columns in the results dataframe (accuracy / F1, excluding stderr)
metric_cols_1 = [
    col
    for col in df.columns
    if "lighteval|" in col
    and ("/acc_" in col or "/f1" in col)
    and ("stderr" not in col)
]

# Same selection, but also excluding existing ":_average" columns
metric_cols_2 = [
    col
    for col in df.columns
    if "lighteval|" in col
    and ":_average" not in col  # Exclude existing averages
    and ("/acc_" in col or "/f1" in col)
    and ("stderr" not in col)
]
# Group the metric columns by task category
category_groups = defaultdict(list)
for col in metric_cols_2:
    task_name = col.split("|")[1].split("_")[0]
    if task_name == "meta":
        task_name = "meta_mmlu"
    if task_name == "community":
        task_name = "community_hellaswag"
    category = get_task_category(task_name)
    if category:
        category_groups[category].append(col)
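get_task_category is not defined in the snippet above. A minimal placeholder with the three categories that show up in the final table could look like the sketch below; the task-to-category mapping here is illustrative, not the official FineTasks grouping:

```python
# Placeholder only: replace TASK_CATEGORIES with the real FineTasks grouping.
TASK_CATEGORIES = {
    "RC": ["belebele", "thaiqa"],            # reading comprehension (example tasks)
    "NLU": ["xnli", "community_hellaswag"],  # natural language understanding (example tasks)
    "GK": ["meta_mmlu", "m3exam"],           # general knowledge (example tasks)
}


def get_task_category(task_name: str) -> str | None:
    """Return the category for a task name, or None if it is unmapped."""
    for category, tasks in TASK_CATEGORIES.items():
        if task_name in tasks:
            return category
    return None
```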
# Suffix the baseline columns so they can be looked up as f"{col}baseline"
random_baseline.columns = [f"{col}baseline" for col in random_baseline.columns]
result_df_list = []
for model in tqdm(df["model"].unique()):
    model_df = df.loc[df["model"].eq(model)].copy().reset_index(drop=True)
    for col in metric_cols_1:
        baseline_col = f"{col}baseline"
        # Skip if baseline is 0 or 1
        if (random_baseline[baseline_col] == 0).all() or (
            random_baseline[baseline_col] == 1
        ).all():
            continue
        # Rescale: (score - baseline) / (1 - baseline)
        model_df[col] = (model_df[col] - random_baseline[baseline_col]) / (
            1 - random_baseline[baseline_col]
        )
    # Calculate the mean of the rescaled metrics for each category
    for category, cols in category_groups.items():
        model_df[f"category_{category}"] = model_df[cols].mean(axis=1)
    category_cols = [col for col in model_df.columns if col.startswith("category_")]
    model_df["agg_score_macro"] = model_df[category_cols].mean(axis=1)
    result_df_list.append(model_df)

final_df = pd.concat(result_df_list, axis=0, ignore_index=True).sort_values(
    by="agg_score_macro", ascending=False
)

cols = ["model", "category_RC", "category_NLU", "category_GK", "agg_score_macro"]
final_df.loc[:, cols]
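To make the rescaling concrete with made-up numbers: a raw score of 0.45 against a random baseline of 0.25 becomes (0.45 - 0.25) / (1 - 0.25) ≈ 0.267, and agg_score_macro is simply the unweighted mean of the per-category means.

```python
# Made-up numbers illustrating the rescaling and the macro aggregation.
raw_score, baseline = 0.45, 0.25
rescaled = (raw_score - baseline) / (1 - baseline)
print(round(rescaled, 3))  # 0.267

category_means = {"RC": 0.30, "NLU": 0.20, "GK": 0.25}  # illustrative values
agg_score_macro = sum(category_means.values()) / len(category_means)
print(round(agg_score_macro, 3))  # 0.25
```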
Note that df is a dataframe containing the results for my models, one model per row, including Llama 3.2 and others (the raw results come from running lighteval accelerate). For the rescaling itself, I started from the following rescale_scores function:
def rescale_scores(
    df: pd.DataFrame, baseline_runs: list[str], metric_columns: list[str]
) -> pd.DataFrame:
    """
    Rescales scores relative to a baseline performance.
    """
    df = df.copy()
    # Calculate mean baseline performance
    baseline_mask = df["runname"].isin(baseline_runs)
    baseline = df[baseline_mask].groupby("steps")[metric_columns].mean()
    # Reindex to match all steps and interpolate missing values
    baseline = baseline.reindex(df["steps"].unique()).interpolate()
    # Merge baseline scores with main df
    df_with_baseline = df.merge(
        baseline.reset_index(), on=["steps"], how="left", suffixes=("", "_baseline")
    ).fillna(0)
    # Rescale each metric column
    for col in metric_columns:
        baseline_col = f"{col}_baseline"
        # Skip if baseline is 0 or 1
        if (df_with_baseline[baseline_col] == 0).all() or (
            df_with_baseline[baseline_col] == 1
        ).all():
            continue
        # Rescale: (score - baseline) / (1 - baseline)
        df[col] = (df[col] - df_with_baseline[baseline_col]) / (
            1 - df_with_baseline[baseline_col]
        )
    return df
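For context, rescale_scores expects a dataframe with runname and steps columns (one row per training checkpoint). A usage sketch on made-up data, just to show the expected input shape:

```python
# Made-up run names, steps and metric values, purely to illustrate the inputs.
runs = pd.DataFrame(
    {
        "runname": ["random-baseline", "random-baseline", "my-model", "my-model"],
        "steps": [1000, 2000, 1000, 2000],
        "lighteval|xnli_tha_mcf|0/acc_norm": [0.25, 0.25, 0.40, 0.50],
    }
)
rescaled = rescale_scores(
    runs,
    baseline_runs=["random-baseline"],
    metric_columns=["lighteval|xnli_tha_mcf|0/acc_norm"],
)
# The baseline rows end up at 0.0; the model rows become (score - 0.25) / (1 - 0.25).
```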
This function cannot be used directly with the output of the lighteval accelerate command, because that output has no runname or steps columns, so I omitted these lines:
# Calculate mean baseline performance
baseline_mask = df["runname"].isin(baseline_runs)
baseline = df[baseline_mask].groupby("steps")[metric_columns].mean()
# Reindex to match all steps and interpolate missing values
baseline = baseline.reindex(df["steps"].unique()).interpolate()
# Merge baseline scores with main df
df_with_baseline = df.merge(
    baseline.reset_index(), on=["steps"], how="left", suffixes=("", "_baseline")
).fillna(0)
and kept only this part:
# Rescale each metric column
for col in metric_columns:
    baseline_col = f"{col}_baseline"
    # Skip if baseline is 0 or 1
    if (df_with_baseline[baseline_col] == 0).all() or (
        df_with_baseline[baseline_col] == 1
    ).all():
        continue
    # Rescale: (score - baseline) / (1 - baseline)
    df[col] = (df[col] - df_with_baseline[baseline_col]) / (
        1 - df_with_baseline[baseline_col]
    )
return df
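In other words, the adapted rescaling amounts to a small helper like the sketch below, which packages the (score - baseline) / (1 - baseline) loop above and takes the single-row random baseline (columns suffixed with "baseline") instead of baseline runs. This is a paraphrase of the code above, not code from lighteval or #509:

```python
def rescale_with_random_baseline(
    model_df: pd.DataFrame, random_baseline: pd.DataFrame, metric_cols: list[str]
) -> pd.DataFrame:
    """Sketch: rescale each metric as (score - baseline) / (1 - baseline),
    using a single-row random-baseline dataframe whose columns end in 'baseline'."""
    out = model_df.copy()
    for col in metric_cols:
        baseline_col = f"{col}baseline"
        if baseline_col not in random_baseline.columns:
            continue
        b = random_baseline[baseline_col]
        # Skip degenerate baselines (all 0 or all 1)
        if (b == 0).all() or (b == 1).all():
            continue
        out[col] = (out[col] - b) / (1 - b)
    return out
```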
Now, I know that the random baseline can also be generated by running a dedicated command, but in the code above I used the lighteval_baseline.csv attached in #509.
Result
After I ran the above code, the final agg_score_macro I got for Llama 3.2 was 0.296401, rather than the 0.14 shown on the leaderboard.
I really want to compare this result with the leaderboard, but the documentation on how to compare these metrics does not seem clear right now, so this is as far as I could get.
Can someone help me with this, please?