diff --git a/src/lighteval/tasks/extended/__init__.py b/src/lighteval/tasks/extended/__init__.py index 0e132f01..167f9c15 100644 --- a/src/lighteval/tasks/extended/__init__.py +++ b/src/lighteval/tasks/extended/__init__.py @@ -24,6 +24,7 @@ if can_load_extended_tasks(): + import lighteval.tasks.extended.codeelo.main as codeelo import lighteval.tasks.extended.ifeval.main as ifeval import lighteval.tasks.extended.lcb.main as lcb import lighteval.tasks.extended.mix_eval.main as mix_eval @@ -31,7 +32,7 @@ import lighteval.tasks.extended.olympiade_bench.main as olympiad_bench import lighteval.tasks.extended.tiny_benchmarks.main as tiny_benchmarks - AVAILABLE_EXTENDED_TASKS_MODULES = [ifeval, tiny_benchmarks, mt_bench, mix_eval, olympiad_bench, lcb] + AVAILABLE_EXTENDED_TASKS_MODULES = [ifeval, tiny_benchmarks, mt_bench, mix_eval, olympiad_bench, lcb, codeelo] else: AVAILABLE_EXTENDED_TASKS_MODULES = [] diff --git a/src/lighteval/tasks/extended/codeelo/main.py b/src/lighteval/tasks/extended/codeelo/main.py new file mode 100644 index 00000000..05d52091 --- /dev/null +++ b/src/lighteval/tasks/extended/codeelo/main.py @@ -0,0 +1,106 @@ +# MIT License + +# Copyright (c) 2025 The HuggingFace Team + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
"""
Code obtained from: https://github.com/QwenLM/CodeElo

Usage:
lighteval vllm \
    "pretrained=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B,dtype=bfloat16,data_parallel_size=8,max_model_length=32768,gpu_memory_utilisation=0.8,generation_parameters={temperature: 0.7,repetition_penalty:1.1,top_p:0.8,top_k:20}" \
    "extended|codeelo:rating|0|0" \
    --use-chat-template
"""

from typing import Any

import numpy as np
from aenum import extend_enum

from lighteval.metrics.metrics import MetricCategory, Metrics, MetricUseCase, SampleLevelMetric
from lighteval.tasks.extended.codeelo.utils import LANG_MAP, extract_code_blocks, make_html_problem, submit_code
from lighteval.tasks.lighteval_task import Doc, LightevalTaskConfig


def codeelo_prompt_fn(line: dict[str, Any], task_name: str = "codeelo:rating") -> Doc:
    """Build the generation prompt for one CodeElo problem.

    Args:
        line: One dataset row. Its "id" field is stored for later submission and
            the whole row is rendered by ``make_html_problem``.
        task_name: Name recorded on the resulting ``Doc``.

    Returns:
        A ``Doc`` whose query is the fixed instruction followed by the rendered
        problem statement. There is no gold answer (the single choice is empty);
        the problem id is carried in ``specific["problem_id"]``.
    """
    instruction = "You are a coding expert. Given a competition-level coding problem, you need to write a C++ program to solve it. You may start by outlining your thought process. In the end, please provide the complete code in a code block enclosed with ``` ```."
    return Doc(
        task_name=task_name,
        query=f"{instruction}\n\n{make_html_problem(line)}",
        choices=[""],
        gold_index=0,
        specific={
            "problem_id": line["id"],
        },
    )


def codeelo(predictions: list[str], formatted_doc: Doc, **kwargs) -> float:
    """Submit the first fenced code block of each prediction to the CodeElo judge.

    For every prediction the first code block is extracted, its language tag is
    mapped to a judge language id through ``LANG_MAP`` and the code is submitted
    for the problem stored in ``formatted_doc.specific["problem_id"]``.

    Args:
        predictions: Raw model generations for one problem.
        formatted_doc: The ``Doc`` produced by ``codeelo_prompt_fn``.
        **kwargs: Ignored; accepted for compatibility with the metric interface.

    Returns:
        Currently always ``0.0``. The early returns mark the failure cases (no
        code block, unknown language, failed submission); retrieving verdicts and
        turning them into a score is not implemented yet.
    """
    # TODO: For the moment we are only considering a single generation per problem, there will be 8 (like Pass@1:8)
    for prediction in predictions:
        blocks = extract_code_blocks(prediction)
        if not blocks:
            # No fenced block at all: nothing to submit (indexing [0] here would raise).
            return 0.0

        # extract_code_blocks yields (language, code) pairs, in that order.
        lang, code = blocks[0]
        lang_id = LANG_MAP.get(lang.strip().lower())
        if lang_id is None:
            return 0.0

        submission_id = submit_code(formatted_doc.specific["problem_id"], lang_id, code, tag="")
        if isinstance(submission_id, str):
            # submit_code reports a failure as an error-message string.
            return 0.0

    # TODO: poll the submission verdicts and compute the rating; until then the score is 0.
    return 0.0


codeelo_metric = SampleLevelMetric(
    metric_name="codeelo_rating@8",  # Generate 8 outputs per problem
    category=MetricCategory.GENERATIVE_SAMPLING,
    use_case=MetricUseCase.REASONING,
    higher_is_better=True,
    sample_level_fn=codeelo,
    corpus_level_fn=np.mean,
)


extend_enum(Metrics, "codeelo_metric", codeelo_metric)


task = LightevalTaskConfig(
    name="codeelo:rating",
    suite=["extended"],
    prompt_function=codeelo_prompt_fn,
    hf_repo="Qwen/CodeElo",
    hf_subset="test",
    hf_avail_splits=["test"],
    evaluation_splits=["test"],
    generation_size=32768,
    metric=[Metrics.codeelo_metric],
    stop_sequence=[],  # no stop sequence, will use EOS token
    trust_dataset=True,
    version=0,
)


TASKS_TABLE = [task]

# diff --git a/src/lighteval/tasks/extended/codeelo/utils.py b/src/lighteval/tasks/extended/codeelo/utils.py
# new file mode 100644
# index 00000000..1065ccb5
# --- /dev/null
# +++ b/src/lighteval/tasks/extended/codeelo/utils.py
# @@ -0,0 +1,266 @@
# MIT License

# Copyright (c) 2025 The HuggingFace Team

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute,
# sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import os
import re
import time
from typing import Any

import requests
from scipy import optimize


URL_CODEFORCES_STANDINGS = "https://codeforces.com/api/contest.standings?contestId={contest_id}&showUnofficial=false"
URL_RATING_CHANGES = "https://codeforces.com/api/contest.ratingChanges?contestId={contest_id}"

# Timeout (seconds) for the Codeforces API calls, so a stalled connection cannot hang the evaluation.
_CODEFORCES_TIMEOUT = 60
# A contest is only usable for rating estimation with more than this many rated participants.
_MIN_RATED_PARTICIPANTS = 200
# Fenced block: optional language tag (allowing tags such as "node.js" or "c++"), newline, body, newline, fence.
_CODE_BLOCK_RE = re.compile(r"```([\w.+#-]*)\n(.*?)\n```", re.DOTALL)


def make_html_problem(line: dict[str, Any]) -> str:
    """Render one CodeElo dataset row as an HTML problem statement.

    NOTE(review): the markup tags of the string literals in this function were
    lost in the copy under review; the h1/h2/div structure below is a
    reconstruction and should be confirmed against the upstream CodeElo code.

    Args:
        line: Dataset row with the keys "title", "description", "input",
            "output", "interaction", "note" and "examples". Each example is
            indexed as ``example[0]`` (input) and ``example[1]`` (output).

    Returns:
        The statement as a single HTML string.
    """
    title = line["title"]
    input_, output = line["input"], line["output"]
    interaction = line["interaction"]
    note = line["note"]
    examples = line["examples"]

    parts = [
        f"<html><body><h1>{title}</h1>",
        f"<h2>Description</h2><div>{line['description']}</div>",
    ]

    # Use interaction if provided, otherwise use input and output
    if interaction:
        parts.append(f"<h2>Interaction</h2><div>{interaction}</div>")
    else:
        parts.append(f"<h2>Input</h2><div>{input_}</div>")
        parts.append(f"<h2>Output</h2><div>{output}</div>")

    # The example is always present
    example_text = "".join(
        f"<div>Input:\n{example[0]}</div><div>Output:\n{example[1]}</div>" for example in examples
    )
    parts.append(f"<h2>{'Example' if len(examples) == 1 else 'Examples'}</h2>{example_text}")

    if note:
        parts.append(f"<h2>Note</h2><div>{note}</div>")

    parts.append("</body></html>")
    return "".join(parts)


# Let the whole mapping defined, but the output should be cpp
LANG_MAP = {
    "kotlin": 88,  # Kotlin 1.9.21
    "cpp": 91,  # GNU G++23 14.2 (64 bit, msys2)
    "ruby": 67,  # Ruby 3.2.2
    "d": 28,  # D DMD32 v2.105.0
    "python": 70,  # PyPy 3.10 (7.3.15, 64bit)
    "pascal": 51,  # PascalABC.NET 3.8.3
    "rust": 75,  # Rust 1.75.0 (2021)
    "go": 32,  # Go 1.22.2
    "node.js": 55,  # Node.js 15.8.0 (64bit)
    "haskell": 12,  # Haskell GHC 8.10.1
    "javascript": 34,  # JavaScript V8 4.8.0
    "csharp": 79,  # C# 10, .NET SDK 6.0
    "perl": 13,  # Perl 5.20.1
    "java": 87,  # Java 21 64bit
    "ocaml": 19,  # OCaml 4.02.1
    "delphi": 3,  # Delphi 7
    "php": 6,  # PHP 8.1.7
    "scala": 20,  # Scala 2.12.8
    "c": 43,  # GNU GCC C11 5.1.0
}


def extract_code_blocks(text: str) -> list[tuple[str, str]]:
    """Extract fenced code blocks from a text.

    Args:
        text: Text that may contain Markdown-style fenced code blocks.

    Returns:
        One ``(language, code)`` tuple per block, in order of appearance. The
        language is the tag after the opening fence ("" when absent). The list
        is empty when no block is found.
    """
    return _CODE_BLOCK_RE.findall(text)


def _api_credentials() -> tuple[str, str]:
    """Read the CodeElo API token and base URL from the environment, or raise ValueError."""
    token = os.getenv("CODEELO_TOKEN")
    base_url = os.getenv("CODEELO_BASE_URL")
    if not token or not base_url:
        raise ValueError("Please set the CODEELO_TOKEN and CODEELO_BASE_URL environment variables.")
    return token, base_url


def submit_code(prob: str, lang_id: int, code: str, tag: str = "", retry: int = 3, delay: int = 10) -> int | str:
    """Submit code for a specific problem to the CodeElo API endpoint.

    Args:
        prob (str): The problem identifier to submit code for.
        lang_id (int): The programming language id of the submitted code (see ``LANG_MAP``).
        code (str): The actual code to be submitted.
        tag (str, optional): Additional tag for the submission. Defaults to empty string.
        retry (int, optional): Number of additional attempts after the first one fails.
        delay (int, optional): Seconds to wait between attempts.

    Returns:
        The "submission_id" field of the JSON response on success. If every
        attempt fails, an error message string starting with
        "Failed to submit code".

    Raises:
        ValueError: If CODEELO_TOKEN or CODEELO_BASE_URL is not set.

    Example:
        >>> result = submit_code("2000A", 70, "print('Hello')")
        >>> result = submit_code("2000A", 91, "int main() {}", "test", retry=3)
    """
    token, base_url = _api_credentials()

    url = f"{base_url}/submit_code"
    headers = {"Content-Type": "application/json", "Authorization": token}
    payload = {"prob": prob, "lang": lang_id, "code": code, "tag": tag}

    last_error: Exception | None = None
    # One initial attempt plus `retry` retries; the same `delay` applies to every retry.
    for attempts_left in range(max(retry, 0), -1, -1):
        try:
            response = requests.post(url, json=payload, headers=headers, timeout=60)
            if response.status_code != 200:
                raise RuntimeError("Failed to submit code")
            return response.json()["submission_id"]
        except Exception as e:  # best-effort boundary: any failure is retried, then reported as a string
            last_error = e
            if attempts_left > 0:
                print(f"Failed to submit code, retrying in {delay} seconds")
                time.sleep(delay)

    return f"Failed to submit code: {str(last_error)}"


def _score_and_penalty(problems: list[dict[str, Any]], problem_status: dict[str, Any]) -> tuple[float, int]:
    """Return the contest score and penalty earned by the submission verdicts in ``problem_status``."""
    score = 0
    penalty = 0
    for problem in problems:
        prob = f"{problem['contestId']}{problem['index']}"
        for ith, status in enumerate(problem_status.get(prob, ())):
            if status != "AC":
                continue
            # Only the first accepted attempt counts; each earlier attempt costs 50 points / 10 penalty.
            if "points" in problem:
                score += max(0, problem["points"] - 50 * ith)
            else:
                score += 1
            penalty += ith * 10
            break
    return score, penalty


def _target_rank(rows: list[dict[str, Any]], score: float, penalty: int) -> int:
    """Return the index of the first standings row that the given score/penalty beats, or ``len(rows)``."""
    for i, row in enumerate(rows):
        if row["points"] < score or (row["points"] == score and row["penalty"] > penalty):
            return i
    return len(rows)


def calc_elo_rating(contest_id: int, problem_status: dict[str, Any]) -> int:
    """Estimate the Elo rating achieved in a Codeforces contest.

    The official standings and rating changes of the contest are downloaded and
    restricted to the handles present in both. The score and penalty implied by
    ``problem_status`` are placed in the standings, and the rating whose
    expected rank equals that position is returned.

    Args:
        contest_id (int): Codeforces contest id.
        problem_status (dict[str, Any]): Maps a problem key
            ``f"{contestId}{index}"`` to the ordered verdicts of the attempts on
            that problem; the verdict "AC" marks an accepted attempt.

    Returns:
        int: The estimated rating, or 0 when the API responses carry no result or
        the contest does not have more than 200 participants common to both lists.
    """
    standings = requests.get(
        URL_CODEFORCES_STANDINGS.format(contest_id=contest_id), timeout=_CODEFORCES_TIMEOUT
    ).json()
    rating_changes = requests.get(
        URL_RATING_CHANGES.format(contest_id=contest_id), timeout=_CODEFORCES_TIMEOUT
    ).json()

    if "result" not in standings or "result" not in rating_changes:
        print("No result, return 0")
        return 0

    try:
        rows = standings["result"]["rows"]
        changes = rating_changes["result"]
        # Keep only participants that appear in BOTH lists (set intersection).
        handle_set = {row["party"]["members"][0]["handle"] for row in rows} & {
            change["handle"] for change in changes
        }
        rows = [row for row in rows if row["party"]["members"][0]["handle"] in handle_set]
        changes = [change for change in changes if change["handle"] in handle_set]
    except (KeyError, IndexError, TypeError) as e:
        print(e)
        print("No result, return 0")
        return 0

    if len(rows) != len(changes) or len(rows) <= _MIN_RATED_PARTICIPANTS:
        print("No result, return 0")
        return 0

    max_rating = max(0, max(change["oldRating"] for change in changes))

    # Obtain score and penalty
    score, penalty = _score_and_penalty(standings["result"]["problems"], problem_status)

    # Obtain target rank among the participants
    # NOTE(review): the rank is a 0-based row index, as in the code this was ported from — confirm intended.
    rank = _target_rank(rows, score, penalty)

    return find_rating({"result": changes}, rank, max_rating=max_rating)


def calculate_elo_expectation(candidate_rating: float, player_ratings: list[float]) -> float:
    """Calculate the expected rank of a candidate against the given players.

    The result is 1 plus, for every player, the Elo probability that this
    player finishes ahead of the candidate.
    """
    return 1 + sum(1 / (1 + 10 ** ((candidate_rating - rating) / 400)) for rating in player_ratings)


def find_rating(rating_changes: dict, target_rank: float, max_rating: int = 4000) -> int:
    """Find the rating whose expected rank equals ``target_rank``.

    Args:
        rating_changes: Mapping whose "result" entry is a list of records with
            an "oldRating" field.
        target_rank: Rank to match.
        max_rating: Highest rating among the participants; the search interval
            is ``[0, max_rating + 100]``.

    Returns:
        The matching rating truncated to an int. When the target rank cannot be
        reached inside the interval the nearest end of the interval is returned
        instead of letting the root finder raise.
    """
    old_ratings = [change["oldRating"] for change in rating_changes["result"]]

    def rating_difference(x: float) -> float:
        return calculate_elo_expectation(x, old_ratings) - target_rank

    lower, upper = 0, max_rating + 100

    # The expected rank decreases as the rating grows, so the difference is
    # positive at `lower` and negative at `upper` whenever a root exists.
    if rating_difference(lower) <= 0:
        return lower
    if rating_difference(upper) >= 0:
        return upper

    result = optimize.root_scalar(rating_difference, bracket=[lower, upper], method="brentq")

    return int(result.root)


def check_status(submission_id, retry: int = 3, delay: int = 10) -> dict[str, Any] | str:
    """Check the status of a specific submission using the CodeElo API endpoint.

    Args:
        submission_id (str): The ID of the submission to check.
        retry (int, optional): Number of additional attempts after the first one fails.
        delay (int, optional): Seconds to wait between attempts.

    Returns:
        The "status_canonical" field of the JSON response on success. If every
        attempt fails, an error message string starting with
        "Failed to get problem".

    Raises:
        ValueError: If CODEELO_TOKEN or CODEELO_BASE_URL is not set.

    Example:
        >>> status = check_status("12345")
        >>> status = check_status("67890", retry=3)
    """
    token, base_url = _api_credentials()

    url = f"{base_url}/check_status"
    headers = {"Content-Type": "application/json", "Authorization": token}
    params = {"submission_id": submission_id}

    last_error: Exception | None = None
    # One initial attempt plus `retry` retries; the same `delay` applies to every retry.
    for attempts_left in range(max(retry, 0), -1, -1):
        try:
            response = requests.get(url, headers=headers, params=params, timeout=20)
            if response.status_code != 200:
                raise RuntimeError(f"Unexpected status code {response.status_code}")
            return response.json()["status_canonical"]
        except Exception as e:  # best-effort boundary: any failure is retried, then reported as a string
            last_error = e
            if attempts_left > 0:
                print(f"Failed to get problem, retrying in {delay} seconds")
                time.sleep(delay)

    return f"Failed to get problem: {str(last_error)}"