# generate_open.py
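# Generates model completions for the per-subject math test sets with vLLM.
# Each problem may carry several randomized versions (problem_v1, problem_v2, ...);
# results are written per subject, plus per-version files used to resume runs.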
import argparse
import json
import os
from typing import Optional, Tuple

import torch
from vllm import LLM, SamplingParams

from utils import make_prompt, mkdir
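
# Subjects covered by the evaluation; one JSON dataset per subject lives under data/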
SUB_LIST = [
    "Abstract_algebra",
    "Combinatorics",
    "Statistics",
    "Geometry",
    "Set_theory_and_logic",
    "Probability",
    "Arithmetic",
    "Trigonometry",
    "Complex_analysis",
    "Differential_equations",
    "Calculus_-_single_variable",
    "Linear_algebra",
    "Calculus_-_multivariable",
    "Algebra",
    "Number_theory",
    "Financial_mathematics"
]
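
# Number of randomized variants per problem (fields problem_v1..problem_v3, etc.)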
VERSION_NUM = 3


def generate(model_path: str,
             dataset_path: str,
             output_path: str,
             version: int = 0,
             prompt: str = "raw",
             tensor_parallel_size: int = 1,
             test_part: Optional[Tuple[int, int]] = None) -> None:
    """
    Generate model responses using vLLM.

    Parameters:
        model_path: Path to the model to evaluate.
        dataset_path: Path to the dataset. We provide some data in eval/input as examples. To use your own dataset, convert it to a JSON list of test samples, each containing at least "answer" and "type".
        output_path: Path to save the results.
        version: The problem version to evaluate; -1 evaluates all VERSION_NUM versions.
        prompt: The prompt template to evaluate with. We provide some prompts in prompt.json, and you can add new prompts following our examples.
        tensor_parallel_size: The number of GPUs to use.
        test_part: The [low, high) range of data to process (Python slice semantics). If None, all data is processed.
    """
    # Load test data
    with open(dataset_path, 'r') as f:
        test_dataset_list = json.load(f)
    # Completions recovered from per-version result files written by earlier runs
    exists = []
    # Resolve problem versions
    if version == -1:
        # Expand each item into one entry per version, skipping any version
        # whose per-version result file already exists from an earlier run.
        test_dataset_list_new = []
        for i in range(VERSION_NUM):
            version_output_path = output_path.split(".js")[0] + f"_v{i+1}.json"
            if not os.path.exists(version_output_path):
                for item in test_dataset_list:
                    new_item = item.copy()
                    new_item['problem'] = item['problem_v' + str(i+1)]
                    new_item['answer'] = item['answer_v' + str(i+1)]
                    new_item['version'] = i+1
                    new_item['answer_type'] = item['answer_type_v' + str(i+1)]
                    new_item['options'] = item['options_v' + str(i+1)]
                    test_dataset_list_new.append(new_item)
                print(f"VERSION: {i+1} LOADED!")
            else:
                with open(version_output_path) as f:
                    exists += json.load(f)
                print(f"VERSION: {i+1} already exists!")
        # Drop the per-version fields now that each entry carries a single version
        for item in test_dataset_list_new:
            for i in range(VERSION_NUM):
                try:
                    del item['problem_v' + str(i+1)]
                    del item['answer_v' + str(i+1)]
                    del item['answer_type_v' + str(i+1)]
                    del item['options_v' + str(i+1)]
                except KeyError:
                    pass
        print("ALL VERSIONS LOADED!")
        test_dataset_list = test_dataset_list_new
        if len(test_dataset_list) == 0:
            # Everything was generated by earlier runs: merge and stop here.
            print(f"ALL VERSIONS ALREADY EVALUATED, WRITING TO {output_path}!")
            with open(output_path, 'w') as f:
                json.dump(exists, f, indent=4)
            return
    else:
        # A specific version was requested: promote its fields on every item.
        for i in range(VERSION_NUM):
            if i+1 == version:
                for item in test_dataset_list:
                    item['problem'] = item['problem_v' + str(i+1)]
                    item['answer'] = item['answer_v' + str(i+1)]
                    item['version'] = i+1  # record which version this entry is
                    item['answer_type'] = item['answer_type_v' + str(i+1)]
                    item['options'] = item['options_v' + str(i+1)]
        for item in test_dataset_list:
            for i in range(VERSION_NUM):
                del item['problem_v' + str(i+1)]
                del item['answer_v' + str(i+1)]
                del item['answer_type_v' + str(i+1)]
                del item['options_v' + str(i+1)]
        print(f"VERSION: {version} LOADED!")
    # Load the prompt template
    with open("prompt.json", 'r') as f:
        prompt_dict = json.load(f)[prompt]
    # TODO: deal with inference prompt
    test_dataset_list = make_prompt(prompt_dict, test_dataset_list)
    if test_part is not None:
        test_dataset_list = test_dataset_list[test_part[0]:test_part[1]]
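    # Process everything as a single batch; the loop below still supports
    # smaller batch sizes if batch_size is changed.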
    batch_size = len(test_dataset_list)
    for item in test_dataset_list:
        item["completion"] = ""
    llm = LLM(model=model_path,
              tensor_parallel_size=tensor_parallel_size,
              trust_remote_code=True,
              dtype=torch.bfloat16,
              swap_space=32,
              gpu_memory_utilization=0.95,
              max_model_len=4096)
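    # Stop strings: chat/EOS control tokens plus common prompt-format markers
    # that indicate the model has run past its answer.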
    stop_tokens = ["<|eot_id|>", "</s>", "Question:", "Question", "USER:", "USER",
                   "ASSISTANT:", "ASSISTANT", "Instruction:", "Instruction",
                   "Response:", "Response"]
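    # Greedy decoding (temperature 0), up to 2048 new tokens per completion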
    sampling_params = SamplingParams(temperature=0,
                                     top_p=1,
                                     max_tokens=2048,
                                     stop=stop_tokens)
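    # Run inference over the dataset in batches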
    for index_global in range(len(test_dataset_list) // batch_size):
        # Batch construction: the final batch also absorbs any remainder,
        # hence the index_global + 2 in the bound check.
        if batch_size * (index_global + 2) > len(test_dataset_list):
            curr_data_list = test_dataset_list[batch_size * index_global:]
        else:
            curr_data_list = test_dataset_list[batch_size * index_global:
                                               batch_size * (index_global + 1)]
        # Inference
        prompts = [item['prompt'] for item in curr_data_list]
        print(prompts[0])  # sanity-check the first rendered prompt
        completions = llm.generate(prompts, sampling_params)
        # Save
        for index_in_curr_batch, output in enumerate(completions):
            generated_text = output.outputs[0].text
            index_in_data_list = index_global * batch_size + index_in_curr_batch
            item = test_dataset_list[index_in_data_list]
            item["completion"] = generated_text
        # Checkpoint after every batch, merging in completions recovered from
        # earlier per-version runs, so partial progress survives interruptions.
        with open(output_path, 'w') as f:
            json.dump(exists + test_dataset_list, f, indent=4)
    # Also write per-version result files; later runs use these to resume.
    if version == -1:
        for i in range(VERSION_NUM):
            version_output_path = output_path.split(".js")[0] + f"_v{i+1}.json"
            if not os.path.exists(version_output_path):
                with open(version_output_path, "w") as f:
                    version_items = [item for item in test_dataset_list if item['version'] == i+1]
                    json.dump(version_items, f, indent=4)
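

# Example usage (hypothetical model path):
#   python generate_open.py --model_path /path/to/model --subject Algebra --version 1
#   python generate_open.py --model_path /path/to/model --subject all --version -1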
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_path',
                        type=str,
                        help="The path of the model to evaluate.")
    parser.add_argument('--subject',
                        type=str,
                        help="The subject to evaluate.",
                        default="all")
    parser.add_argument('--version',
                        type=int,
                        help="Random version to evaluate; -1 for all versions.",
                        default=-1)
    parser.add_argument('--prompt',
                        type=str,
                        help="The prompt template to use in evaluation.",
                        default="raw")
    parser.add_argument('--output_dir', type=str, default="./results/")
    parser.add_argument('--tensor_parallel_size', type=int, default=1)
    args = parser.parse_args()
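    # Results are written under <output_dir>/<model name>/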
    out_dir = os.path.join(args.output_dir, args.model_path.split("/")[-1])
    mkdir(out_dir)
    if args.subject == "all":
        if args.version != -1:
            raise NotImplementedError("Evaluating all subjects only supports --version -1; pass a specific --subject to evaluate a single version.")
        for sub in SUB_LIST:
            out_fn = os.path.join(out_dir, sub + ".json")
            if not os.path.exists(out_fn):
                print(f"Start evaluating {args.model_path.split('/')[-1]} on {sub}---version {args.version}!")
                generate(model_path=args.model_path,
                         dataset_path=os.path.join("data", sub + ".json"),
                         version=args.version,
                         output_path=out_fn,
                         prompt=args.prompt,
                         tensor_parallel_size=args.tensor_parallel_size)
            else:
                print(f"{args.model_path.split('/')[-1]} on {sub} already exists!")
    else:
        if args.version == -1:
            out_fn = os.path.join(out_dir, args.subject + ".json")
        else:
            out_fn = os.path.join(out_dir, args.subject + "_v" + str(args.version) + ".json")
        if not os.path.exists(out_fn):
            print(f"Start evaluating {args.model_path.split('/')[-1]} on {args.subject}---version {args.version}!")
            generate(model_path=args.model_path,
                     dataset_path=os.path.join("data", args.subject + ".json"),
                     version=args.version,
                     output_path=out_fn,
                     prompt=args.prompt,
                     tensor_parallel_size=args.tensor_parallel_size)
        else:
            print(f"{args.model_path.split('/')[-1]} on {args.subject} already exists!")