inclusionAI
diff --git a/‎Dockerfile
-1 b/‎Dockerfile
-1
diff --git a/‎evaluation/eval_and_aggregate.py
+7-4 b/‎evaluation/eval_and_aggregate.py
+7-4
diff --git a/‎evaluation/math_eval.py
+1-1 b/‎evaluation/math_eval.py
+1-1
diff --git a/‎evaluation/requirements.txt
+4-2 b/‎evaluation/requirements.txt
+4-2
diff --git a/‎evaluation/sh/install_deps_and_eval.sh
+7 b/‎evaluation/sh/install_deps_and_eval.sh
+7
diff --git a/‎examples/README.md
+3-3 b/‎examples/README.md
+3-3
diff --git a/‎examples/README_zh.md
+3-3 b/‎examples/README_zh.md
+3-3
diff --git a/‎examples/train_code_batch_1.5B_n16.sh
+52 b/‎examples/train_code_batch_1.5B_n16.sh
+52
diff --git a/‎examples/train_code_small_on_ray.sh
+123 b/‎examples/train_code_small_on_ray.sh
+123
diff --git a/‎examples/train_small_on_ray.sh
+1-1 b/‎examples/train_small_on_ray.sh
+1-1
diff --git a/‎functioncall/__init__.py
+1 b/‎functioncall/__init__.py
+1
diff --git a/‎functioncall/base/__init__.py b/‎functioncall/base/__init__.py
@@ -39,7 +39,6 @@ RUN cd /vllm && \
     python3 use_existing_torch.py && \
     pip3 install -r requirements-build.txt && \
     MAX_JOBS=64 pip3 install -e . --no-build-isolation
-RUN yes | pip3 uninstall uvloop
 RUN pip3 install opencv-python-headless==4.5.4.58
 
 RUN apt-get update && apt-get install -y python3.10-venv
 
@@ -5,6 +5,7 @@
 from glob import glob
 
 import numpy as np
+import wandb
 from rm_maj_eval import group_pred
 from tqdm import tqdm
 from transformers import AutoTokenizer
@@ -29,6 +30,7 @@ def parse_args():
     parser.add_argument("--overwrite", action="store_true")
     parser.add_argument("--evaluate_train", action="store_true")
     parser.add_argument("--max_gen_tokens", default=32768, type=int)
+
     args = parser.parse_args()
     if args.output_path is None:
         args.output_path = args.model_path
@@ -145,7 +147,7 @@ def process_single_data_name(args, data_name, base_dir, tokenizer):
 
 if __name__ == "__main__":
     args = parse_args()
-
+    print(f"Evaluation output to {args.output_path}")
     assert args.num_sample_nodes * args.samples_per_node >= args.n_sampling
 
     eval_dir = (
@@ -155,6 +157,7 @@ def process_single_data_name(args, data_name, base_dir, tokenizer):
     )
 
     base_dir = os.path.join(args.output_path, eval_dir)
+    os.makedirs(base_dir, exist_ok=True)
     tokenizer = AutoTokenizer.from_pretrained(args.model_path)
     result_path = os.path.join(base_dir, f"aggregate_parallel_{args.prompt_type}.json")
 
@@ -228,10 +231,10 @@ def process_single_data_name(args, data_name, base_dir, tokenizer):
         from prettytable import PrettyTable
 
         table = PrettyTable()
-        filed_names = ["dataset"] + list(all_results[args.data_names[0]].keys())
-        table.field_names = filed_names
+        field_names = ["dataset"] + list(all_results[args.data_names[0]].keys())
+        table.field_names = field_names
         for k, v in all_results.items():
-            table.add_row([k, *[round(v[x], 1) for x in filed_names[1:]]])
+            table.add_row([k, *[round(v[x], 1) for x in field_names[1:]]])
 
         print(table)
     except:
 
@@ -137,7 +137,7 @@ def generate_in_parallel(requests, model_args, sampling_params, data_parallel_si
     def run_inference_one_model(
         model_args: dict, sampling_params, requests, cuda_visisble_devices
     ):
-        os.environ["VLLM_LOGGING_LEVEL"] = "DEBUG"
+        os.environ["VLLM_LOGGING_LEVEL"] = "INFO"
         os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
             [str(x) for x in cuda_visisble_devices]
         )
 
@@ -3,7 +3,7 @@ vllm
 tqdm
 datasets
 torch
-transformers
+transformers==4.47.0
 python_dateutil
 flash_attn
 
@@ -12,4 +12,6 @@ sympy==1.12
 antlr4-python3-runtime==4.11.1 # ! The version needs to be compatible with sympy.
 word2number
 Pebble
-timeout-decorator
+prettytable
+timeout-decorator
+wandb
@@ -0,0 +1,7 @@
+#!/bin/bash
+# Users should run this script under AReaL directory
+
+/usr/bin/python3 -m pip install -e evaluation/latex2sympy/
+/usr/bin/python3 -m pip install -r evaluation/requirements.txt
+
+cd evaluation && /usr/bin/python3 eval_and_aggregate.py --model_path $1 --output_path $2 --data_names $3 --max_gen_tokens $4 --prompt_type $5
@@ -13,14 +13,14 @@ Check if your hardware meets these minimum requirements:
 | Memory | 1 TB |1 TB per node|1 TB per node| 1 TB per node |1 TB per node|
 | Network | NVSwitch |NVSwitch + RoCE 3.2 Tbps|NVSwitch + RoCE 3.2 Tbps| NVSwitch + RoCE 3.2 Tbps |NVSwitch + RoCE 3.2 Tbps|
 | Storage | 1TB |Shared storage (NAS) 10TB|Shared storage (NAS) 10TB| Shared storage (NAS) 10TB |Shared storage (NAS) 10TB|
-| **Total Time (Hours)** | **520** | **150** | **50** | **410** | **130** |
+| **Total Time (Hours)** | **520** | **150** | **50** | **680** | **200** |
 
 Notes:
 - GPUs need to have 80GB memory. Other GPU models with similar specs are acceptable.
 - Single-node training can use local storage, but multi-node training requires shared storage.
 - Total Training Time = Number of Epochs × Number of Steps per Epoch × Training Time per Step
   - Number of Epochs defaults to 10.
-  - Number of Steps per Epoch depends on the dataset size. In this tutorial, the dataset approximately requires 40 steps per epoch for the 1.5B model and 20 steps for the 7B model. The batch sizes for 1.5B and 7B models are different.
+  - The number of steps per epoch depends on the dataset size and the batch size. For example, with a dataset of 40,315 samples and a batch size of 1024, one epoch corresponds to 40,315 / 1024 ≈ 39.37 steps, so each epoch runs either 39 or 40 training steps.
   - Training Time per Step depends on the number of GPUs used.
 
 ## Software Requirements
@@ -358,7 +358,7 @@ The last entry is used to explain the meaning of key fields:
   - `importance_weight`: The average importance sampling ratio across all tokens in the PPO loss. This value is typically close to 1.
   - `actor_clip_ratio`: The ratio of tokens clipped in the PPO loss to the total number of tokens. This is usually less than 0.1.
   - `actor_loss`: The PPO loss. **It does not show a clear upward or downward trend during training** and should not be used as a reference for model performance.
-  - `avg_seq_len`: The average length of all sampled answers in this step. In a full multi-stage training process, this value will first decrease and then increase.
+  - `avg_seq_len`: The average length of all sequences (i.e., prompts with sampled answers) in this step. In a full multi-stage training process, this value will first decrease and then increase.
   - `no_eos_ratio`: The ratio of sampled answers truncated due to exceeding the maximum generation length. An increase in this value indicates that the average length of answers is increasing.
 
 # Evaluation
 
@@ -14,7 +14,7 @@
 | 内存    | 1 TB    |每节点 1 TB|每节点 1 TB    |每节点 1 TB    |每节点 1 TB    |
 | 通信    | NVSwitch    |NVSwitch+RoCE 带宽 3.2 Tbps|NVSwitch+RoCE 带宽 3.2 Tbps|NVSwitch+RoCE 带宽 3.2 Tbps|NVSwitch+RoCE 带宽 3.2 Tbps|
 | 存储    | 1TB    |共享存储（NAS）10TB |共享存储（NAS）10TB |共享存储（NAS）10TB |共享存储（NAS）10TB |
-|总训练时间（小时）|520|150|50|410|130|
+|总训练时间（小时）|520|150|50|680|200|
 
 关于硬件要求的说明：
 
@@ -27,7 +27,7 @@
 -  总训练时间 = Epoch 数量 * 每个 Epoch 的 Step 数量 * 单步训练时间
 
     - Epoch 数量默认为 10
-    - 每个 Epoch 的 Step 数量与数据集大小有关。利用我们的数据集， 1.5B 模型每个epoch需要训练 39 步，7B 模型每个epoch需要训练 19 步。不同大小模型使用的batch size也不同
+    - 每个 Epoch 的 Step 数量与数据集大小和 Batch Size 有关。比如数据集为 40315 条，Batch Size 为 1024 时，每个 Epoch 需要训练 40315 / 1024 = 39.37 步，最终一个 Epoch 最少训练 39 步，最多训练 40 步。
     - 单步训练时间与 GPU 卡数有关
 
 ## 软件要求
@@ -380,7 +380,7 @@ python3 -m realhf.apps.quickstart ppo-math --show-args
 + `importance_weight`: PPO loss中重要性采样比率在所有token上的平均值，通常接近1。
 + `actor_clip_ratio`: PPO loss中被clip掉的token占所有token的比率，通常小于0.1。
 + `actor_loss`: PPO loss，**不会随着训练过程有明显的上升或下降趋势**，不应作为模型表现的参考。
-+ `avg_seq_len`: 这一步中采样的所有答案的平均长度。在完整的多阶段训练中，这个值会先下降再上升。
++ `avg_seq_len`: 这一步中采样的所有序列（即提示词和答案相加）的平均长度。在完整的多阶段训练中，这个值会先下降再上升。
 + `no_eos_ratio`: 这一步中采样的所有答案因为超出最大生成长度被截断的比率。这个值上升也代表了答案的平均长度在上升。
 
 # 评估
 
@@ -0,0 +1,52 @@
+#!/bin/bash
+# Run a batch of `train_code_small_on_ray.sh` jobs with bounded concurrency.
+#
+# A FIFO opened on file descriptor 3 is used as a counting semaphore: it is
+# pre-filled with MAX_WORKERS tokens, every task consumes one token before it
+# starts and writes one back when it finishes.
+
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+
+EXP_NAME=ppo-zero-distill-1.5B-n16-jun1
+MODEL_NAME="DeepSeek-R1-Distill-Qwen-1.5B"
+DATASET_NAME="codeparrot-apps-test.jsonl"
+NODES=16
+ALLOCATION_MODE="vllm.d64p1m1+d32p2m1"
+
+LOG_DIR="/storage/ray/train_batch_logs/${EXP_NAME}/$(date +'%Y%m%d-%H%M%S')"
+mkdir -p "${LOG_DIR}"
+echo "Log Dir: ${LOG_DIR}"
+
+# Concurrent task slots: 16 (presumably the total node count of the cluster --
+# TODO confirm) divided by the nodes each task uses.
+MAX_WORKERS=$(( 16 / NODES ))
+if (( MAX_WORKERS < 1 )); then
+    # With zero tokens in the semaphore the first `read -u3` would block forever.
+    echo "NODES=${NODES} exceeds the 16 nodes assumed by this script; no task can be scheduled." >&2
+    exit 1
+fi
+
+# Create the semaphore: open the FIFO read/write on fd 3, then unlink its path
+# (the open descriptor keeps it usable).
+FIFO_NAME=$(mktemp -u)
+mkfifo "$FIFO_NAME"
+exec 3<>"$FIFO_NAME"
+rm -f "$FIFO_NAME"
+
+# One token (an empty line) per worker slot.
+for ((i=0; i<MAX_WORKERS; i++)); do
+    echo >&3
+done
+
+
+# Each entry is the positional argument list for train_code_small_on_ray.sh:
+# EXP_NAME MODEL_NAME DATASET_NAME TRAIN_BATCH_SIZE GROUP_SIZE NODES
+# ALLOCATION_MODE MAX_NEW_TOKENS MAX_NUM_SEQS PPO_MBS KL_CTL
+ALL_PARAMS=(
+    "${EXP_NAME} ${MODEL_NAME} ${DATASET_NAME} 1024 8 ${NODES} ${ALLOCATION_MODE} 16384 128 1 0.001"
+    #"${EXP_NAME} ${MODEL_NAME} ${DATASET_NAME} 1024 8 ${NODES} ${ALLOCATION_MODE} 16384 128 1 0.001"
+    #"${EXP_NAME} ${MODEL_NAME} ${DATASET_NAME} 1024 8 ${NODES} ${ALLOCATION_MODE} 16384 128 1 0.001"
+)
+
+echo "Task Count: ${#ALL_PARAMS[@]}"
+
+for ((i=0; i<${#ALL_PARAMS[@]}; i++)); do
+    # Wait for a free worker slot.
+    read -u3
+
+    {
+        echo "$(date +"%Y-%m-%d %H:%M.%S") Task $i started: ${ALL_PARAMS[$i]}"
+        bash -c "bash ${SCRIPT_DIR}/train_code_small_on_ray.sh ${ALL_PARAMS[$i]} &> ${LOG_DIR}/${i}.log"
+        # Save the job's status right away: expanding `$?` after the `$(date ...)`
+        # substitution in the message below would report date's status instead.
+        rc=$?
+        echo "$(date +"%Y-%m-%d %H:%M.%S") Task $i completed with exit code: ${rc}, ${ALL_PARAMS[$i]}"
+        sleep 120
+        # Return the slot to the semaphore.
+        echo >&3
+    } &
+
+    # Stagger task launches.
+    sleep 120
+done
+
+wait
+
+exec 3>&-
+echo "All tasks completed"
@@ -0,0 +1,123 @@
+#!/bin/sh
+# Launch one `ppo-code` experiment through `realhf.apps.quickstart`.
+#
+# Positional arguments (all 11 are required):
+#    1  EXP_NAME          experiment name (logs/checkpoints are indexed by it)
+#    2  MODEL_NAME        directory name under /storage/models
+#    3  DATASET_NAME      file name under /storage/datasets
+#    4  TRAIN_BATCH_SIZE  passed as dataset.train_bs_n_seqs
+#    5  GROUP_SIZE        passed as group_size
+#    6  NODES             passed as n_nodes
+#    7  ALLOCATION_MODE   passed as allocation_mode
+#    8  MAX_NEW_TOKENS    passed as ppo.gen.max_new_tokens
+#    9  MAX_NUM_SEQS      passed as actor.vllm.max_num_seqs
+#   10  PPO_MBS           passed as ppo.ppo_n_minibatches
+#   11  KL_CTL            passed as ppo.kl_ctl
+# Fail early: with a missing argument the `expr` calls below would error out
+# and the experiment would be launched with empty overrides.
+[ "$#" -ge 11 ] || { echo "Usage: $0 EXP_NAME MODEL_NAME DATASET_NAME TRAIN_BATCH_SIZE GROUP_SIZE NODES ALLOCATION_MODE MAX_NEW_TOKENS MAX_NUM_SEQS PPO_MBS KL_CTL" >&2; exit 1; }
+
+MODEL_FAMILY=qwen2
+
+EXP_NAME="$1"
+MODEL_NAME="$2"
+DATASET_NAME="$3"
+TRAIN_BATCH_SIZE="$4"
+GROUP_SIZE="$5"
+NODES="$6"
+ALLOCATION_MODE="$7"
+MAX_NEW_TOKENS=$8
+MAX_NUM_SEQS=$9
+PPO_MBS=${10}
+KL_CTL=${11}
+
+# 2048 matches dataset.max_prompt_len below; the per-micro-batch token budget
+# adds a further 1024 tokens on top of prompt + generation length.
+MAX_TOKEN_PER_MB=$(expr 2048 + ${MAX_NEW_TOKENS} + 1024)
+MAX_SEQ_LEN_TO_CAPTURE=$(expr 2048 + ${MAX_NEW_TOKENS})
+
+BASE_MODEL_PATH="/storage/models/${MODEL_NAME}"
+
+# original data
+DATA_PATH="/storage/datasets/${DATASET_NAME}"
+REAL_CODE_METADATA_PATH="/storage/datasets/codeparrot-apps-test.jsonl"
+
+# Option 1: The experiment runs locally with subprocesses.
+# MODE=local
+# Option 2: The experiment runs in a Ray cluster
+# MODE=ray
+# Option 3: The experiment runs in a SLURM + pyxis cluster
+# Using the slurm mode requires a cluster spec file
+# and setting CLUSTER_SPEC_PATH to the path of it.
+MODE=ray
+
+# `experiment_name` and `trial_name` can be arbitrary.
+# Logs and saved checkpoints will be indexed by them.
+#EXP_NAME=ppo-zero--${MODEL_NAME}--${DATASET_NAME}
+#EXP_NAME=ppo-zero-distill-1.5B-default
+TRIAL_NAME="${TRAIN_BATCH_SIZE}x${GROUP_SIZE}-n${NODES}"
+
+# The allocation mode (the parallelism strategy of each model function call,
+# i.e., actor generation, critic inference, actor train, etc.) is taken from
+# the 7th argument and passed through unchanged as `allocation_mode`.
+# The number of GPUs is `n_nodes` * `n_gpus_per_node` (set explicitly to 8 below).
+
+# The `ppo-code` subcommand selects the experiment type.
+# Saving and checkpointing are configured through the `exp_ctrl.*` options below.
+# The `ppo` option is used to control the generation and PPO algorithm hyperparameters.
+# Note that the performance of PPO is sensitive to the pre-trained model and hyperparameters.
+# It's the user's responsibility to tune them appropriately.
+
+# Drop any inherited CLUSTER_SPEC_PATH; the launcher gets its value from the
+# command-scoped assignment below.
+unset CLUSTER_SPEC_PATH
+CLUSTER_SPEC_PATH=/storage/ray/cluster_config_on_ray.json \
+REAL_CODE_METADATA_PATH=${REAL_CODE_METADATA_PATH} \
+FUNCTIONCALL_SERVICE_DOMAIN="" \
+REAL_GPU_MEMORY_KILL_THRESHOLD=1 \
+python3 -m realhf.apps.quickstart ppo-code \
+    mode=$MODE \
+    experiment_name=$EXP_NAME \
+    trial_name=$TRIAL_NAME \
+    wandb.mode=disabled \
+    exp_ctrl.total_train_epochs=1 \
+    exp_ctrl.save_freq_epochs=1 \
+    exp_ctrl.ckpt_freq_secs=600 \
+    group_size=${GROUP_SIZE} \
+    group_adv_norm=False \
+    use_dense_reward=False \
+    reward_delta=True \
+    rw_type=sparse \
+    check_xml_format=False \
+    actor.type._class=$MODEL_FAMILY \
+    actor.path=$BASE_MODEL_PATH \
+    actor.vllm.hybrid_train=False \
+    actor.vllm.enforce_eager=False \
+    actor.vllm.max_seq_len_to_capture=${MAX_SEQ_LEN_TO_CAPTURE} \
+    actor.vllm.max_num_seqs=${MAX_NUM_SEQS} \
+    actor.vllm.gpu_memory_utilization=1 \
+    actor.vllm.swap_space=64 \
+    critic.type._class=$MODEL_FAMILY \
+    critic.type.is_critic=True \
+    critic.init_critic_from_actor=True \
+    critic.path=$BASE_MODEL_PATH \
+    ref.type._class=$MODEL_FAMILY \
+    ref.path=$BASE_MODEL_PATH \
+    rew.type._class=$MODEL_FAMILY \
+    rew.type.is_critic=True \
+    rew.init_critic_from_actor=True \
+    rew.path=$BASE_MODEL_PATH \
+    dataset.path=$DATA_PATH \
+    dataset.max_prompt_len=2048 \
+    dataset.train_bs_n_seqs=${TRAIN_BATCH_SIZE} \
+    ppo.gen.max_new_tokens=${MAX_NEW_TOKENS} \
+    ppo.gen.min_new_tokens=0 \
+    ppo.disable_value=True \
+    ppo.gen.top_p=1 ppo.gen.top_k=1000000 \
+    ppo.ppo_n_minibatches=${PPO_MBS} \
+    ppo.gen.temperature=0.6 \
+    ppo.kl_ctl=${KL_CTL} \
+    ppo.value_eps_clip=0.2 \
+    ppo.reward_output_scaling=5 \
+    ppo.reward_output_bias=0.0 \
+    ppo.adv_norm=True ppo.value_norm=True \
+    mask_too_long=False \
+    ppo.discount=1.0 \
+    actor.optimizer.lr=1e-6 \
+    critic.optimizer.lr=5e-6 \
+    actor.optimizer.lr_scheduler_type=constant \
+    actor_gen.mb_spec.max_tokens_per_mb=${MAX_TOKEN_PER_MB} \
+    ref_inf.mb_spec.max_tokens_per_mb=${MAX_TOKEN_PER_MB} \
+    rew_inf.mb_spec.max_tokens_per_mb=${MAX_TOKEN_PER_MB} \
+    critic_inf.mb_spec.max_tokens_per_mb=${MAX_TOKEN_PER_MB} \
+    actor_train.mb_spec.max_tokens_per_mb=${MAX_TOKEN_PER_MB} \
+    critic_train.mb_spec.max_tokens_per_mb=${MAX_TOKEN_PER_MB} \
+    cache_clear_freq=1 \
+    n_nodes=${NODES} \
+    allocation_mode="'${ALLOCATION_MODE}'" n_gpus_per_node=8 \
+    recover_mode=auto \
+    recover_retries=10 \
+    torch_cache_mysophobia=True
@@ -77,7 +77,7 @@ python3 -m realhf.apps.quickstart ppo-math \
     actor.vllm.enforce_eager=False \
     actor.vllm.max_seq_len_to_capture=${MAX_SEQ_LEN_TO_CAPTURE} \
     actor.vllm.max_num_seqs=${MAX_NUM_SEQS} \
-    actor.vllm.gpu_memory_utilization=1 \
+    actor.vllm.gpu_memory_utilization=0.85 \
     actor.vllm.swap_space=64 \
     critic.type._class=$MODEL_FAMILY \
     critic.type.is_critic=True \
 
@@ -0,0 +1 @@
+# Copyright 2025 Ant Group Inc.
Original file line number	Diff line number	Diff line change
`@@ -137,7 +137,7 @@ def generate_in_parallel(requests, model_args, sampling_params, data_parallel_si`
`137`	`137`	`def run_inference_one_model(`
`138`	`138`	`model_args: dict, sampling_params, requests, cuda_visisble_devices`
`139`	`139`	`):`
`140`		`- os.environ["VLLM_LOGGING_LEVEL"] = "DEBUG"`
	`140`	`+ os.environ["VLLM_LOGGING_LEVEL"] = "INFO"`
`141`	`141`	`os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(`
`142`	`142`	`[str(x) for x in cuda_visisble_devices]`
`143`	`143`	`)`