Skip to content

Commit af158ec

Browse files
authored
Merge pull request #6 from nuzant/update-fix
Update release v0.1.2
2 parents 940dfb6 + 37510e1 commit af158ec

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

59 files changed

+6486
-1707
lines changed

Dockerfile

-1
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,6 @@ RUN cd /vllm && \
3939
python3 use_existing_torch.py && \
4040
pip3 install -r requirements-build.txt && \
4141
MAX_JOBS=64 pip3 install -e . --no-build-isolation
42-
RUN yes | pip3 uninstall uvloop
4342
RUN pip3 install opencv-python-headless==4.5.4.58
4443

4544
RUN apt-get update && apt-get install -y python3.10-venv

evaluation/eval_and_aggregate.py

+7-4
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from glob import glob
66

77
import numpy as np
8+
import wandb
89
from rm_maj_eval import group_pred
910
from tqdm import tqdm
1011
from transformers import AutoTokenizer
@@ -29,6 +30,7 @@ def parse_args():
2930
parser.add_argument("--overwrite", action="store_true")
3031
parser.add_argument("--evaluate_train", action="store_true")
3132
parser.add_argument("--max_gen_tokens", default=32768, type=int)
33+
3234
args = parser.parse_args()
3335
if args.output_path is None:
3436
args.output_path = args.model_path
@@ -145,7 +147,7 @@ def process_single_data_name(args, data_name, base_dir, tokenizer):
145147

146148
if __name__ == "__main__":
147149
args = parse_args()
148-
150+
print(f"Evaluation output to {args.output_path}")
149151
assert args.num_sample_nodes * args.samples_per_node >= args.n_sampling
150152

151153
eval_dir = (
@@ -155,6 +157,7 @@ def process_single_data_name(args, data_name, base_dir, tokenizer):
155157
)
156158

157159
base_dir = os.path.join(args.output_path, eval_dir)
160+
os.makedirs(base_dir, exist_ok=True)
158161
tokenizer = AutoTokenizer.from_pretrained(args.model_path)
159162
result_path = os.path.join(base_dir, f"aggregate_parallel_{args.prompt_type}.json")
160163

@@ -228,10 +231,10 @@ def process_single_data_name(args, data_name, base_dir, tokenizer):
228231
from prettytable import PrettyTable
229232

230233
table = PrettyTable()
231-
filed_names = ["dataset"] + list(all_results[args.data_names[0]].keys())
232-
table.field_names = filed_names
234+
field_names = ["dataset"] + list(all_results[args.data_names[0]].keys())
235+
table.field_names = field_names
233236
for k, v in all_results.items():
234-
table.add_row([k, *[round(v[x], 1) for x in filed_names[1:]]])
237+
table.add_row([k, *[round(v[x], 1) for x in field_names[1:]]])
235238

236239
print(table)
237240
except:

evaluation/math_eval.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@ def generate_in_parallel(requests, model_args, sampling_params, data_parallel_si
137137
def run_inference_one_model(
138138
model_args: dict, sampling_params, requests, cuda_visisble_devices
139139
):
140-
os.environ["VLLM_LOGGING_LEVEL"] = "DEBUG"
140+
os.environ["VLLM_LOGGING_LEVEL"] = "INFO"
141141
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
142142
[str(x) for x in cuda_visisble_devices]
143143
)

evaluation/requirements.txt

+4-2
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ vllm
33
tqdm
44
datasets
55
torch
6-
transformers
6+
transformers==4.47.0
77
python_dateutil
88
flash_attn
99

@@ -12,4 +12,6 @@ sympy==1.12
1212
antlr4-python3-runtime==4.11.1 # ! The version needs to be compatible with sympy.
1313
word2number
1414
Pebble
15-
timeout-decorator
15+
prettytable
16+
timeout-decorator
17+
wandb
+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
#!/bin/bash
2+
# Users should run this script from the AReaL root directory
3+
4+
/usr/bin/python3 -m pip install -e evaluation/latex2sympy/
5+
/usr/bin/python3 -m pip install -r evaluation/requirements.txt
6+
7+
cd evaluation && /usr/bin/python3 eval_and_aggregate.py --model_path $1 --output_path $2 --data_names $3 --max_gen_tokens $4 --prompt_type $5

examples/README.md

+3-3
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,14 @@ Check if your hardware meets these minimum requirements:
1313
| Memory | 1 TB |1 TB per node|1 TB per node| 1 TB per node |1 TB per node|
1414
| Network | NVSwitch |NVSwitch + RoCE 3.2 Tbps|NVSwitch + RoCE 3.2 Tbps| NVSwitch + RoCE 3.2 Tbps |NVSwitch + RoCE 3.2 Tbps|
1515
| Storage | 1TB |Shared storage (NAS) 10TB|Shared storage (NAS) 10TB| Shared storage (NAS) 10TB |Shared storage (NAS) 10TB|
16-
| **Total Time (Hours)** | **520** | **150** | **50** | **410** | **130** |
16+
| **Total Time (Hours)** | **520** | **150** | **50** | **680** | **200** |
1717

1818
Notes:
1919
- GPUs need to have 80GB memory. Other GPU models with similar specs are acceptable.
2020
- Single-node training can use local storage, but multi-node training requires shared storage.
2121
- Total Training Time = Number of Epochs × Number of Steps per Epoch × Training Time per Step
2222
- Number of Epochs defaults to 10.
23-
- Number of Steps per Epoch depends on the dataset size. In this tutorial, the dataset approximately requires 40 steps per epoch for the 1.5B model and 20 steps for the 7B model. The batch sizes for 1.5B and 7B models are different.
23+
- Number of steps per epoch depends on the dataset size and batch size. For example, with a dataset of 40,315 samples and a batch size of 1024, each epoch requires 40,315 / 1024 ≈ 39.37 training steps. In practice, an epoch will therefore involve a minimum of 39 steps and a maximum of 40 steps of training.
2424
- Training Time per Step depends on the number of GPUs used.
2525

2626
## Software Requirements
@@ -358,7 +358,7 @@ The last entry is used to explain the meaning of key fields:
358358
- `importance_weight`: The average importance sampling ratio across all tokens in the PPO loss. This value is typically close to 1.
359359
- `actor_clip_ratio`: The ratio of tokens clipped in the PPO loss to the total number of tokens. This is usually less than 0.1.
360360
- `actor_loss`: The PPO loss. **It does not show a clear upward or downward trend during training** and should not be used as a reference for model performance.
361-
- `avg_seq_len`: The average length of all sampled answers in this step. In a full multi-stage training process, this value will first decrease and then increase.
361+
- `avg_seq_len`: The average length of all sequences (i.e., prompts with sampled answers) in this step. In a full multi-stage training process, this value will first decrease and then increase.
362362
- `no_eos_ratio`: The ratio of sampled answers truncated due to exceeding the maximum generation length. An increase in this value indicates that the average length of answers is increasing.
363363

364364
# Evaluation

examples/README_zh.md

+3-3
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
| 内存 | 1 TB |每节点 1 TB|每节点 1 TB |每节点 1 TB |每节点 1 TB |
1515
| 通信 | NVSwitch |NVSwitch+RoCE 带宽 3.2 Tbps|NVSwitch+RoCE 带宽 3.2 Tbps|NVSwitch+RoCE 带宽 3.2 Tbps|NVSwitch+RoCE 带宽 3.2 Tbps|
1616
| 存储 | 1TB |共享存储(NAS)10TB |共享存储(NAS)10TB |共享存储(NAS)10TB |共享存储(NAS)10TB |
17-
|总训练时间(小时)|520|150|50|410|130|
17+
|总训练时间(小时)|520|150|50|680|200|
1818

1919
关于硬件要求的说明:
2020

@@ -27,7 +27,7 @@
2727
- 总训练时间 = Epoch 数量 * 每个 Epoch 的 Step 数量 * 单步训练时间
2828

2929
- Epoch 数量默认为 10
30-
- 每个 Epoch 的 Step 数量与数据集大小有关。利用我们的数据集, 1.5B 模型每个epoch需要训练 39 步,7B 模型每个epoch需要训练 19 步。不同大小模型使用的batch size也不同
30+
- 每个 Epoch 的 Step 数量与数据集大小和 Batch Size 有关。比如数据集为 40315 条,Batch Size 为 1024 时,每个 Epoch 需要训练 40315 / 1024 = 39.37 步,最终一个 Epoch 最少训练 39 步,最多训练 40 步。
3131
- 单步训练时间与 GPU 卡数有关
3232

3333
## 软件要求
@@ -380,7 +380,7 @@ python3 -m realhf.apps.quickstart ppo-math --show-args
380380
+ `importance_weight`: PPO loss中重要性采样比率在所有token上的平均值,通常接近1。
381381
+ `actor_clip_ratio`: PPO loss中被clip掉的token占所有token的比率,通常小于0.1。
382382
+ `actor_loss`: PPO loss,**不会随着训练过程有明显的上升或下降趋势**,不应作为模型表现的参考。
383-
+ `avg_seq_len`: 这一步中采样的所有答案的平均长度。在完整的多阶段训练中,这个值会先下降再上升。
383+
+ `avg_seq_len`: 这一步中采样的所有序列(即提示词和答案相加)的平均长度。在完整的多阶段训练中,这个值会先下降再上升。
384384
+ `no_eos_ratio`: 这一步中采样的所有答案因为超出最大生成长度被截断的比率。这个值上升也代表了答案的平均长度在上升。
385385

386386
# 评估

examples/train_code_batch_1.5B_n16.sh

+52
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
#!/bin/bash
2+
3+
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
4+
5+
EXP_NAME=ppo-zero-distill-1.5B-n16-jun1
6+
MODEL_NAME="DeepSeek-R1-Distill-Qwen-1.5B"
7+
DATASET_NAME="codeparrot-apps-test.jsonl"
8+
NODES=16
9+
ALLOCATION_MODE="vllm.d64p1m1+d32p2m1"
10+
11+
LOG_DIR="/storage/ray/train_batch_logs/${EXP_NAME}/$(date +'%Y%m%d-%H%M%S')"
12+
mkdir -p ${LOG_DIR}
13+
echo "Log Dir: ${LOG_DIR}"
14+
15+
MAX_WORKERS=$(expr 16 / ${NODES})
16+
17+
FIFO_NAME=$(mktemp -u)
18+
mkfifo "$FIFO_NAME"
19+
exec 3<>"$FIFO_NAME"
20+
rm -f "$FIFO_NAME"
21+
22+
for ((i=0; i<MAX_WORKERS; i++)); do
23+
echo >&3
24+
done
25+
26+
27+
ALL_PARAMS=(
28+
"${EXP_NAME} ${MODEL_NAME} ${DATASET_NAME} 1024 8 ${NODES} ${ALLOCATION_MODE} 16384 128 1 0.001"
29+
#"${EXP_NAME} ${MODEL_NAME} ${DATASET_NAME} 1024 8 ${NODES} ${ALLOCATION_MODE} 16384 128 1 0.001"
30+
#"${EXP_NAME} ${MODEL_NAME} ${DATASET_NAME} 1024 8 ${NODES} ${ALLOCATION_MODE} 16384 128 1 0.001"
31+
)
32+
33+
echo "Task Count: ${#ALL_PARAMS[@]}"
34+
35+
for ((i=0; i<${#ALL_PARAMS[@]}; i++)); do
36+
read -u3
37+
38+
{
39+
echo "$(date +"%Y-%m-%d %H:%M.%S") Task $i started: ${ALL_PARAMS[$i]}"
40+
bash -c "bash ${SCRIPT_DIR}/train_code_small_on_ray.sh ${ALL_PARAMS[$i]} &> ${LOG_DIR}/${i}.log"
41+
echo "$(date +"%Y-%m-%d %H:%M.%S") Task $i completed with exit code: $?, ${ALL_PARAMS[$i]}"
42+
sleep 120
43+
echo >&3
44+
} &
45+
46+
sleep 120
47+
done
48+
49+
wait
50+
51+
exec 3>&-
52+
echo "All tasks completed"

examples/train_code_small_on_ray.sh

+123
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
#!/bin/sh
2+
MODEL_FAMILY=qwen2
3+
4+
EXP_NAME="$1"
5+
MODEL_NAME="$2"
6+
DATASET_NAME="$3"
7+
TRAIN_BATCH_SIZE="$4"
8+
GROUP_SIZE="$5"
9+
NODES="$6"
10+
ALLOCATION_MODE="$7"
11+
MAX_NEW_TOKENS=$8
12+
MAX_NUM_SEQS=$9
13+
PPO_MBS=${10}
14+
KL_CTL=${11}
15+
16+
MAX_TOKEN_PER_MB=$(expr 2048 + ${MAX_NEW_TOKENS} + 1024)
17+
MAX_SEQ_LEN_TO_CAPTURE=$(expr 2048 + ${MAX_NEW_TOKENS})
18+
19+
BASE_MODEL_PATH="/storage/models/${MODEL_NAME}"
20+
21+
# original data
22+
DATA_PATH="/storage/datasets/${DATASET_NAME}"
23+
REAL_CODE_METADATA_PATH="/storage/datasets/codeparrot-apps-test.jsonl"
24+
25+
# Option 1: The experiment runs locally with subprocesses.
26+
# MODE=local
27+
# Option 2: The experiment runs in a Ray cluster
28+
# MODE=ray
29+
# Option 3: The experiment runs in a SLURM + pyxis cluster
30+
# Using the slurm mode requires a cluster spec file
31+
# and setting CLUSTER_SPEC_PATH to the path of it.
32+
MODE=ray
33+
34+
# `experiment_name` and `trial_name` can be arbitrary.
35+
# Logs and saved checkpoints will be indexed by them.
36+
#EXP_NAME=ppo-zero--${MODEL_NAME}--${DATASET_NAME}
37+
#EXP_NAME=ppo-zero-distill-1.5B-default
38+
TRIAL_NAME="${TRAIN_BATCH_SIZE}x${GROUP_SIZE}-n${NODES}"
39+
40+
# We use the "heuristic" allocation mode here to automatically determine the parallelism strategy
41+
# for each model function call, i.e., actor generation, critic inference, actor train, etc.
42+
# The number of GPUs is `n_nodes` * `n_gpus_per_node` (not set explicitly here, defaults to 8).
43+
# ReaL will make full use of these available GPUs to design allocations.
44+
# This does not ensure the optimal throughput, but it is a good starting point.
45+
46+
# The `heuristic` allocation mode is not guaranteed to work with every model configuration.
47+
# For example, if the vocabulary size is an odd number, the model parallelism may not work.
48+
# In these cases, you can use `ppo_manual.sh` to specify the parallelism strategy manually.
49+
50+
# The `ppo` subcommand specifies that this is a PPO experiment.
51+
# The `save_freq_steps` is set to `null` to disable saving checkpoints.
52+
# Enable it if you want to save checkpoints.
53+
# The `ppo` option is used to control the generation and PPO algorithm hyperparameters.
54+
# Note that the performance of PPO is sensitive to the pre-trained model and hyperparameters.
55+
# It's the user's responsibility to tune them appropriately.
56+
unset CLUSTER_SPEC_PATH
57+
CLUSTER_SPEC_PATH=/storage/ray/cluster_config_on_ray.json \
58+
REAL_CODE_METADATA_PATH=${REAL_CODE_METADATA_PATH} \
59+
FUNCTIONCALL_SERVICE_DOMAIN="" \
60+
REAL_GPU_MEMORY_KILL_THRESHOLD=1 \
61+
python3 -m realhf.apps.quickstart ppo-code \
62+
mode=$MODE \
63+
experiment_name=$EXP_NAME \
64+
trial_name=$TRIAL_NAME \
65+
wandb.mode=disabled \
66+
exp_ctrl.total_train_epochs=1 \
67+
exp_ctrl.save_freq_epochs=1 \
68+
exp_ctrl.ckpt_freq_secs=600 \
69+
group_size=${GROUP_SIZE} \
70+
group_adv_norm=False \
71+
use_dense_reward=False \
72+
reward_delta=True \
73+
rw_type=sparse \
74+
check_xml_format=False \
75+
actor.type._class=$MODEL_FAMILY \
76+
actor.path=$BASE_MODEL_PATH \
77+
actor.vllm.hybrid_train=False \
78+
actor.vllm.enforce_eager=False \
79+
actor.vllm.max_seq_len_to_capture=${MAX_SEQ_LEN_TO_CAPTURE} \
80+
actor.vllm.max_num_seqs=${MAX_NUM_SEQS} \
81+
actor.vllm.gpu_memory_utilization=1 \
82+
actor.vllm.swap_space=64 \
83+
critic.type._class=$MODEL_FAMILY \
84+
critic.type.is_critic=True \
85+
critic.init_critic_from_actor=True \
86+
critic.path=$BASE_MODEL_PATH\
87+
ref.type._class=$MODEL_FAMILY \
88+
ref.path=$BASE_MODEL_PATH \
89+
rew.type._class=$MODEL_FAMILY \
90+
rew.type.is_critic=True \
91+
rew.init_critic_from_actor=True \
92+
rew.path=$BASE_MODEL_PATH \
93+
dataset.path=$DATA_PATH \
94+
dataset.max_prompt_len=2048 \
95+
dataset.train_bs_n_seqs=${TRAIN_BATCH_SIZE} \
96+
ppo.gen.max_new_tokens=${MAX_NEW_TOKENS} \
97+
ppo.gen.min_new_tokens=0 \
98+
ppo.disable_value=True \
99+
ppo.gen.top_p=1 ppo.gen.top_k=1000000 \
100+
ppo.ppo_n_minibatches=${PPO_MBS} \
101+
ppo.gen.temperature=0.6 \
102+
ppo.kl_ctl=${KL_CTL} \
103+
ppo.value_eps_clip=0.2 \
104+
ppo.reward_output_scaling=5 \
105+
ppo.reward_output_bias=0.0 \
106+
ppo.adv_norm=True ppo.value_norm=True \
107+
mask_too_long=False \
108+
ppo.discount=1.0 \
109+
actor.optimizer.lr=1e-6 \
110+
critic.optimizer.lr=5e-6 \
111+
actor.optimizer.lr_scheduler_type=constant \
112+
actor_gen.mb_spec.max_tokens_per_mb=${MAX_TOKEN_PER_MB} \
113+
ref_inf.mb_spec.max_tokens_per_mb=${MAX_TOKEN_PER_MB} \
114+
rew_inf.mb_spec.max_tokens_per_mb=${MAX_TOKEN_PER_MB} \
115+
critic_inf.mb_spec.max_tokens_per_mb=${MAX_TOKEN_PER_MB} \
116+
actor_train.mb_spec.max_tokens_per_mb=${MAX_TOKEN_PER_MB} \
117+
critic_train.mb_spec.max_tokens_per_mb=${MAX_TOKEN_PER_MB} \
118+
cache_clear_freq=1 \
119+
n_nodes=${NODES} \
120+
allocation_mode="'${ALLOCATION_MODE}'" n_gpus_per_node=8 \
121+
recover_mode=auto \
122+
recover_retries=10 \
123+
torch_cache_mysophobia=True

examples/train_small_on_ray.sh

+1-1
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ python3 -m realhf.apps.quickstart ppo-math \
7777
actor.vllm.enforce_eager=False \
7878
actor.vllm.max_seq_len_to_capture=${MAX_SEQ_LEN_TO_CAPTURE} \
7979
actor.vllm.max_num_seqs=${MAX_NUM_SEQS} \
80-
actor.vllm.gpu_memory_utilization=1 \
80+
actor.vllm.gpu_memory_utilization=0.85 \
8181
actor.vllm.swap_space=64 \
8282
critic.type._class=$MODEL_FAMILY \
8383
critic.type.is_critic=True \

functioncall/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# Copyright 2025 Ant Group Inc.

functioncall/base/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)