diff --git "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" index f61a68c2c..16a388255 100644 --- "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" +++ "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" @@ -331,21 +331,20 @@ RLHF参数继承于[训练参数](#训练参数) - simpo_gamma: SimPO算法中的reward margin项,论文建议设置为0.5-1.5,默认为`1.` - desirable_weight: KTO算法中对desirable response的loss权重 $\lambda_D$,默认为`1.` - undesirable_weight: KTO算法中对undesirable response的loss权重 $\lambda_U$,默认为`1.` -- num_generations: GRPO算法中的G值,默认为8 -- max_completion_length: GRPO算法中的最大生成长度,默认为512 -- reward_funcs: GRPO算法奖励函数,可选项为`accuracy`和`format`,见swift/plugin/orm.py -- use_vllm: 是否使用vLLM作为GRPO生成的backend,默认为False -- vllm_device: 设置vLLM部署的设备,比如部署在卡0上,则`cuda:1`, 默认为`auto`, 即使用最后一张卡 -- vllm_gpu_memory_utilization: vllm透传参数 -- vllm_max_model_len: vllm透传参数 - loss_scale: 覆盖模板参数,默认为'last_round' +- temperature: 默认为0.7,该参数将在PPO、GRPO中使用 + + +#### Reward模型参数 +reward模型参数将在PPO、GRPO中使用。 -#### PPO参数 - reward_model: 默认为None - reward_adapters: 默认为`[]` - reward_model_type: 默认为None - reward_model_revision: 默认为None +#### PPO参数 + 以下参数含义可以参考[这里](https://huggingface.co/docs/trl/main/ppo_trainer) - num_ppo_epochs: 默认为4 - whiten_rewards: 默认为False @@ -359,9 +358,26 @@ RLHF参数继承于[训练参数](#训练参数) - local_rollout_forward_batch_size: 默认为64 - num_sample_generations: 默认为10 - response_length: 默认为512 -- temperature: 默认为0.7 - missing_eos_penalty: 默认为None + +#### GRPO参数 +- num_generations: GRPO算法中的G值,默认为8 +- max_completion_length: GRPO算法中的最大生成长度,默认为512 +- reward_funcs: GRPO算法奖励函数,可选项为`accuracy`和`format`,见swift/plugin/orm.py。你也可以在plugin中自定义自己的奖励函数。默认为`[]` +- use_vllm: 是否使用vLLM作为GRPO生成的infer_backend,默认为False +- vllm_device: 设置vLLM部署的设备,比如部署在卡0上,则`cuda:1`, 默认为`auto`, 即使用最后一张卡 +- vllm_gpu_memory_utilization: vllm透传参数,默认为0.9 +- vllm_max_model_len: vllm透传参数,默认为None +- vllm_max_num_seqs: vllm透传参数,默认为256 +- vllm_enforce_eager: vllm透传参数,默认为False +- vllm_limit_mm_per_prompt: vllm透传参数,默认为None +- vllm_enable_prefix_caching: vllm透传参数,默认为True +- top_k: 默认为None。读取generation_config.json +- top_p: 默认为None。读取generation_config.json +- repetition_penalty: 重复惩罚项。默认为None,读取generation_config.json + + ### 推理参数 推理参数除包含[基本参数](#基本参数)、[合并参数](#合并参数)、[vLLM参数](#vllm参数)、[LMDeploy参数](#LMDeploy参数)外,还包含下面的部分: diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md index 217f2e303..6b70c185c 100644 --- a/docs/source_en/Instruction/Command-line-parameters.md +++ b/docs/source_en/Instruction/Command-line-parameters.md @@ -340,21 +340,20 @@ RLHF arguments inherit from the [training arguments](#training-arguments). - simpo_gamma: Reward margin term in the SimPO algorithm, with a paper-suggested setting of 0.5-1.5, default is `1.`. - desirable_weight: Loss weight $\lambda_D$ for desirable response in the KTO algorithm, default is `1.`. - undesirable_weight: Loss weight $\lambda_U$ for undesirable response in the KTO algorithm, default is `1.`. -- num_generations: The G value in the GRPO algorithm, with a default of 8. -- max_completion_length: The maximum generation length in the GRPO algorithm, with a default of 512. -- reward_funcs: Reward functions for the GRPO algorithm, with options being accuracy and format. See swift/plugin/orm.py for details. -- use_vllm: Whether to use vLLM as the backend for GRPO generation, with a default of False. 
diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md
index 217f2e303..6b70c185c 100644
--- a/docs/source_en/Instruction/Command-line-parameters.md
+++ b/docs/source_en/Instruction/Command-line-parameters.md
@@ -340,21 +340,20 @@ RLHF arguments inherit from the [training arguments](#training-arguments).
 - simpo_gamma: Reward margin term in the SimPO algorithm, with a paper-suggested setting of 0.5-1.5, default is `1.`.
 - desirable_weight: Loss weight $\lambda_D$ for desirable response in the KTO algorithm, default is `1.`.
 - undesirable_weight: Loss weight $\lambda_U$ for undesirable response in the KTO algorithm, default is `1.`.
-- num_generations: The G value in the GRPO algorithm, with a default of 8.
-- max_completion_length: The maximum generation length in the GRPO algorithm, with a default of 512.
-- reward_funcs: Reward functions for the GRPO algorithm, with options being accuracy and format. See swift/plugin/orm.py for details.
-- use_vllm: Whether to use vLLM as the backend for GRPO generation, with a default of False.
-- vllm_device: Set the device for vLLM deployment. For example, to deploy on GPU 0, use cuda:1. The default is auto, which uses the last available GPU.
-- vllm_gpu_memory_utilization: A parameter passed through to vLLM.
-- vllm_max_model_len: A parameter passed through to vLLM.
 - loss_scale: Override template arguments, default is 'last_round'.
+- temperature: Default is 0.7; this parameter is used in PPO and GRPO.
 
-#### PPO Arguments
-- reward_model: Defaults to None
-- reward_adapters: Defaults to `[]`
-- reward_model_type: Defaults to None
-- reward_model_revision: Defaults to None
+#### Reward Model Parameters
+
+The reward model parameters are used in PPO and GRPO.
+
+- reward_model: Default is None.
+- reward_adapters: Default is `[]`.
+- reward_model_type: Default is None.
+- reward_model_revision: Default is None.
+
+#### PPO Arguments
 
 The meanings of the following parameters can be referenced [here](https://huggingface.co/docs/trl/main/ppo_trainer):
@@ -370,9 +369,25 @@ The meanings of the following parameters can be referenced [here](https://huggin
 - local_rollout_forward_batch_size: Defaults to 64
 - num_sample_generations: Defaults to 10
 - response_length: Defaults to 512
-- temperature: Defaults to 0.7
 - missing_eos_penalty: Defaults to None
 
+
+#### GRPO Arguments
+- num_generations: The G value in the GRPO algorithm, default is 8.
+- max_completion_length: The maximum generation length in the GRPO algorithm, default is 512.
+- reward_funcs: Reward functions in the GRPO algorithm; options include `accuracy` and `format`, see `swift/plugin/orm.py`. You can also customize your own reward functions in the plugin. Default is `[]`.
+- use_vllm: Whether to use vLLM as the infer_backend for GRPO generation, default is False.
+- vllm_device: Set the device for vLLM deployment. For example, if deploying on GPU 0, use `cuda:0`; default is `auto`, which means using the last available GPU.
+- vllm_gpu_memory_utilization: vLLM passthrough parameter, default is 0.9.
+- vllm_max_model_len: vLLM passthrough parameter, default is None.
+- vllm_max_num_seqs: vLLM passthrough parameter, default is 256.
+- vllm_enforce_eager: vLLM passthrough parameter, default is False.
+- vllm_limit_mm_per_prompt: vLLM passthrough parameter, default is None.
+- vllm_enable_prefix_caching: vLLM passthrough parameter, default is True.
+- top_k: Default is None; the value is read from `generation_config.json`.
+- top_p: Default is None; the value is read from `generation_config.json`.
+- repetition_penalty: Repetition penalty term. Default is None; the value is read from `generation_config.json`.
+
 ### Inference Arguments
 
 Inference arguments include the [base arguments](#base-arguments), [merge arguments](#merge-arguments), [vLLM arguments](#vllm-arguments), [LMDeploy arguments](#LMDeploy-arguments), and also contain the following:
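The `reward_funcs` entry above notes that custom reward functions can be added through the plugin. The sketch below shows the general shape such a function could take — a callable returning one scalar reward per sampled completion. The class name and the commented registration line are assumptions; check `swift/plugin/orm.py` in your swift version for the actual base class and registry.

```python
from typing import List


class LengthReward:
    """Toy reward: favour completions that stay within a character budget."""

    def __init__(self, max_chars: int = 1024):
        self.max_chars = max_chars

    def __call__(self, completions: List[str], **kwargs) -> List[float]:
        # One reward per completion; GRPO samples `num_generations` completions per prompt.
        return [1.0 if len(c) <= self.max_chars else 0.0 for c in completions]


# Hypothetical registration so that `--reward_funcs length` could resolve to this class:
# orms['length'] = LengthReward
```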
diff --git a/swift/llm/argument/rlhf_args.py b/swift/llm/argument/rlhf_args.py
index fc700343c..1f2c3fdf8 100644
--- a/swift/llm/argument/rlhf_args.py
+++ b/swift/llm/argument/rlhf_args.py
@@ -88,11 +88,11 @@ class RLHFArguments(GRPOArguments, PPOArguments, RewardModelArguments, TrainArgu
     desirable_weight: float = 1.0
     undesirable_weight: float = 1.0
     # PPO/GRPO
-    temperature: float = 0.9
+    temperature: float = 0.7
 
     def _prepare_training_args(self, training_args: Dict[str, Any]) -> None:
         if self.rlhf_type == 'ppo':
-            args_dict['world_size'] = self.global_world_size
+            training_args['world_size'] = self.global_world_size
 
     def __post_init__(self):
         self._init_grpo()
diff --git a/swift/trainers/rlhf_arguments.py b/swift/trainers/rlhf_arguments.py
index 87520c795..38f218572 100644
--- a/swift/trainers/rlhf_arguments.py
+++ b/swift/trainers/rlhf_arguments.py
@@ -1,4 +1,5 @@
 from dataclasses import dataclass
+from typing import Optional
 
 from trl import CPOConfig as HfCPOConfig
 from trl import DPOConfig as HfDPOConfig
@@ -43,4 +44,6 @@ class PPOConfig(SwiftArgumentsMixin, HfPPOConfig):
 
 @dataclass
 class GRPOConfig(GRPOVllmArguments, SwiftArgumentsMixin, HfGRPOConfig):
-    pass
+    top_k: Optional[int] = None
+    top_p: Optional[float] = None
+    repetition_penalty: Optional[float] = None
diff --git a/swift/trainers/rlhf_trainer/grpo_trainer.py b/swift/trainers/rlhf_trainer/grpo_trainer.py
index 6f93b9420..7d1d547fc 100644
--- a/swift/trainers/rlhf_trainer/grpo_trainer.py
+++ b/swift/trainers/rlhf_trainer/grpo_trainer.py
@@ -129,6 +129,9 @@ def __init__(self,
         self.request_config = RequestConfig(
             max_tokens=args.max_completion_length,
             temperature=args.temperature,
+            top_p=args.top_p,
+            top_k=args.top_k,
+            repetition_penalty=args.repetition_penalty,
         )
 
         self.model_accepts_loss_kwargs = False
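The `top_k`, `top_p` and `repetition_penalty` fields added above default to None and, per the documentation, fall back to the model's `generation_config.json` before reaching the rollout request. A minimal sketch of that fallback, assuming a local model directory; the function is illustrative, not swift's actual implementation:

```python
import json
from pathlib import Path
from typing import Any, Dict, Optional


def resolve_sampling_args(model_dir: str,
                          top_k: Optional[int] = None,
                          top_p: Optional[float] = None,
                          repetition_penalty: Optional[float] = None) -> Dict[str, Any]:
    """Fill unset sampling arguments from <model_dir>/generation_config.json."""
    config_path = Path(model_dir) / 'generation_config.json'
    generation_config = json.loads(config_path.read_text()) if config_path.exists() else {}
    return {
        'top_k': top_k if top_k is not None else generation_config.get('top_k'),
        'top_p': top_p if top_p is not None else generation_config.get('top_p'),
        'repetition_penalty': repetition_penalty if repetition_penalty is not None
        else generation_config.get('repetition_penalty'),
    }
```

The resolved values would then flow into generation the same way the `RequestConfig(...)` call in the last hunk passes `args.top_p`, `args.top_k` and `args.repetition_penalty`.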