Merge branch 'main' into update_docs_swift3
Jintao-Huang committed Feb 11, 2025
2 parents 5323984 + aef2e5c commit a5f3491
Showing 5 changed files with 62 additions and 25 deletions.
34 changes: 25 additions & 9 deletions docs/source/Instruction/命令行参数.md
@@ -331,21 +331,20 @@ RLHF arguments inherit from the [training arguments](#训练参数)
- simpo_gamma: Reward margin term in the SimPO algorithm; the paper suggests a value of 0.5-1.5. Default is `1.`.
- desirable_weight: Loss weight $\lambda_D$ for desirable responses in the KTO algorithm. Default is `1.`.
- undesirable_weight: Loss weight $\lambda_U$ for undesirable responses in the KTO algorithm. Default is `1.`.
- num_generations: The G value in the GRPO algorithm. Default is 8.
- max_completion_length: Maximum generation length in the GRPO algorithm. Default is 512.
- reward_funcs: Reward functions for the GRPO algorithm; options are `accuracy` and `format`, see swift/plugin/orm.py.
- use_vllm: Whether to use vLLM as the backend for GRPO generation. Default is False.
- vllm_device: Device on which vLLM is deployed; for example, to deploy on GPU 0, use `cuda:0`. Default is `auto`, which uses the last GPU.
- vllm_gpu_memory_utilization: Parameter passed through to vLLM.
- vllm_max_model_len: Parameter passed through to vLLM.
- loss_scale: Overrides the template argument. Default is 'last_round'.
- temperature: Default is 0.7; this parameter is used in PPO and GRPO.


#### Reward Model Arguments
The reward model arguments are used in PPO and GRPO.

#### PPO Arguments
- reward_model: Default is None
- reward_adapters: Default is `[]`
- reward_model_type: Default is None
- reward_model_revision: Default is None

#### PPO Arguments

The meanings of the following parameters can be found [here](https://huggingface.co/docs/trl/main/ppo_trainer):
- num_ppo_epochs: Default is 4
- whiten_rewards: Default is False
@@ -359,9 +358,26 @@ RLHF arguments inherit from the [training arguments](#训练参数)
- local_rollout_forward_batch_size: Default is 64
- num_sample_generations: Default is 10
- response_length: Default is 512
- temperature: Default is 0.7
- missing_eos_penalty: Default is None


#### GRPO Arguments
- num_generations: The G value in the GRPO algorithm. Default is 8.
- max_completion_length: Maximum generation length in the GRPO algorithm. Default is 512.
- reward_funcs: Reward functions for the GRPO algorithm; options are `accuracy` and `format`, see swift/plugin/orm.py. You can also define your own reward function in the plugin. Default is `[]`.
- use_vllm: Whether to use vLLM as the infer_backend for GRPO generation. Default is False.
- vllm_device: Device on which vLLM is deployed; for example, to deploy on GPU 0, use `cuda:0`. Default is `auto`, which uses the last GPU.
- vllm_gpu_memory_utilization: vLLM passthrough parameter. Default is 0.9.
- vllm_max_model_len: vLLM passthrough parameter. Default is None.
- vllm_max_num_seqs: vLLM passthrough parameter. Default is 256.
- vllm_enforce_eager: vLLM passthrough parameter. Default is False.
- vllm_limit_mm_per_prompt: vLLM passthrough parameter. Default is None.
- vllm_enable_prefix_caching: vLLM passthrough parameter. Default is True. (How these vllm_* flags pass through is illustrated in the sketch after this list.)
- top_k: Default is None; read from generation_config.json.
- top_p: Default is None; read from generation_config.json.
- repetition_penalty: Repetition penalty term. Default is None; read from generation_config.json.
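
The `vllm_*` options above are forwarded to vLLM largely unchanged. As a rough, hypothetical sketch (not the actual wiring inside swift; the model id and max length are placeholders), the same-named arguments on a standalone vLLM engine look like this:

```python
# Hedged sketch: the vllm_* flags above map onto same-named vLLM engine arguments.
# The model id and max_model_len below are placeholders for illustration only.
from vllm import LLM

llm = LLM(
    model="Qwen/Qwen2.5-7B-Instruct",  # placeholder model id
    gpu_memory_utilization=0.9,        # vllm_gpu_memory_utilization
    max_model_len=8192,                # vllm_max_model_len (None lets vLLM infer it)
    max_num_seqs=256,                  # vllm_max_num_seqs
    enforce_eager=False,               # vllm_enforce_eager
    enable_prefix_caching=True,        # vllm_enable_prefix_caching
)
```

`vllm_limit_mm_per_prompt` takes a per-modality mapping and is omitted from the sketch.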


### Inference Arguments

In addition to the [base arguments](#基本参数), [merge arguments](#合并参数), [vLLM arguments](#vllm参数), and [LMDeploy arguments](#LMDeploy参数), the inference arguments also include the following:
41 changes: 28 additions & 13 deletions docs/source_en/Instruction/Command-line-parameters.md
@@ -340,21 +340,20 @@ RLHF arguments inherit from the [training arguments](#training-arguments).
- simpo_gamma: Reward margin term in the SimPO algorithm, with a paper-suggested setting of 0.5-1.5, default is `1.`.
- desirable_weight: Loss weight $\lambda_D$ for desirable response in the KTO algorithm, default is `1.`.
- undesirable_weight: Loss weight $\lambda_U$ for undesirable response in the KTO algorithm, default is `1.`.
- num_generations: The G value in the GRPO algorithm, with a default of 8.
- max_completion_length: The maximum generation length in the GRPO algorithm, with a default of 512.
- reward_funcs: Reward functions for the GRPO algorithm, with options being accuracy and format. See swift/plugin/orm.py for details.
- use_vllm: Whether to use vLLM as the backend for GRPO generation, with a default of False.
- vllm_device: Set the device for vLLM deployment. For example, to deploy on GPU 0, use `cuda:0`. The default is `auto`, which uses the last available GPU.
- vllm_gpu_memory_utilization: A parameter passed through to vLLM.
- vllm_max_model_len: A parameter passed through to vLLM.
- loss_scale: Override template arguments, default is 'last_round'.
- temperature: Default is 0.7; this parameter will be used in PPO and GRPO.

#### PPO Arguments

- reward_model: Defaults to None
- reward_adapters: Defaults to `[]`
- reward_model_type: Defaults to None
- reward_model_revision: Defaults to None
#### Reward Model Parameters

The reward model parameters will be used in PPO and GRPO.

- reward_model: Default is None.
- reward_adapters: Default is `[]`.
- reward_model_type: Default is None.
- reward_model_revision: Default is None.

#### PPO Arguments

The meanings of the following parameters can be referenced [here](https://huggingface.co/docs/trl/main/ppo_trainer); a minimal configuration sketch follows the list below:

@@ -370,9 +369,25 @@ The meanings of the following parameters can be referenced [here](https://huggingface.co/docs/trl/main/ppo_trainer):
- local_rollout_forward_batch_size: Defaults to 64
- num_sample_generations: Defaults to 10
- response_length: Defaults to 512
- temperature: Defaults to 0.7
- missing_eos_penalty: Defaults to None
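
For orientation, here is a minimal sketch of overriding a few of these defaults. It assumes the names above map one-to-one onto fields of trl's `PPOConfig` (the class this section links to); the output directory is a placeholder, and the exact field set should be checked against the installed trl version:

```python
# Hedged sketch: assumes these fields exist on trl's PPOConfig under the same names.
from trl import PPOConfig

ppo_config = PPOConfig(
    output_dir="output/ppo",              # placeholder path
    num_ppo_epochs=4,                     # defaults shown in the list above
    whiten_rewards=False,
    local_rollout_forward_batch_size=64,
    num_sample_generations=10,
    response_length=512,
    temperature=0.7,
    missing_eos_penalty=1.0,              # None (the default) disables the penalty
)
```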


#### GRPO Arguments
- num_generations: The G value in the GRPO algorithm, default is 8.
- max_completion_length: The maximum generation length in the GRPO algorithm, default is 512.
- reward_funcs: Reward functions in the GRPO algorithm; options include `accuracy` and `format`, as seen in `swift/plugin/orm.py`. You can also customize your own reward functions in the plugin (see the sketch after this list). Default is `[]`.
- use_vllm: Whether to use vLLM as the infer_backend for GRPO generation, default is False.
- vllm_device: Set the device for vLLM deployment. For example, to deploy on GPU 0, use `cuda:0`; default is `auto`, which means using the last available GPU.
- vllm_gpu_memory_utilization: vLLM passthrough parameter, default is 0.9.
- vllm_max_model_len: vLLM passthrough parameter, default is None.
- vllm_max_num_seqs: vLLM passthrough parameter, default is 256.
- vllm_enforce_eager: vLLM passthrough parameter, default is False.
- vllm_limit_mm_per_prompt: vLLM passthrough parameter, default is None.
- vllm_enable_prefix_caching: vLLM passthrough parameter, default is True.
- top_k: Default is None. Read from `generation_config.json`.
- top_p: Default is None. Read from `generation_config.json`.
- repetition_penalty: Repetition penalty term. Default is None, read from `generation_config.json`.
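
Since `reward_funcs` can also point at custom reward functions defined in the plugin, here is a minimal, hypothetical sketch of one. It only assumes the common GRPO convention of receiving the generated completions and returning one float per completion; check `swift/plugin/orm.py` for the exact signature and registration mechanism, which may differ:

```python
from typing import List


def length_budget_reward(completions: List[str], **kwargs) -> List[float]:
    """Hypothetical reward: prefer completions that stay within a length budget.

    Returns one score per completion; adapt the signature to whatever
    swift/plugin/orm.py actually expects before registering it as a plugin.
    """
    budget = 256  # illustrative character budget, not a swift parameter
    rewards = []
    for completion in completions:
        overflow = max(0, len(completion) - budget)
        # 1.0 inside the budget, decaying linearly toward 0.0 as the overflow grows
        rewards.append(max(0.0, 1.0 - overflow / budget))
    return rewards
```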

### Inference Arguments

Inference arguments include the [base arguments](#base-arguments), [merge arguments](#merge-arguments), [vLLM arguments](#vllm-arguments), [LMDeploy arguments](#LMDeploy-arguments), and also contain the following:
4 changes: 2 additions & 2 deletions swift/llm/argument/rlhf_args.py
@@ -88,11 +88,11 @@ class RLHFArguments(GRPOArguments, PPOArguments, RewardModelArguments, TrainArgu
desirable_weight: float = 1.0
undesirable_weight: float = 1.0
# PPO/GRPO
temperature: float = 0.9
temperature: float = 0.7

def _prepare_training_args(self, training_args: Dict[str, Any]) -> None:
if self.rlhf_type == 'ppo':
args_dict['world_size'] = self.global_world_size
training_args['world_size'] = self.global_world_size

def __post_init__(self):
self._init_grpo()
5 changes: 4 additions & 1 deletion swift/trainers/rlhf_arguments.py
@@ -1,4 +1,5 @@
from dataclasses import dataclass
from typing import Optional

from trl import CPOConfig as HfCPOConfig
from trl import DPOConfig as HfDPOConfig
@@ -43,4 +44,6 @@ class PPOConfig(SwiftArgumentsMixin, HfPPOConfig):

@dataclass
class GRPOConfig(GRPOVllmArguments, SwiftArgumentsMixin, HfGRPOConfig):
pass
top_k: Optional[int] = None
top_p: Optional[float] = None
repetition_penalty: Optional[float] = None
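
The three new fields default to `None`, and the parameter documentation above states that unset values are read from `generation_config.json`. A hypothetical illustration of that fallback (the real resolution happens inside swift and is not part of this diff):

```python
# Hedged sketch of the documented fallback: explicit values win, otherwise the
# value stored in the model's generation_config.json is used.
import json
from pathlib import Path


def resolve_sampling_params(args, model_dir: str) -> dict:
    generation_config = json.loads((Path(model_dir) / "generation_config.json").read_text())
    return {
        name: getattr(args, name) if getattr(args, name) is not None else generation_config.get(name)
        for name in ("top_k", "top_p", "repetition_penalty")
    }
```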
3 changes: 3 additions & 0 deletions swift/trainers/rlhf_trainer/grpo_trainer.py
@@ -129,6 +129,9 @@ def __init__(self,
self.request_config = RequestConfig(
max_tokens=args.max_completion_length,
temperature=args.temperature,
top_p=args.top_p,
top_k=args.top_k,
repetition_penalty=args.repetition_penalty,
)

self.model_accepts_loss_kwargs = False
