
This figure illustrates how the configurations affect the training.

【左边generate, 右边learning】
【左边前向,右边后向】
【PPO advantage 中计算 r】
【GRPO advantage 计算简化】
# Launch verl.trainer.main_ppo with the GRPO advantage estimator
# (algorithm.adv_estimator=grpo).
#
# Expects the shell variables train_files, test_files and model_path to be set
# by the caller; they are not defined in this snippet. Any extra arguments given
# to the script are appended as additional overrides via "$@".
#
# Settings that the original notes marked for emphasis:
#   - data.train_batch_size=1024
#   - actor_rollout_ref.actor.ppo_mini_batch_size=256
#   - actor_rollout_ref.actor.use_kl_loss=True
#   - the actor_rollout_ref.rollout.* section
#
# NOTE: each continued line must end with exactly one backslash and nothing
# after it (no trailing comment), otherwise the command is cut short.
PYTHONPATH=/opt/tiger/open_verl python3 -m verl.trainer.main_ppo \
    algorithm.adv_estimator=grpo \
    data.train_files="$train_files" \
    data.val_files="$test_files" \
    data.train_batch_size=1024 \
    data.max_prompt_length=1024 \
    data.max_response_length=1024 \
    data.filter_overlong_prompts=True \
    data.truncation='error' \
    actor_rollout_ref.model.path="$model_path" \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.actor.ppo_mini_batch_size=256 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.actor.use_kl_loss=True \
    actor_rollout_ref.actor.kl_loss_coef=0.001 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.actor.entropy_coeff=0 \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.actor.fsdp_config.param_offload=True \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
    actor_rollout_ref.rollout.n=5 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.ref.fsdp_config.param_offload=True \
    algorithm.use_kl_in_reward=False \
    trainer.critic_warmup=0 \
    trainer.logger='["console","wandb"]' \
    trainer.project_name='verl_grpo_example_gsm8k' \
    trainer.experiment_name='qwen2_14b_function_rm' \
    trainer.n_gpus_per_node=4 \
    trainer.nnodes=1 \
    trainer.save_freq=-1 \
    trainer.test_freq=5 \
    trainer.total_epochs=1 "$@"