{
  "model_name": "GPT-2 RLHF",
  "model_type": "RLHF-trained GPT-2",
  "training_pipeline": [
    "Stage 1: Supervised Fine-Tuning (SFT)",
    "Stage 2: Reward Model Training",
    "Stage 3: PPO Optimization"
  ],
  "dataset": "Anthropic/hh-rlhf",
  "base_model": "gpt2",
  "parameters": "124M",
  "training_date": "2025-09-29T20:36:42.118760",
| "methodology": "3-stage RLHF pipeline (same as ChatGPT)", | |
| "hyperparameters": { | |
| "sft_lr": "5e-5", | |
| "sft_epochs": 3, | |
| "reward_lr": "1e-5", | |
| "reward_epochs": 3, | |
| "ppo_lr": "1e-5", | |
| "ppo_episodes": 10, | |
| "kl_coef": 0.1, | |
| "clip_range": 0.2 | |
| } | |
| } |
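
For reference, the sketch below shows how the `kl_coef` and `clip_range` hyperparameters above typically enter Stage 3 (PPO optimization), written against plain PyTorch. The function names and tensor shapes are illustrative assumptions, not this model's actual training code; libraries such as `trl` compute the same quantities internally.

```python
import torch

def kl_shaped_rewards(env_reward, logprobs, ref_logprobs, kl_coef=0.1):
    """Penalize per-token divergence from the frozen SFT reference model.

    env_reward:   scalar reward-model score for the full response
    logprobs:     (T,) log-probs of sampled tokens under the current policy
    ref_logprobs: (T,) log-probs of the same tokens under the SFT reference
    """
    # Approximate per-token KL between the policy and the reference.
    kl = logprobs - ref_logprobs
    rewards = -kl_coef * kl          # KL penalty applied to every token
    rewards[-1] += env_reward        # reward-model score added at the final token
    return rewards

def ppo_policy_loss(logprobs, old_logprobs, advantages, clip_range=0.2):
    """Clipped PPO surrogate objective, negated so it can be minimized."""
    # Probability ratio between the current policy and the rollout policy.
    ratio = torch.exp(logprobs - old_logprobs)
    unclipped = ratio * advantages
    clipped = torch.clamp(ratio, 1.0 - clip_range, 1.0 + clip_range) * advantages
    # Clipping bounds how far a single update can move the policy.
    return -torch.min(unclipped, clipped).mean()
```

With `kl_coef = 0.1` the per-token KL penalty keeps generations close to the SFT model, while `clip_range = 0.2` is the standard PPO clipping threshold from the original PPO paper.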